From c4ff97a12c3d4da8b04f20cabad386b5cdad2728 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:56:44 -0600 Subject: [PATCH 01/15] Better logging --- compass/scripts/download.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 3393dd58..f89f7f29 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -695,9 +695,7 @@ async def filter_ordinance_docs( "Found %d potential ordinance documents for %s\n\t- %s", len(docs), jurisdiction.full_name, - "\n\t- ".join( - [doc.attrs.get("source", "Unknown source") for doc in docs] - ), + "\n\t- ".join([str(doc) for doc in docs]), ) return docs From eaff498063129f606346da32fb56b10396272c43 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:56:54 -0600 Subject: [PATCH 02/15] Use neg inf --- compass/scripts/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/scripts/download.py b/compass/scripts/download.py index f89f7f29..11882280 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -811,7 +811,7 @@ def _sort_final_ord_docs(all_ord_docs): def _ord_doc_sorting_key(doc): """Compute a composite sorting score for ordinance documents""" - no_date = (-1, -1, -1) + no_date = (_NEG_INF, _NEG_INF, _NEG_INF) latest_year, latest_month, latest_day = doc.attrs.get("date") or no_date best_docs_from_website = doc.attrs.get(_SCORE_KEY, 0) prefer_pdf_files = isinstance(doc, PDFDocument) From b95097d485c03d825f8036bcc60c843d64fecce9 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:57:59 -0600 Subject: [PATCH 03/15] Move to `doc_selection_method` for plugin --- compass/plugin/one_shot/base.py | 73 ++++++++++++++++--- .../one-shot/plugin_config.yaml | 2 +- 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index e5254a73..0ef69e47 100644 --- a/compass/plugin/one_shot/base.py +++ 
b/compass/plugin/one_shot/base.py @@ -123,11 +123,26 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 may provide a custom system prompt if you want to provide more specific instructions to the LLM for the structured data extraction step. - - `allow_multi_doc_extraction`: Boolean flag indicating - whether to allow multiple documents to be used for the - extraction context simultaneously. By default, ``False``, - which means the first document that returns some extracted - data will be marked as the source. + - `doc_selection_method`: String defining the multi-doc + selection option. Specifically, if multiple documents pass + the filter, this method determines how the documents are + submitted to the extraction context. Allowed options are: + + - "single doc": Use the first document that returns some + extracted data as the source document for the + extraction context. + - "multi doc context": Submit text from multiple + documents to the extraction context simultaneously. + - "multi doc all": Each document is extracted separately + and the results concatenated. This may give duplicated + feature results if the same feature is mentioned in + multiple documents. + - "multi doc mixed": Each document is extracted + separately and the results are merged together at the + end. In this approach, each feature is reported at + most once. + + By default, ``"single doc"``. 
tech : str Technology identifier to use for the plugin (e.g., "wind", @@ -161,10 +176,25 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): SCHEMA = config["schema"] """dict: Schema for the output of the text extraction step""" - ALLOW_MULTI_DOC_EXTRACTION = config.get( - "allow_multi_doc_extraction", False - ) - """bool: Whether to allow extraction over multiple documents""" + DOC_SELECTION_METHOD = _doc_selection_method(config) + """str: Method for selecting documents for extraction context + + Allowed options: + + - "single doc": Use the first document that returns some + extracted data as the source document for the extraction + context. + - "multi doc context": Submit text from multiple documents + to the extraction context simultaneously. + - "multi doc all": Each document is extracted separately + and the results concatenated. This may give duplicated + feature results if the same feature is mentioned in + multiple documents. + - "multi doc mixed": Each document is extracted separately + and the results are merged together at the end. In this + approach, each feature is reported at most once. + + """ IDENTIFIER = tech """str: Identifier for extraction task """ @@ -568,3 +598,28 @@ def _normalize_keyword_list(items): normalized.add(keyword) return list(normalized) + + +def _doc_selection_method(config): + """Parse and normalize the document selection method""" + allowed_methods = { + "single_doc", + "multi_doc_context", + "multi_doc_all", + "multi_doc_mixed", + } + og_doc_selection_method = config.get("doc_selection_method", "single doc") + doc_selection_method = ( + og_doc_selection_method.replace(" ", "_") + .replace("-", "_") + .strip() + .casefold() + ) + if doc_selection_method not in allowed_methods: + msg = ( + f"Invalid doc_selection_method: {og_doc_selection_method!r}. " + f"Allowed options are: {sorted(allowed_methods)}." 
+ ) + raise COMPASSPluginConfigurationError(msg) + + return doc_selection_method diff --git a/examples/water_rights_demo/one-shot/plugin_config.yaml b/examples/water_rights_demo/one-shot/plugin_config.yaml index b5f7e47a..0627bc8b 100755 --- a/examples/water_rights_demo/one-shot/plugin_config.yaml +++ b/examples/water_rights_demo/one-shot/plugin_config.yaml @@ -2,7 +2,7 @@ schema: ./water_rights_schema.json5 data_type_short_desc: water rights and regulations -allow_multi_doc_extraction: True # Important for water rights! +doc_selection_method: "multi doc context" # Important for water rights! query_templates: - "{jurisdiction} rules" From a4b9bfb7c8e17a7e4c41c1913ef0afe721559cdb Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:58:16 -0600 Subject: [PATCH 04/15] Implement the multi-doc concat method --- compass/plugin/ordinance.py | 137 +++++++++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 32 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 998bbbc7..79f2092e 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -596,8 +596,8 @@ class OrdinanceExtractionPlugin(FilteredExtractionPlugin): methods as needed. """ - ALLOW_MULTI_DOC_EXTRACTION = False - """bool: Whether to allow extraction over multiple documents""" + DOC_SELECTION_METHOD = "single doc" + """str: Only allow one document to be output""" @property @abstractmethod @@ -701,13 +701,71 @@ async def parse_docs_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. 
""" - if self.ALLOW_MULTI_DOC_EXTRACTION: + if self.DOC_SELECTION_METHOD == "single_doc": + return await self.parse_single_doc_for_structured_data( + extraction_context + ) + + if self.DOC_SELECTION_METHOD == "multi_doc_context": return await self.parse_multi_doc_context_for_structured_data( extraction_context ) - return await self.parse_single_doc_for_structured_data( - extraction_context + + if self.DOC_SELECTION_METHOD == "multi_doc_all": + return await self.parse_multi_doc_concat(extraction_context) + + if self.DOC_SELECTION_METHOD == "multi_doc_mixed": + msg = "TODO" + raise NotImplementedError(msg) + + msg = ( + f"Invalid DOC_SELECTION_METHOD: {self.DOC_SELECTION_METHOD!r}. " + "Supported methods are: 'single_doc' and 'multi_doc_context'." ) + raise COMPASSPluginConfigurationError(msg) + + async def parse_single_doc_for_structured_data(self, extraction_context): + """Parse documents one at a time to extract structured data + + The first document to return some extracted data will be marked + as the source and will be returned from this method. + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. + """ + for doc_for_extraction in extraction_context: + data_df = await self.parse_for_structured_data(doc_for_extraction) + row_count = self.get_structured_data_row_count(data_df) + if row_count > 0: + data_df["source"] = doc_for_extraction.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs( + doc_for_extraction.attrs + ) + await extraction_context.mark_doc_as_data_source( + doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + ) + extraction_context.attrs["structured_data"] = data_df + logger.info( + "%d ordinance value(s) found in doc from %s for %s. 
", + num_ordinances_dataframe(data_df), + doc_for_extraction.attrs.get("source", "unknown source"), + self.jurisdiction.full_name, + ) + return extraction_context + + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None async def parse_multi_doc_context_for_structured_data( self, extraction_context @@ -755,11 +813,8 @@ async def parse_multi_doc_context_for_structured_data( ) return extraction_context - async def parse_single_doc_for_structured_data(self, extraction_context): - """Parse documents one at a time to extract structured data - - The first document to return some extracted data will be marked - as the source and will be returned from this method. + async def parse_multi_doc_concat(self, extraction_context): + """Parse all documents and concatenate extracted data Parameters ---------- @@ -772,31 +827,49 @@ async def parse_single_doc_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. """ - for doc_for_extraction in extraction_context: - data_df = await self.parse_for_structured_data(doc_for_extraction) + + tasks = [ + asyncio.create_task( + self.parse_for_structured_data(doc_for_extraction), + name=self.jurisdiction.full_name, + ) + for doc_for_extraction in extraction_context + ] + data_dfs = await asyncio.gather(*tasks) + + all_data = [] + for data_df, doc_for_extraction in zip( + data_dfs, extraction_context, strict=True + ): row_count = self.get_structured_data_row_count(data_df) - if row_count > 0: - data_df["source"] = doc_for_extraction.attrs.get("source") - data_df["year"] = extract_year_from_doc_attrs( - doc_for_extraction.attrs - ) - await extraction_context.mark_doc_as_data_source( - doc_for_extraction, out_fn_stem=self.jurisdiction.full_name - ) - extraction_context.attrs["structured_data"] = data_df - logger.info( - "%d ordinance value(s) found in doc from %s for %s. 
", - num_ordinances_dataframe(data_df), - doc_for_extraction.attrs.get("source", "unknown source"), - self.jurisdiction.full_name, - ) - return extraction_context + if row_count == 0: + continue + data_df["source"] = doc_for_extraction.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs( + doc_for_extraction.attrs + ) + await extraction_context.mark_doc_as_data_source( + doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + ) + logger.info( + "%d ordinance value(s) found in doc from %s for %s. ", + num_ordinances_dataframe(data_df), + doc_for_extraction.attrs.get("source", "unknown source"), + self.jurisdiction.full_name, + ) + all_data.append(data_df) - logger.debug( - "No ordinances found; searched %d docs", - extraction_context.num_documents, + if not all_data: + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None + + extraction_context.attrs["structured_data"] = pd.concat( + all_data, ignore_index=True ) - return None + return extraction_context async def parse_for_structured_data(self, source): """Extract all possible structured data from a document From 9e9ca809fd07bffb0f13cf7aecadaa82a7f2db39 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 16:54:39 -0600 Subject: [PATCH 05/15] Correct out file Co-authored-by: Copilot --- compass/plugin/ordinance.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 79f2092e..1bd33223 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -838,24 +838,22 @@ async def parse_multi_doc_concat(self, extraction_context): data_dfs = await asyncio.gather(*tasks) all_data = [] - for data_df, doc_for_extraction in zip( - data_dfs, extraction_context, strict=True + for doc_ind, (data_df, doc) in enumerate( + zip(data_dfs, extraction_context, strict=True), start=1 ): row_count = self.get_structured_data_row_count(data_df) if 
row_count == 0: continue - data_df["source"] = doc_for_extraction.attrs.get("source") - data_df["year"] = extract_year_from_doc_attrs( - doc_for_extraction.attrs - ) + data_df["source"] = doc.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs(doc.attrs) await extraction_context.mark_doc_as_data_source( - doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + doc, out_fn_stem=f"{self.jurisdiction.full_name}_{doc_ind}" ) logger.info( - "%d ordinance value(s) found in doc from %s for %s. ", + "%d ordinance value(s) found for %s from doc:\n%s. ", num_ordinances_dataframe(data_df), - doc_for_extraction.attrs.get("source", "unknown source"), self.jurisdiction.full_name, + doc, ) all_data.append(data_df) From 3f80c6f4f68e9e49df92358a464ce5ae1da82226 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 16:56:17 -0600 Subject: [PATCH 06/15] Adjust logger --- compass/plugin/ordinance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 1bd33223..50379af8 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -754,10 +754,10 @@ async def parse_single_doc_for_structured_data(self, extraction_context): ) extraction_context.attrs["structured_data"] = data_df logger.info( - "%d ordinance value(s) found in doc from %s for %s. ", + "%d ordinance value(s) found for %s from doc:\n%s. ", num_ordinances_dataframe(data_df), - doc_for_extraction.attrs.get("source", "unknown source"), self.jurisdiction.full_name, + doc_for_extraction, ) return extraction_context @@ -806,10 +806,10 @@ async def parse_multi_doc_context_for_structured_data( extraction_context.attrs["structured_data"] = data_df logger.info( - "%d ordinance value(s) found in %d docs for %s. ", + "%d ordinance value(s) found for %s in %d docs. 
", num_ordinances_dataframe(data_df), - extraction_context.num_documents, self.jurisdiction.full_name, + extraction_context.num_documents, ) return extraction_context From d0732b338531bdcc1f6b02c76b47961c3cefaaea Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:13:47 -0600 Subject: [PATCH 07/15] Formatting --- compass/plugin/ordinance.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 50379af8..2c2ef98f 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -844,6 +844,7 @@ async def parse_multi_doc_concat(self, extraction_context): row_count = self.get_structured_data_row_count(data_df) if row_count == 0: continue + data_df["source"] = doc.attrs.get("source") data_df["year"] = extract_year_from_doc_attrs(doc.attrs) await extraction_context.mark_doc_as_data_source( From cc262bf9a4403505466653d29ee927b4a6604789 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:24:00 -0600 Subject: [PATCH 08/15] Add `parse_multi_doc_merge` Co-authored-by: Copilot --- compass/plugin/ordinance.py | 189 +++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 4 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 2c2ef98f..35a9a094 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -2,6 +2,7 @@ import asyncio import logging +import operator from warnings import warn from textwrap import dedent from itertools import chain @@ -715,8 +716,7 @@ async def parse_docs_for_structured_data(self, extraction_context): return await self.parse_multi_doc_concat(extraction_context) if self.DOC_SELECTION_METHOD == "multi_doc_mixed": - msg = "TODO" - raise NotImplementedError(msg) + return await self.parse_multi_doc_merge(extraction_context) msg = ( f"Invalid DOC_SELECTION_METHOD: {self.DOC_SELECTION_METHOD!r}. 
" @@ -727,8 +727,17 @@ async def parse_docs_for_structured_data(self, extraction_context): async def parse_single_doc_for_structured_data(self, extraction_context): """Parse documents one at a time to extract structured data - The first document to return some extracted data will be marked - as the source and will be returned from this method. + This mode evaluates candidate documents in sequence and stops + at the first document that produces ordinance data. Once a + usable source is found, later candidate documents are not used + to supplement, compare, or override that result. This is the + simplest selection strategy and is best suited to workflows + where one document is expected to contain the authoritative + ordinance language on its own. + + Documents are expected to come sorted by priority, with the most + likely source of ordinance language appearing first in the + `extraction_context`. Parameters ---------- @@ -772,6 +781,16 @@ async def parse_multi_doc_context_for_structured_data( ): """Parse all documents to extract structured data/information + This mode combines the relevant text from all candidate + documents into one shared extraction context before structured + data are parsed. It is useful when the information needed for a + single ordinance feature may be split across multiple sources + and should be interpreted together rather than compared as + separate document-level outputs. When source references can be + recovered from the extracted rows, each row is mapped back to + its originating document; otherwise the result falls back to + reporting the full document set as the source context. + Parameters ---------- extraction_context : ExtractionContext @@ -816,6 +835,14 @@ async def parse_multi_doc_context_for_structured_data( async def parse_multi_doc_concat(self, extraction_context): """Parse all documents and concatenate extracted data + This mode keeps all extracted ordinance rows from every + candidate document that produced structured data. 
Unlike the + merge mode, it does not try to choose a single best row for a + feature or resolve conflicts between sources. If the same + feature is extracted from multiple ordinances, each version is + preserved in the output with its own source and year so users + can compare the results directly. + Parameters ---------- extraction_context : ExtractionContext @@ -870,6 +897,79 @@ async def parse_multi_doc_concat(self, extraction_context): ) return extraction_context + async def parse_multi_doc_merge(self, extraction_context): + """Parse all documents and merge the extracted data + + This mode keeps at most one row per extracted feature across + all candidate documents. When every document with extracted + data has a known ordinance year, newer ordinances take + precedence and older ordinances are only used to fill in + features that are missing from the newer sources. If any + candidate document has an unknown year, documents are instead + prioritized by how many ordinance features they contain. + + Documents with extracted prohibitions are treated specially. + If any candidate document contains a prohibition, only + prohibition-bearing documents are considered for the final + merged output. The returned rows keep the source and year of + the document they came from so downstream consumers can still + trace each retained feature back to its originating ordinance. + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. 
+ """ + + tasks = [ + asyncio.create_task( + self.parse_for_structured_data(doc_for_extraction), + name=self.jurisdiction.full_name, + ) + for doc_for_extraction in extraction_context + ] + data_dfs = await asyncio.gather(*tasks) + + candidates = [] + for doc_ind, (data_df, doc) in enumerate( + zip(data_dfs, extraction_context, strict=True), start=1 + ): + row_count = self.get_structured_data_row_count(data_df) + if row_count == 0: + continue + + data_df["source"] = doc.attrs.get("source") + data_df["year"] = year = extract_year_from_doc_attrs(doc.attrs) + candidates.append( + { + "data_df": data_df, + "doc": doc, + "doc_ind": doc_ind, + "row_count": row_count, + "year": year, + } + ) + + if not candidates: + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None + + candidates = _filter_to_prohibition_cands_if_needed(candidates) + candidates = _prioritize_candidates(candidates) + extraction_context.attrs["structured_data"] = await _merge_candidates( + candidates, extraction_context, self.jurisdiction.full_name + ) + return extraction_context + async def parse_for_structured_data(self, source): """Extract all possible structured data from a document @@ -1262,3 +1362,84 @@ async def _fill_in_all_sources(data_df, extraction_context, out_fn_stem): ) return data_df + + +def _filter_to_prohibition_cands_if_needed(candidates): + """Filter to just candidates with prohibitions, if any""" + prohibition_candidates = [ + candidate + for candidate in candidates + if _has_prohibitions(candidate["data_df"]) + ] + return prohibition_candidates or candidates + + +def _prioritize_candidates(candidates): + """Sort candidates by year (only if all have years) and row count""" + if len(candidates) <= 1: + return candidates + + if all(candidate["year"] is not None for candidate in candidates): + return sorted( + candidates, + key=operator.itemgetter("year", "row_count"), + reverse=True, + ) + + return sorted( + candidates, + 
key=operator.itemgetter("row_count"), + reverse=True, + ) + + +async def _merge_candidates(candidates, extraction_context, out_stem): + """Merge extracted features while respecting candidate priority""" + merged_rows = [] + merged_features = set() + contributing_candidates = [] + for candidate in candidates: + candidate_rows = [] + for _, row in candidate["data_df"].iterrows(): + feature_key = _feature_key(row.get("feature")) + if feature_key is None or feature_key in merged_features: + continue + + merged_features.add(feature_key) + candidate_rows.append(row.to_dict()) + + if not candidate_rows: + continue + + merged_rows.extend(candidate_rows) + contributing_candidates.append(candidate) + + if not merged_rows: + return None + + for candidate in contributing_candidates: + await extraction_context.mark_doc_as_data_source( + candidate["doc"], + out_fn_stem=f"{out_stem}_{candidate['doc_ind']}", + ) + + return pd.DataFrame(merged_rows).reset_index(drop=True) + + +def _feature_key(feature): + """Get normalized feature key""" + if pd.isna(feature): + return None + return str(feature).strip().casefold() + + +def _has_prohibitions(data_df): + """Check for prohibition in data""" + if data_df is None or data_df.empty or "feature" not in data_df: + return False + + prohibition_mask = data_df["feature"].map(_feature_key).eq("prohibitions") + if not prohibition_mask.any(): + return False + + return num_ordinances_dataframe(data_df.loc[prohibition_mask]) > 0 From 126658b03c16f68fb4470a665be3b0ee4259a958 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:25:14 -0600 Subject: [PATCH 09/15] Add tests Co-authored-by: Copilot --- .../unit/plugin/test_plugin_ordinances.py | 259 ++++++++++++++++++ 1 file changed, 259 insertions(+) diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index 074c2daa..141e5cba 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ 
b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -1,7 +1,11 @@ """COMPASS ordinance plugin tests""" +import asyncio +from collections import UserList from pathlib import Path +from types import SimpleNamespace +import pandas as pd import pytest from compass.plugin.ordinance import ( @@ -13,6 +17,75 @@ from compass.exceptions import COMPASSPluginConfigurationError +class MergePlugin(OrdinanceExtractionPlugin): + """Concrete ordinance plugin for merge tests""" + + TEXT_COLLECTORS = [] + TEXT_EXTRACTORS = [] + PARSERS = [] + + IDENTIFIER = "test" + WEBSITE_KEYWORDS = ["test"] + QUERY_TEMPLATES = ["test"] + HEURISTIC = None + + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + + +class FakeDoc: + def __init__(self, source, year=None, structured_data=None): + self.attrs = {"source": source} + if year is not None: + self.attrs["date"] = (year, 1, 1) + if structured_data is not None: + self.attrs["structured_data"] = structured_data + + +class FakeExtractionContext(UserList): + """List-like extraction context for merge tests""" + + def __init__(self, docs): + super().__init__(docs) + self.attrs = {} + self.marked_sources = [] + + @property + def num_documents(self): + return len(self) + + async def mark_doc_as_data_source(self, doc, out_fn_stem): + self.marked_sources.append((doc.attrs.get("source"), out_fn_stem)) + + +@pytest.fixture +def merge_plugin(): + """Build a concrete plugin for merge-path tests""" + + plugin = MergePlugin(None, None, None) + plugin.jurisdiction = SimpleNamespace(full_name="Test County") + return plugin + + +def _data_df(*rows): + return pd.DataFrame(rows) + + +async def _run_multi_doc_merge(plugin, context, data_dfs): + """Run the public merge path with controlled per-doc outputs""" + + for doc, data_df in zip(context, data_dfs, strict=True): + doc.attrs["structured_data"] = data_df + + async def _fake_parse_for_structured_data(doc): + await asyncio.sleep(0) + return 
doc.attrs["structured_data"] + + plugin.parse_for_structured_data = _fake_parse_for_structured_data + out = await plugin.parse_multi_doc_merge(context) + return out.attrs["structured_data"] + + def test_plugin_validation_parse_key_same(): """Test plugin interface validation logic""" @@ -189,5 +262,191 @@ async def parse_docs_for_structured_data(self, extraction_context): MYPlugin(None, None, None).validate_plugin_configuration() +@pytest.mark.asyncio +async def test_merge_multi_doc_data_prefers_latest_year(merge_plugin): + """Latest dated doc should win overlapping features""" + + context = FakeExtractionContext( + [ + FakeDoc("older", 2021), + FakeDoc("newer", 2024), + ] + ) + data_dfs = [ + _data_df( + {"feature": "setback", "value": 100, "summary": "old"}, + {"feature": "height", "value": 80, "summary": "old"}, + ), + _data_df( + {"feature": "setback", "value": 150, "summary": "new"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + assert set(merged["feature"].str.casefold()) == {"setback", "height"} + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + height = merged.loc[merged["feature"].str.casefold() == "height"] + assert setback.iloc[0]["value"] == 150 + assert setback.iloc[0]["source"] == "newer" + assert setback.iloc[0]["year"] == 2024 + assert height.iloc[0]["value"] == 80 + assert height.iloc[0]["source"] == "older" + assert height.iloc[0]["year"] == 2021 + assert context.marked_sources == [ + ("newer", "Test County_2"), + ("older", "Test County_1"), + ] + + +@pytest.mark.asyncio +async def test_merge_multi_doc_data_falls_back_to_ordinance_count( + merge_plugin, +): + """Unknown years should fall back to ordinance count priority""" + + context = FakeExtractionContext( + [ + FakeDoc("unknown-year"), + FakeDoc("known-year", 2025), + ] + ) + data_dfs = [ + _data_df( + {"feature": "setback", "value": 100, "summary": "one"}, + {"feature": "height", "value": 50, "summary": "two"}, + ), + _data_df( + 
{"feature": "setback", "value": 200, "summary": "other"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + assert setback.iloc[0]["value"] == 100 + assert setback.iloc[0]["source"] == "unknown-year" + assert pd.isna(setback.iloc[0]["year"]) + + +@pytest.mark.asyncio +async def test_merge_multi_doc_data_breaks_year_ties_by_row_count( + merge_plugin, +): + """Equal years should break ties using ordinance count""" + + context = FakeExtractionContext( + [ + FakeDoc("fewer", 2024), + FakeDoc("more", 2024), + ] + ) + data_dfs = [ + _data_df( + {"feature": "setback", "value": 100, "summary": "one"}, + ), + _data_df( + {"feature": "setback", "value": 200, "summary": "two"}, + {"feature": "height", "value": 70, "summary": "two"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + assert setback.iloc[0]["value"] == 200 + assert setback.iloc[0]["source"] == "more" + + +@pytest.mark.asyncio +async def test_merge_multi_doc_data_limits_to_prohibition_documents( + merge_plugin, +): + """Any prohibition should limit merging to prohibition docs only""" + + context = FakeExtractionContext( + [ + FakeDoc("prohibition-older", 2022), + FakeDoc("prohibition-newer", 2024), + FakeDoc("non-prohibition", 2026), + ] + ) + data_dfs = [ + _data_df( + { + "feature": "prohibitions", + "value": None, + "summary": "older prohibition", + }, + {"feature": "height", "value": 90, "summary": "older"}, + ), + _data_df( + { + "feature": "Prohibitions", + "value": None, + "summary": "newer prohibition", + }, + {"feature": "setback", "value": 300, "summary": "newer"}, + ), + _data_df( + {"feature": "noise", "value": 45, "summary": "ignored"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + assert set(merged["feature"].str.casefold()) == { + "prohibitions", + 
"setback", + "height", + } + assert "noise" not in set(merged["feature"].str.casefold()) + prohibition = merged.loc[ + merged["feature"].str.casefold() == "prohibitions" + ] + assert prohibition.iloc[0]["source"] == "prohibition-newer" + assert context.marked_sources == [ + ("prohibition-newer", "Test County_2"), + ("prohibition-older", "Test County_1"), + ] + + +@pytest.mark.asyncio +async def test_parse_multi_doc_merge_returns_context(merge_plugin): + """Public merge path should attach merged structured data""" + + docs = [ + FakeDoc( + "older", + 2022, + _data_df( + {"feature": "height", "value": 60, "summary": "older"}, + ), + ), + FakeDoc( + "newer", + 2024, + _data_df( + {"feature": "setback", "value": 100, "summary": "newer"}, + ), + ), + ] + context = FakeExtractionContext(docs) + + async def _fake_parse_for_structured_data(doc): + await asyncio.sleep(0) + return doc.attrs["structured_data"] + + merge_plugin.parse_for_structured_data = _fake_parse_for_structured_data + + out = await merge_plugin.parse_multi_doc_merge(context) + + assert out is context + assert set(out.attrs["structured_data"]["feature"].str.casefold()) == { + "setback", + "height", + } + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) From f43e95e61424883b3e684505e03169f1c9303a91 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:28:44 -0600 Subject: [PATCH 10/15] Add tests Co-authored-by: Copilot --- .../unit/plugin/test_plugin_ordinances.py | 233 +++++++++++++++++- 1 file changed, 232 insertions(+), 1 deletion(-) diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index 141e5cba..e7e2bd55 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -13,8 +13,21 @@ BaseTextExtractor, BaseParser, OrdinanceExtractionPlugin, + _feature_key, + _fill_in_all_sources, + _fill_out_multi_file_sources, + 
_filter_to_prohibition_cands_if_needed, + _get_source_inds, + _has_prohibitions, + _merge_candidates, + _prioritize_candidates, + _valid_chunk, + _validate_in_out_keys, +) +from compass.exceptions import ( + COMPASSPluginConfigurationError, + COMPASSRuntimeError, ) -from compass.exceptions import COMPASSPluginConfigurationError class MergePlugin(OrdinanceExtractionPlugin): @@ -448,5 +461,223 @@ async def _fake_parse_for_structured_data(doc): } +@pytest.mark.parametrize( + "chunk,expected", + [("Useful text", True), ("No relevant text.", False), ("", "")], +) +def test_valid_chunk(chunk, expected): + """Helper should reject empty and negative extraction responses""" + + assert _valid_chunk(chunk) == expected + + +def test_validate_in_out_keys_raises_for_missing_key(): + """Helper should fail when no producer satisfies a required input""" + + class Producer: + OUT_LABEL = "produced" + + class Consumer: + IN_LABEL = "missing" + + with pytest.raises( + COMPASSPluginConfigurationError, + match=r"IN_LABEL 'missing'", + ): + _validate_in_out_keys([Consumer], [Producer]) + + +def test_get_source_inds_returns_integer_indices(): + """Helper should extract integer source indices from rows""" + + data_df = _data_df( + {"feature": "setback", "source": 0}, + {"feature": "height", "source": 1}, + {"feature": "noise", "source": 1}, + ) + + source_inds = _get_source_inds(data_df, 3) + + assert list(source_inds) == [0, 1] + + +@pytest.mark.parametrize( + "data_df,num_docs,match", + [ + (_data_df({"feature": "setback"}), 2, "column not found"), + ( + _data_df({"feature": "setback", "source": "one"}), + 2, + "non-integer values", + ), + ( + _data_df({"feature": "setback", "source": 2}), + 2, + "out-of-bounds indices", + ), + ], +) +def test_get_source_inds_raises_for_invalid_source_values( + data_df, num_docs, match +): + """Helper should reject missing, invalid, and out-of-range sources""" + + with pytest.raises(COMPASSRuntimeError, match=match): + _get_source_inds(data_df, num_docs) 
+ + +@pytest.mark.asyncio +async def test_fill_out_multi_file_sources_maps_valid_source_indices(): + """Helper should map per-row source indices back to document metadata""" + + context = FakeExtractionContext( + [FakeDoc("doc-one", 2021), FakeDoc("doc-two", 2024)] + ) + data_df = _data_df( + {"feature": "setback", "source": 0}, + {"feature": "height", "source": 1}, + ) + + filled = await _fill_out_multi_file_sources(data_df, context, "County") + + assert list(filled["source"]) == ["doc-one", "doc-two"] + assert list(filled["year"]) == [2021, 2024] + assert context.marked_sources == [ + ("doc-one", "County_1"), + ("doc-two", "County_2"), + ] + + +@pytest.mark.asyncio +async def test_fill_in_all_sources_reports_full_context_when_needed(): + """Fallback helper should report all documents when row sources fail""" + + context = FakeExtractionContext( + [FakeDoc("doc-one", 2020), FakeDoc("doc-two", 2024)] + ) + data_df = _data_df({"feature": "setback", "value": 100}) + + filled = await _fill_in_all_sources(data_df, context, "County") + + assert filled.iloc[0]["source"] == "doc-one ;\ndoc-two" + assert filled.iloc[0]["year"] == 2024 + assert context.marked_sources == [ + ("doc-one", "County_1"), + ("doc-two", "County_2"), + ] + + +def test_feature_key_normalizes_values_and_handles_missing(): + """Feature-key helper should normalize strings and preserve missing""" + + assert _feature_key(" Prohibitions ") == "prohibitions" + assert _feature_key(pd.NA) is None + + +def test_has_prohibitions_requires_ordinance_content(): + """Prohibition helper should only flag rows with actual ordinance data""" + + with_prohibition = _data_df( + {"feature": "Prohibitions", "summary": "Wind is prohibited."} + ) + without_prohibition = _data_df( + {"feature": "Prohibitions", "summary": None, "value": None} + ) + + assert _has_prohibitions(with_prohibition) + assert not _has_prohibitions(without_prohibition) + + +def test_filter_to_prohibition_candidates_only_when_present(): + """Candidate 
helper should narrow to prohibition-bearing documents""" + + candidates = [ + { + "data_df": _data_df( + {"feature": "setback", "summary": "Regular standard"} + ) + }, + { + "data_df": _data_df( + { + "feature": "prohibitions", + "summary": "Wind systems are prohibited.", + } + ) + }, + ] + + filtered = _filter_to_prohibition_cands_if_needed(candidates) + + assert filtered == [candidates[1]] + + +def test_prioritize_candidates_prefers_latest_year_then_row_count(): + """Priority helper should sort by year when every candidate has one""" + + candidates = [ + {"year": 2021, "row_count": 5}, + {"year": 2024, "row_count": 1}, + {"year": 2024, "row_count": 3}, + ] + + prioritized = _prioritize_candidates(candidates) + + assert prioritized == [candidates[2], candidates[1], candidates[0]] + + +def test_prioritize_candidates_falls_back_to_row_count_without_years(): + """Priority helper should ignore year sorting when any year is unknown""" + + candidates = [ + {"year": 2024, "row_count": 1}, + {"year": None, "row_count": 3}, + {"year": 2021, "row_count": 2}, + ] + + prioritized = _prioritize_candidates(candidates) + + assert prioritized == [candidates[1], candidates[2], candidates[0]] + + +@pytest.mark.asyncio +async def test_merge_candidates_keeps_first_feature_and_marks_sources(): + """Merge helper should keep first-seen features by candidate priority""" + + context = FakeExtractionContext([FakeDoc("older"), FakeDoc("newer")]) + candidates = [ + { + "data_df": _data_df( + {"feature": "setback", "value": 200, "source": "newer"}, + {"feature": "height", "value": 80, "source": "newer"}, + ), + "doc": context[1], + "doc_ind": 2, + }, + { + "data_df": _data_df( + {"feature": "setback", "value": 100, "source": "older"}, + {"feature": "noise", "value": 45, "source": "older"}, + ), + "doc": context[0], + "doc_ind": 1, + }, + ] + + merged = await _merge_candidates(candidates, context, "County") + + assert set(merged["feature"].str.casefold()) == { + "setback", + "height", + 
"noise", + } + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + assert setback.iloc[0]["value"] == 200 + assert context.marked_sources == [ + ("newer", "County_2"), + ("older", "County_1"), + ] + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) From 7d4fcd095c62ab51812df42d313998d3330c8531 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:46:27 -0600 Subject: [PATCH 11/15] Move to enum Co-authored-by: Copilot --- compass/plugin/__init__.py | 1 + compass/plugin/one_shot/base.py | 30 +----- compass/plugin/ordinance.py | 98 +++++++++++++++---- .../unit/plugin/test_plugin_ordinances.py | 37 +++++++ 4 files changed, 122 insertions(+), 44 deletions(-) diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 5f50ee3f..42e00488 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -9,6 +9,7 @@ from .ordinance import ( BaseTextExtractor, BaseParser, + DocSelectionMethod, KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index 0ef69e47..ad6de218 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -11,6 +11,7 @@ NoOpHeuristic, NoOpTextCollector, NoOpTextExtractor, + DocSelectionMethod, PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceExtractionPlugin, @@ -176,7 +177,9 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): SCHEMA = config["schema"] """dict: Schema for the output of the text extraction step""" - DOC_SELECTION_METHOD = _doc_selection_method(config) + DOC_SELECTION_METHOD = DocSelectionMethod.normalize( + config.get("doc_selection_method", "single doc") + ) """str: Method for selecting documents for extraction context Allowed options: @@ -598,28 +601,3 @@ def _normalize_keyword_list(items): normalized.add(keyword) return list(normalized) - - -def _doc_selection_method(config): - """Parse 
and normalize the document selection method""" - allowed_methods = { - "single_doc", - "multi_doc_context", - "multi_doc_all", - "multi_doc_mixed", - } - og_doc_selection_method = config.get("doc_selection_method", "single doc") - doc_selection_method = ( - og_doc_selection_method.replace(" ", "_") - .replace("-", "_") - .strip() - .casefold() - ) - if doc_selection_method not in allowed_methods: - msg = ( - f"Invalid doc_selection_method: {og_doc_selection_method!r}. " - f"Allowed options are: {sorted(allowed_methods)}." - ) - raise COMPASSPluginConfigurationError(msg) - - return doc_selection_method diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 35a9a094..b335b4cc 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -3,6 +3,7 @@ import asyncio import logging import operator +from enum import StrEnum from warnings import warn from textwrap import dedent from itertools import chain @@ -62,6 +63,63 @@ } +class DocSelectionMethod(StrEnum): + """Document selection modes for structured extraction""" + + SINGLE_DOC = "single_doc" + """Evaluate candidate documents one at a time until data is found""" + MULTI_DOC_CONTEXT = "multi_doc_context" + """Combine multiple documents into one extraction context""" + MULTI_DOC_ALL = "multi_doc_all" + """Parse each document separately and keep all extracted rows""" + MULTI_DOC_MIXED = "multi_doc_mixed" + """Parse separately and merge rows so each feature appears once""" + + @classmethod + def normalize(cls, value): + """Normalize a config value into a selection mode + + Parameters + ---------- + value : str or DocSelectionMethod + Input selection mode from plugin configuration or an + existing enum value. + + Returns + ------- + DocSelectionMethod + Normalized document selection mode. + + Raises + ------ + COMPASSPluginConfigurationError + Raised if ``value`` is not a string or enum member, or if + it does not map to a supported selection mode. 
+ """ + if isinstance(value, cls): + return value + + if not isinstance(value, str): + msg = ( + "doc_selection_method must be a string or " + f"{cls.__name__} value." + ) + raise COMPASSPluginConfigurationError(msg) + + normalized = ( + value.replace(" ", "_").replace("-", "_").strip().casefold() + ) + try: + return cls(normalized) + except ValueError as err: + msg = ( + f"Invalid doc_selection_method: {value!r}. " + "Allowed options are: " + f"{sorted(method.value for method in cls)}." + ) + raise COMPASSPluginConfigurationError(msg) from err + + class BaseTextExtractor(BaseLLMCaller, ABC): """Extract succinct extraction text from input""" @@ -597,7 +655,7 @@ class OrdinanceExtractionPlugin(FilteredExtractionPlugin): methods as needed. """ - DOC_SELECTION_METHOD = "single doc" + DOC_SELECTION_METHOD = DocSelectionMethod.SINGLE_DOC """str: Only allow one document to be output""" @property @@ -702,27 +760,31 @@ async def parse_docs_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. 
""" - if self.DOC_SELECTION_METHOD == "single_doc": - return await self.parse_single_doc_for_structured_data( - extraction_context - ) + match DocSelectionMethod.normalize(self.DOC_SELECTION_METHOD): + case DocSelectionMethod.SINGLE_DOC: + return await self.parse_single_doc_for_structured_data( + extraction_context + ) - if self.DOC_SELECTION_METHOD == "multi_doc_context": - return await self.parse_multi_doc_context_for_structured_data( - extraction_context - ) + case DocSelectionMethod.MULTI_DOC_CONTEXT: + return await self.parse_multi_doc_context_for_structured_data( + extraction_context + ) - if self.DOC_SELECTION_METHOD == "multi_doc_all": - return await self.parse_multi_doc_concat(extraction_context) + case DocSelectionMethod.MULTI_DOC_ALL: + return await self.parse_multi_doc_concat(extraction_context) - if self.DOC_SELECTION_METHOD == "multi_doc_mixed": - return await self.parse_multi_doc_merge(extraction_context) + case DocSelectionMethod.MULTI_DOC_MIXED: + return await self.parse_multi_doc_merge(extraction_context) - msg = ( - f"Invalid DOC_SELECTION_METHOD: {self.DOC_SELECTION_METHOD!r}. " - "Supported methods are: 'single_doc' and 'multi_doc_context'." - ) - raise COMPASSPluginConfigurationError(msg) + case _: + msg = ( + "Invalid DOC_SELECTION_METHOD: " + f"{self.DOC_SELECTION_METHOD!r}. " + "Supported methods are: " + f"{sorted(method.value for method in DocSelectionMethod)}." 
+ ) + raise COMPASSPluginConfigurationError(msg) async def parse_single_doc_for_structured_data(self, extraction_context): """Parse documents one at a time to extract structured data diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index e7e2bd55..e7a581d5 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -12,6 +12,7 @@ BaseTextCollector, BaseTextExtractor, BaseParser, + DocSelectionMethod, OrdinanceExtractionPlugin, _feature_key, _fill_in_all_sources, @@ -275,6 +276,42 @@ async def parse_docs_for_structured_data(self, extraction_context): MYPlugin(None, None, None).validate_plugin_configuration() +@pytest.mark.asyncio +async def test_parse_docs_for_structured_data_accepts_enum_value(): + """Enum-valued doc selection should dispatch correctly""" + + class MYPlugin(OrdinanceExtractionPlugin): + TEXT_COLLECTORS = [] + TEXT_EXTRACTORS = [] + PARSERS = [] + + IDENTIFIER = "test" + WEBSITE_KEYWORDS = ["test"] + QUERY_TEMPLATES = ["test"] + HEURISTIC = None + DOC_SELECTION_METHOD = DocSelectionMethod.MULTI_DOC_ALL + + async def parse_single_doc_for_structured_data( + self, extraction_context + ): + raise AssertionError("wrong dispatch") + + async def parse_multi_doc_context_for_structured_data( + self, extraction_context + ): + raise AssertionError("wrong dispatch") + + async def parse_multi_doc_concat(self, extraction_context): + return "concat" + + async def parse_multi_doc_merge(self, extraction_context): + raise AssertionError("wrong dispatch") + + plugin = MYPlugin(None, None, None) + + assert await plugin.parse_docs_for_structured_data(None) == "concat" + + @pytest.mark.asyncio async def test_merge_multi_doc_data_prefers_latest_year(merge_plugin): """Latest dated doc should win overlapping features""" From 25962c09251002bc0efaf5fce8abfe0165c6de0d Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:03:50 -0600 
Subject: [PATCH 12/15] Fix function --- compass/plugin/ordinance.py | 2 +- tests/python/unit/plugin/test_plugin_ordinances.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index b335b4cc..e9508f45 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -1298,7 +1298,7 @@ def _register_clean_file_names(self): def _valid_chunk(chunk): """True if chunk has content""" - return chunk and "no relevant text" not in chunk.lower() + return bool(chunk and "no relevant text" not in chunk.lower()) def _validate_in_out_keys(consumers, producers): diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index e7a581d5..6f0f2b03 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -500,7 +500,7 @@ async def _fake_parse_for_structured_data(doc): @pytest.mark.parametrize( "chunk,expected", - [("Useful text", True), ("No relevant text.", False), ("", "")], + [("Useful text", True), ("No relevant text.", False), ("", False)], ) def test_valid_chunk(chunk, expected): """Helper should reject empty and negative extraction responses""" From f21cafc6852a8e0776c4c616841c19fd3e3d16a6 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:03:55 -0600 Subject: [PATCH 13/15] Fix typo --- compass/plugin/one_shot/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index ad6de218..d7ca83ee 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -126,7 +126,7 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 data extraction step. - `doc_selection_method`: String defining the multi-doc selection option. 
Specifically, if multiple documents pass - the filter, this method detemines how the documents are + the filter, this method determines how the documents are submitted to the extraction context. Allowed options are: - "single doc": Use the first document that returns some From ebc472481a952676b9b34ba75fd0809a750363be Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:07:52 -0600 Subject: [PATCH 14/15] PR review update --- compass/plugin/ordinance.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index e9508f45..8e53180b 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -1461,19 +1461,21 @@ async def _merge_candidates(candidates, extraction_context, out_stem): merged_features = set() contributing_candidates = [] for candidate in candidates: - candidate_rows = [] - for _, row in candidate["data_df"].iterrows(): - feature_key = _feature_key(row.get("feature")) - if feature_key is None or feature_key in merged_features: - continue - - merged_features.add(feature_key) - candidate_rows.append(row.to_dict()) + data_df = candidate["data_df"] + if data_df is None or data_df.empty or "feature" not in data_df: + continue - if not candidate_rows: + feature_keys = data_df["feature"].map(_feature_key) + keep_mask = feature_keys.notna() + if merged_features: + keep_mask &= ~feature_keys.isin(merged_features) + keep_mask &= ~feature_keys.duplicated() + if not keep_mask.any(): continue - merged_rows.extend(candidate_rows) + selected_feature_keys = feature_keys.loc[keep_mask] + merged_features.update(selected_feature_keys.tolist()) + merged_rows.extend(data_df.loc[keep_mask].to_dict("records")) contributing_candidates.append(candidate) if not merged_rows: From 4890b00feeb6cefc152955e3cef0d7c00890ef42 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:09:47 -0600 Subject: [PATCH 15/15] No docling in tox for now --- tox.ini | 4 +--- 1 
file changed, 1 insertion(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index cbe07c74..ff3ad93b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,7 @@ [tox] min_version = 4.26 envlist = - py{312,313,314}-cl{817}-docl{290}-lts{1}-nx{342}-nltk{391}-np{224}-oai{234}-pd{223}-pw{149}-pj{5168}-rich{1394}-toml{0102} + py{312,313,314}-cl{817}-lts{1}-nx{342}-nltk{391}-np{224}-oai{234}-pd{223}-pw{149}-pj{5168}-rich{1394}-toml{0102} [gh-actions] python = @@ -15,7 +15,6 @@ commands = pytest tests --dist loadscope {posargs} deps = cl817: click>=8.1.7,<9 c4ai063: crawl4ai>=0.6.3,<0.7 - docl290: docling>=2.90.0,<3 lts1: langchain-text-splitters>=1.0.0,<2 nx342: networkx>=3.4.2,<4 nltk391: nltk>=3.9.1,<4 @@ -38,7 +37,6 @@ description = minimum supported versions deps= click~=8.1.7 crawl4ai~=0.6.3 - docling~=2.90.0 langchain-text-splitters~=1.0.0 networkx~=3.4.2 nltk~=3.9.1