From c4ff97a12c3d4da8b04f20cabad386b5cdad2728 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:56:44 -0600 Subject: [PATCH 01/15] Better logging --- compass/scripts/download.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 3393dd58..f89f7f29 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -695,9 +695,7 @@ async def filter_ordinance_docs( "Found %d potential ordinance documents for %s\n\t- %s", len(docs), jurisdiction.full_name, - "\n\t- ".join( - [doc.attrs.get("source", "Unknown source") for doc in docs] - ), + "\n\t- ".join([str(doc) for doc in docs]), ) return docs From eaff498063129f606346da32fb56b10396272c43 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:56:54 -0600 Subject: [PATCH 02/15] Use neg inf --- compass/scripts/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/scripts/download.py b/compass/scripts/download.py index f89f7f29..11882280 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -811,7 +811,7 @@ def _sort_final_ord_docs(all_ord_docs): def _ord_doc_sorting_key(doc): """Compute a composite sorting score for ordinance documents""" - no_date = (-1, -1, -1) + no_date = (_NEG_INF, _NEG_INF, _NEG_INF) latest_year, latest_month, latest_day = doc.attrs.get("date") or no_date best_docs_from_website = doc.attrs.get(_SCORE_KEY, 0) prefer_pdf_files = isinstance(doc, PDFDocument) From b95097d485c03d825f8036bcc60c843d64fecce9 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:57:59 -0600 Subject: [PATCH 03/15] Move to `doc_selection_method` for plugin --- compass/plugin/one_shot/base.py | 73 ++++++++++++++++--- .../one-shot/plugin_config.yaml | 2 +- 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index e5254a73..0ef69e47 100644 --- a/compass/plugin/one_shot/base.py +++ 
b/compass/plugin/one_shot/base.py @@ -123,11 +123,26 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 may provide a custom system prompt if you want to provide more specific instructions to the LLM for the structured data extraction step. - - `allow_multi_doc_extraction`: Boolean flag indicating - whether to allow multiple documents to be used for the - extraction context simultaneously. By default, ``False``, - which means the first document that returns some extracted - data will be marked as the source. + - `doc_selection_method`: String defining the multi-doc + selection option. Specifically, if multiple documents pass + the filter, this method determines how the documents are + submitted to the extraction context. Allowed options are: + + - "single doc": Use the first document that returns some + extracted data as the source document for the + extraction context. + - "multi doc context": Submit text from multiple + documents to the extraction context simultaneously. + - "multi doc all": Each document is extracted separately + and the results concatenated. This may give duplicated + feature results if the same feature is mentioned in + multiple documents. + - "multi doc mixed": Each document is extracted + separately and the results are merged together at the + end. In this approach, each feature is reported at + most once. + + By default, ``"single doc"``. 
tech : str Technology identifier to use for the plugin (e.g., "wind", @@ -161,10 +176,25 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): SCHEMA = config["schema"] """dict: Schema for the output of the text extraction step""" - ALLOW_MULTI_DOC_EXTRACTION = config.get( - "allow_multi_doc_extraction", False - ) - """bool: Whether to allow extraction over multiple documents""" + DOC_SELECTION_METHOD = _doc_selection_method(config) + """str: Method for selecting documents for extraction context + + Allowed options: + + - "single doc": Use the first document that returns some + extracted data as the source document for the extraction + context. + - "multi doc context": Submit text from multiple documents + to the extraction context simultaneously. + - "multi doc all": Each document is extracted separately + and the results concatenated. This may give duplicated + feature results if the same feature is mentioned in + multiple documents. + - "multi doc mixed": Each document is extracted separately + and the results are merged together at the end. In this + approach, each feature is reported at most once. + + """ IDENTIFIER = tech """str: Identifier for extraction task """ @@ -568,3 +598,28 @@ def _normalize_keyword_list(items): normalized.add(keyword) return list(normalized) + + +def _doc_selection_method(config): + """Parse and normalize the document selection method""" + allowed_methods = { + "single_doc", + "multi_doc_context", + "multi_doc_all", + "multi_doc_mixed", + } + og_doc_selection_method = config.get("doc_selection_method", "single doc") + doc_selection_method = ( + og_doc_selection_method.replace(" ", "_") + .replace("-", "_") + .strip() + .casefold() + ) + if doc_selection_method not in allowed_methods: + msg = ( + f"Invalid doc_selection_method: {og_doc_selection_method!r}. " + f"Allowed options are: {sorted(allowed_methods)}." 
+ ) + raise COMPASSPluginConfigurationError(msg) + + return doc_selection_method diff --git a/examples/water_rights_demo/one-shot/plugin_config.yaml b/examples/water_rights_demo/one-shot/plugin_config.yaml index b5f7e47a..0627bc8b 100755 --- a/examples/water_rights_demo/one-shot/plugin_config.yaml +++ b/examples/water_rights_demo/one-shot/plugin_config.yaml @@ -2,7 +2,7 @@ schema: ./water_rights_schema.json5 data_type_short_desc: water rights and regulations -allow_multi_doc_extraction: True # Important for water rights! +doc_selection_method: "multi doc context" # Important for water rights! query_templates: - "{jurisdiction} rules" From a4b9bfb7c8e17a7e4c41c1913ef0afe721559cdb Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 15:58:16 -0600 Subject: [PATCH 04/15] Implement the multi-doc concat method --- compass/plugin/ordinance.py | 137 +++++++++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 32 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 998bbbc7..79f2092e 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -596,8 +596,8 @@ class OrdinanceExtractionPlugin(FilteredExtractionPlugin): methods as needed. """ - ALLOW_MULTI_DOC_EXTRACTION = False - """bool: Whether to allow extraction over multiple documents""" + DOC_SELECTION_METHOD = "single doc" + """str: Only allow one document to be output""" @property @abstractmethod @@ -701,13 +701,71 @@ async def parse_docs_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. 
""" - if self.ALLOW_MULTI_DOC_EXTRACTION: + if self.DOC_SELECTION_METHOD == "single_doc": + return await self.parse_single_doc_for_structured_data( + extraction_context + ) + + if self.DOC_SELECTION_METHOD == "multi_doc_context": return await self.parse_multi_doc_context_for_structured_data( extraction_context ) - return await self.parse_single_doc_for_structured_data( - extraction_context + + if self.DOC_SELECTION_METHOD == "multi_doc_all": + return await self.parse_multi_doc_concat(extraction_context) + + if self.DOC_SELECTION_METHOD == "multi_doc_mixed": + msg = "TODO" + raise NotImplementedError(msg) + + msg = ( + f"Invalid DOC_SELECTION_METHOD: {self.DOC_SELECTION_METHOD!r}. " + "Supported methods are: 'single_doc' and 'multi_doc_context'." ) + raise COMPASSPluginConfigurationError(msg) + + async def parse_single_doc_for_structured_data(self, extraction_context): + """Parse documents one at a time to extract structured data + + The first document to return some extracted data will be marked + as the source and will be returned from this method. + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. + """ + for doc_for_extraction in extraction_context: + data_df = await self.parse_for_structured_data(doc_for_extraction) + row_count = self.get_structured_data_row_count(data_df) + if row_count > 0: + data_df["source"] = doc_for_extraction.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs( + doc_for_extraction.attrs + ) + await extraction_context.mark_doc_as_data_source( + doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + ) + extraction_context.attrs["structured_data"] = data_df + logger.info( + "%d ordinance value(s) found in doc from %s for %s. 
", + num_ordinances_dataframe(data_df), + doc_for_extraction.attrs.get("source", "unknown source"), + self.jurisdiction.full_name, + ) + return extraction_context + + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None async def parse_multi_doc_context_for_structured_data( self, extraction_context @@ -755,11 +813,8 @@ async def parse_multi_doc_context_for_structured_data( ) return extraction_context - async def parse_single_doc_for_structured_data(self, extraction_context): - """Parse documents one at a time to extract structured data - - The first document to return some extracted data will be marked - as the source and will be returned from this method. + async def parse_multi_doc_concat(self, extraction_context): + """Parse all documents and concatenate extracted data Parameters ---------- @@ -772,31 +827,49 @@ async def parse_single_doc_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. """ - for doc_for_extraction in extraction_context: - data_df = await self.parse_for_structured_data(doc_for_extraction) + + tasks = [ + asyncio.create_task( + self.parse_for_structured_data(doc_for_extraction), + name=self.jurisdiction.full_name, + ) + for doc_for_extraction in extraction_context + ] + data_dfs = await asyncio.gather(*tasks) + + all_data = [] + for data_df, doc_for_extraction in zip( + data_dfs, extraction_context, strict=True + ): row_count = self.get_structured_data_row_count(data_df) - if row_count > 0: - data_df["source"] = doc_for_extraction.attrs.get("source") - data_df["year"] = extract_year_from_doc_attrs( - doc_for_extraction.attrs - ) - await extraction_context.mark_doc_as_data_source( - doc_for_extraction, out_fn_stem=self.jurisdiction.full_name - ) - extraction_context.attrs["structured_data"] = data_df - logger.info( - "%d ordinance value(s) found in doc from %s for %s. 
", - num_ordinances_dataframe(data_df), - doc_for_extraction.attrs.get("source", "unknown source"), - self.jurisdiction.full_name, - ) - return extraction_context + if row_count == 0: + continue + data_df["source"] = doc_for_extraction.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs( + doc_for_extraction.attrs + ) + await extraction_context.mark_doc_as_data_source( + doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + ) + logger.info( + "%d ordinance value(s) found in doc from %s for %s. ", + num_ordinances_dataframe(data_df), + doc_for_extraction.attrs.get("source", "unknown source"), + self.jurisdiction.full_name, + ) + all_data.append(data_df) - logger.debug( - "No ordinances found; searched %d docs", - extraction_context.num_documents, + if not all_data: + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None + + extraction_context.attrs["structured_data"] = pd.concat( + all_data, ignore_index=True ) - return None + return extraction_context async def parse_for_structured_data(self, source): """Extract all possible structured data from a document From 9e9ca809fd07bffb0f13cf7aecadaa82a7f2db39 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 16:54:39 -0600 Subject: [PATCH 05/15] Correct out file Co-authored-by: Copilot --- compass/plugin/ordinance.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 79f2092e..1bd33223 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -838,24 +838,22 @@ async def parse_multi_doc_concat(self, extraction_context): data_dfs = await asyncio.gather(*tasks) all_data = [] - for data_df, doc_for_extraction in zip( - data_dfs, extraction_context, strict=True + for doc_ind, (data_df, doc) in enumerate( + zip(data_dfs, extraction_context, strict=True), start=1 ): row_count = self.get_structured_data_row_count(data_df) if 
row_count == 0: continue - data_df["source"] = doc_for_extraction.attrs.get("source") - data_df["year"] = extract_year_from_doc_attrs( - doc_for_extraction.attrs - ) + data_df["source"] = doc.attrs.get("source") + data_df["year"] = extract_year_from_doc_attrs(doc.attrs) await extraction_context.mark_doc_as_data_source( - doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + doc, out_fn_stem=f"{self.jurisdiction.full_name}_{doc_ind}" ) logger.info( - "%d ordinance value(s) found in doc from %s for %s. ", + "%d ordinance value(s) found for %s from doc:\n%s. ", num_ordinances_dataframe(data_df), - doc_for_extraction.attrs.get("source", "unknown source"), self.jurisdiction.full_name, + doc, ) all_data.append(data_df) From 3f80c6f4f68e9e49df92358a464ce5ae1da82226 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 16:56:17 -0600 Subject: [PATCH 06/15] Adjust logger --- compass/plugin/ordinance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 1bd33223..50379af8 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -754,10 +754,10 @@ async def parse_single_doc_for_structured_data(self, extraction_context): ) extraction_context.attrs["structured_data"] = data_df logger.info( - "%d ordinance value(s) found in doc from %s for %s. ", + "%d ordinance value(s) found for %s from doc:\n%s. ", num_ordinances_dataframe(data_df), - doc_for_extraction.attrs.get("source", "unknown source"), self.jurisdiction.full_name, + doc_for_extraction, ) return extraction_context @@ -806,10 +806,10 @@ async def parse_multi_doc_context_for_structured_data( extraction_context.attrs["structured_data"] = data_df logger.info( - "%d ordinance value(s) found in %d docs for %s. ", + "%d ordinance value(s) found for %s in %d docs. 
", num_ordinances_dataframe(data_df), - extraction_context.num_documents, self.jurisdiction.full_name, + extraction_context.num_documents, ) return extraction_context From d0732b338531bdcc1f6b02c76b47961c3cefaaea Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:13:47 -0600 Subject: [PATCH 07/15] Formatting --- compass/plugin/ordinance.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 50379af8..2c2ef98f 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -844,6 +844,7 @@ async def parse_multi_doc_concat(self, extraction_context): row_count = self.get_structured_data_row_count(data_df) if row_count == 0: continue + data_df["source"] = doc.attrs.get("source") data_df["year"] = extract_year_from_doc_attrs(doc.attrs) await extraction_context.mark_doc_as_data_source( From cc262bf9a4403505466653d29ee927b4a6604789 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:24:00 -0600 Subject: [PATCH 08/15] Add `parse_multi_doc_merge` Co-authored-by: Copilot --- compass/plugin/ordinance.py | 189 +++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 4 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 2c2ef98f..35a9a094 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -2,6 +2,7 @@ import asyncio import logging +import operator from warnings import warn from textwrap import dedent from itertools import chain @@ -715,8 +716,7 @@ async def parse_docs_for_structured_data(self, extraction_context): return await self.parse_multi_doc_concat(extraction_context) if self.DOC_SELECTION_METHOD == "multi_doc_mixed": - msg = "TODO" - raise NotImplementedError(msg) + return await self.parse_multi_doc_merge(extraction_context) msg = ( f"Invalid DOC_SELECTION_METHOD: {self.DOC_SELECTION_METHOD!r}. 
" @@ -727,8 +727,17 @@ async def parse_docs_for_structured_data(self, extraction_context): async def parse_single_doc_for_structured_data(self, extraction_context): """Parse documents one at a time to extract structured data - The first document to return some extracted data will be marked - as the source and will be returned from this method. + This mode evaluates candidate documents in sequence and stops + at the first document that produces ordinance data. Once a + usable source is found, later candidate documents are not used + to supplement, compare, or override that result. This is the + simplest selection strategy and is best suited to workflows + where one document is expected to contain the authoritative + ordinance language on its own. + + Documents are expected to come sorted by priority, with the most + likely source of ordinance language appearing first in the + `extraction_context`. Parameters ---------- @@ -772,6 +781,16 @@ async def parse_multi_doc_context_for_structured_data( ): """Parse all documents to extract structured data/information + This mode combines the relevant text from all candidate + documents into one shared extraction context before structured + data are parsed. It is useful when the information needed for a + single ordinance feature may be split across multiple sources + and should be interpreted together rather than compared as + separate document-level outputs. When source references can be + recovered from the extracted rows, each row is mapped back to + its originating document; otherwise the result falls back to + reporting the full document set as the source context. + Parameters ---------- extraction_context : ExtractionContext @@ -816,6 +835,14 @@ async def parse_multi_doc_context_for_structured_data( async def parse_multi_doc_concat(self, extraction_context): """Parse all documents and concatenate extracted data + This mode keeps all extracted ordinance rows from every + candidate document that produced structured data. 
Unlike the + merge mode, it does not try to choose a single best row for a + feature or resolve conflicts between sources. If the same + feature is extracted from multiple ordinances, each version is + preserved in the output with its own source and year so users + can compare the results directly. + Parameters ---------- extraction_context : ExtractionContext @@ -870,6 +897,79 @@ async def parse_multi_doc_concat(self, extraction_context): ) return extraction_context + async def parse_multi_doc_merge(self, extraction_context): + """Parse all documents and merge the extracted data + + This mode keeps at most one row per extracted feature across + all candidate documents. When every document with extracted + data has a known ordinance year, newer ordinances take + precedence and older ordinances are only used to fill in + features that are missing from the newer sources. If any + candidate document has an unknown year, documents are instead + prioritized by how many ordinance features they contain. + + Documents with extracted prohibitions are treated specially. + If any candidate document contains a prohibition, only + prohibition-bearing documents are considered for the final + merged output. The returned rows keep the source and year of + the document they came from so downstream consumers can still + trace each retained feature back to its originating ordinance. + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. 
+ """ + + tasks = [ + asyncio.create_task( + self.parse_for_structured_data(doc_for_extraction), + name=self.jurisdiction.full_name, + ) + for doc_for_extraction in extraction_context + ] + data_dfs = await asyncio.gather(*tasks) + + candidates = [] + for doc_ind, (data_df, doc) in enumerate( + zip(data_dfs, extraction_context, strict=True), start=1 + ): + row_count = self.get_structured_data_row_count(data_df) + if row_count == 0: + continue + + data_df["source"] = doc.attrs.get("source") + data_df["year"] = year = extract_year_from_doc_attrs(doc.attrs) + candidates.append( + { + "data_df": data_df, + "doc": doc, + "doc_ind": doc_ind, + "row_count": row_count, + "year": year, + } + ) + + if not candidates: + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None + + candidates = _filter_to_prohibition_cands_if_needed(candidates) + candidates = _prioritize_candidates(candidates) + extraction_context.attrs["structured_data"] = await _merge_candidates( + candidates, extraction_context, self.jurisdiction.full_name + ) + return extraction_context + async def parse_for_structured_data(self, source): """Extract all possible structured data from a document @@ -1262,3 +1362,84 @@ async def _fill_in_all_sources(data_df, extraction_context, out_fn_stem): ) return data_df + + +def _filter_to_prohibition_cands_if_needed(candidates): + """Filter to just candidates with prohibitions, if any""" + prohibition_candidates = [ + candidate + for candidate in candidates + if _has_prohibitions(candidate["data_df"]) + ] + return prohibition_candidates or candidates + + +def _prioritize_candidates(candidates): + """Sort candidates by year (only if all have years) and row count""" + if len(candidates) <= 1: + return candidates + + if all(candidate["year"] is not None for candidate in candidates): + return sorted( + candidates, + key=operator.itemgetter("year", "row_count"), + reverse=True, + ) + + return sorted( + candidates, + 
key=operator.itemgetter("row_count"), + reverse=True, + ) + + +async def _merge_candidates(candidates, extraction_context, out_stem): + """Merge extracted features while respecting candidate priority""" + merged_rows = [] + merged_features = set() + contributing_candidates = [] + for candidate in candidates: + candidate_rows = [] + for _, row in candidate["data_df"].iterrows(): + feature_key = _feature_key(row.get("feature")) + if feature_key is None or feature_key in merged_features: + continue + + merged_features.add(feature_key) + candidate_rows.append(row.to_dict()) + + if not candidate_rows: + continue + + merged_rows.extend(candidate_rows) + contributing_candidates.append(candidate) + + if not merged_rows: + return None + + for candidate in contributing_candidates: + await extraction_context.mark_doc_as_data_source( + candidate["doc"], + out_fn_stem=f"{out_stem}_{candidate['doc_ind']}", + ) + + return pd.DataFrame(merged_rows).reset_index(drop=True) + + +def _feature_key(feature): + """Get normalized feature key""" + if pd.isna(feature): + return None + return str(feature).strip().casefold() + + +def _has_prohibitions(data_df): + """Check for prohibition in data""" + if data_df is None or data_df.empty or "feature" not in data_df: + return False + + prohibition_mask = data_df["feature"].map(_feature_key).eq("prohibitions") + if not prohibition_mask.any(): + return False + + return num_ordinances_dataframe(data_df.loc[prohibition_mask]) > 0 From 126658b03c16f68fb4470a665be3b0ee4259a958 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:25:14 -0600 Subject: [PATCH 09/15] Add tests Co-authored-by: Copilot --- .../unit/plugin/test_plugin_ordinances.py | 259 ++++++++++++++++++ 1 file changed, 259 insertions(+) diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index 074c2daa..141e5cba 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ 
b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -1,7 +1,11 @@ """COMPASS ordinance plugin tests""" +import asyncio +from collections import UserList from pathlib import Path +from types import SimpleNamespace +import pandas as pd import pytest from compass.plugin.ordinance import ( @@ -13,6 +17,75 @@ from compass.exceptions import COMPASSPluginConfigurationError +class MergePlugin(OrdinanceExtractionPlugin): + """Concrete ordinance plugin for merge tests""" + + TEXT_COLLECTORS = [] + TEXT_EXTRACTORS = [] + PARSERS = [] + + IDENTIFIER = "test" + WEBSITE_KEYWORDS = ["test"] + QUERY_TEMPLATES = ["test"] + HEURISTIC = None + + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + + +class FakeDoc: + def __init__(self, source, year=None, structured_data=None): + self.attrs = {"source": source} + if year is not None: + self.attrs["date"] = (year, 1, 1) + if structured_data is not None: + self.attrs["structured_data"] = structured_data + + +class FakeExtractionContext(UserList): + """List-like extraction context for merge tests""" + + def __init__(self, docs): + super().__init__(docs) + self.attrs = {} + self.marked_sources = [] + + @property + def num_documents(self): + return len(self) + + async def mark_doc_as_data_source(self, doc, out_fn_stem): + self.marked_sources.append((doc.attrs.get("source"), out_fn_stem)) + + +@pytest.fixture +def merge_plugin(): + """Build a concrete plugin for merge-path tests""" + + plugin = MergePlugin(None, None, None) + plugin.jurisdiction = SimpleNamespace(full_name="Test County") + return plugin + + +def _data_df(*rows): + return pd.DataFrame(rows) + + +async def _run_multi_doc_merge(plugin, context, data_dfs): + """Run the public merge path with controlled per-doc outputs""" + + for doc, data_df in zip(context, data_dfs, strict=True): + doc.attrs["structured_data"] = data_df + + async def _fake_parse_for_structured_data(doc): + await asyncio.sleep(0) + return 
doc.attrs["structured_data"] + + plugin.parse_for_structured_data = _fake_parse_for_structured_data + out = await plugin.parse_multi_doc_merge(context) + return out.attrs["structured_data"] + + def test_plugin_validation_parse_key_same(): """Test plugin interface validation logic""" @@ -189,5 +262,191 @@ async def parse_docs_for_structured_data(self, extraction_context): MYPlugin(None, None, None).validate_plugin_configuration() +@pytest.mark.asyncio +async def test_merge_multi_doc_data_prefers_latest_year(merge_plugin): + """Latest dated doc should win overlapping features""" + + context = FakeExtractionContext( + [ + FakeDoc("older", 2021), + FakeDoc("newer", 2024), + ] + ) + data_dfs = [ + _data_df( + {"feature": "setback", "value": 100, "summary": "old"}, + {"feature": "height", "value": 80, "summary": "old"}, + ), + _data_df( + {"feature": "setback", "value": 150, "summary": "new"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + assert set(merged["feature"].str.casefold()) == {"setback", "height"} + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + height = merged.loc[merged["feature"].str.casefold() == "height"] + assert setback.iloc[0]["value"] == 150 + assert setback.iloc[0]["source"] == "newer" + assert setback.iloc[0]["year"] == 2024 + assert height.iloc[0]["value"] == 80 + assert height.iloc[0]["source"] == "older" + assert height.iloc[0]["year"] == 2021 + assert context.marked_sources == [ + ("newer", "Test County_2"), + ("older", "Test County_1"), + ] + + +@pytest.mark.asyncio +async def test_merge_multi_doc_data_falls_back_to_ordinance_count( + merge_plugin, +): + """Unknown years should fall back to ordinance count priority""" + + context = FakeExtractionContext( + [ + FakeDoc("unknown-year"), + FakeDoc("known-year", 2025), + ] + ) + data_dfs = [ + _data_df( + {"feature": "setback", "value": 100, "summary": "one"}, + {"feature": "height", "value": 50, "summary": "two"}, + ), + _data_df( + 
{"feature": "setback", "value": 200, "summary": "other"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + assert setback.iloc[0]["value"] == 100 + assert setback.iloc[0]["source"] == "unknown-year" + assert pd.isna(setback.iloc[0]["year"]) + + +@pytest.mark.asyncio +async def test_merge_multi_doc_data_breaks_year_ties_by_row_count( + merge_plugin, +): + """Equal years should break ties using ordinance count""" + + context = FakeExtractionContext( + [ + FakeDoc("fewer", 2024), + FakeDoc("more", 2024), + ] + ) + data_dfs = [ + _data_df( + {"feature": "setback", "value": 100, "summary": "one"}, + ), + _data_df( + {"feature": "setback", "value": 200, "summary": "two"}, + {"feature": "height", "value": 70, "summary": "two"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + assert setback.iloc[0]["value"] == 200 + assert setback.iloc[0]["source"] == "more" + + +@pytest.mark.asyncio +async def test_merge_multi_doc_data_limits_to_prohibition_documents( + merge_plugin, +): + """Any prohibition should limit merging to prohibition docs only""" + + context = FakeExtractionContext( + [ + FakeDoc("prohibition-older", 2022), + FakeDoc("prohibition-newer", 2024), + FakeDoc("non-prohibition", 2026), + ] + ) + data_dfs = [ + _data_df( + { + "feature": "prohibitions", + "value": None, + "summary": "older prohibition", + }, + {"feature": "height", "value": 90, "summary": "older"}, + ), + _data_df( + { + "feature": "Prohibitions", + "value": None, + "summary": "newer prohibition", + }, + {"feature": "setback", "value": 300, "summary": "newer"}, + ), + _data_df( + {"feature": "noise", "value": 45, "summary": "ignored"}, + ), + ] + + merged = await _run_multi_doc_merge(merge_plugin, context, data_dfs) + + assert set(merged["feature"].str.casefold()) == { + "prohibitions", + 
"setback", + "height", + } + assert "noise" not in set(merged["feature"].str.casefold()) + prohibition = merged.loc[ + merged["feature"].str.casefold() == "prohibitions" + ] + assert prohibition.iloc[0]["source"] == "prohibition-newer" + assert context.marked_sources == [ + ("prohibition-newer", "Test County_2"), + ("prohibition-older", "Test County_1"), + ] + + +@pytest.mark.asyncio +async def test_parse_multi_doc_merge_returns_context(merge_plugin): + """Public merge path should attach merged structured data""" + + docs = [ + FakeDoc( + "older", + 2022, + _data_df( + {"feature": "height", "value": 60, "summary": "older"}, + ), + ), + FakeDoc( + "newer", + 2024, + _data_df( + {"feature": "setback", "value": 100, "summary": "newer"}, + ), + ), + ] + context = FakeExtractionContext(docs) + + async def _fake_parse_for_structured_data(doc): + await asyncio.sleep(0) + return doc.attrs["structured_data"] + + merge_plugin.parse_for_structured_data = _fake_parse_for_structured_data + + out = await merge_plugin.parse_multi_doc_merge(context) + + assert out is context + assert set(out.attrs["structured_data"]["feature"].str.casefold()) == { + "setback", + "height", + } + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) From f43e95e61424883b3e684505e03169f1c9303a91 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:28:44 -0600 Subject: [PATCH 10/15] Add tests Co-authored-by: Copilot --- .../unit/plugin/test_plugin_ordinances.py | 233 +++++++++++++++++- 1 file changed, 232 insertions(+), 1 deletion(-) diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index 141e5cba..e7e2bd55 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -13,8 +13,21 @@ BaseTextExtractor, BaseParser, OrdinanceExtractionPlugin, + _feature_key, + _fill_in_all_sources, + _fill_out_multi_file_sources, + 
_filter_to_prohibition_cands_if_needed, + _get_source_inds, + _has_prohibitions, + _merge_candidates, + _prioritize_candidates, + _valid_chunk, + _validate_in_out_keys, +) +from compass.exceptions import ( + COMPASSPluginConfigurationError, + COMPASSRuntimeError, ) -from compass.exceptions import COMPASSPluginConfigurationError class MergePlugin(OrdinanceExtractionPlugin): @@ -448,5 +461,223 @@ async def _fake_parse_for_structured_data(doc): } +@pytest.mark.parametrize( + "chunk,expected", + [("Useful text", True), ("No relevant text.", False), ("", "")], +) +def test_valid_chunk(chunk, expected): + """Helper should reject empty and negative extraction responses""" + + assert _valid_chunk(chunk) == expected + + +def test_validate_in_out_keys_raises_for_missing_key(): + """Helper should fail when no producer satisfies a required input""" + + class Producer: + OUT_LABEL = "produced" + + class Consumer: + IN_LABEL = "missing" + + with pytest.raises( + COMPASSPluginConfigurationError, + match=r"IN_LABEL 'missing'", + ): + _validate_in_out_keys([Consumer], [Producer]) + + +def test_get_source_inds_returns_integer_indices(): + """Helper should extract integer source indices from rows""" + + data_df = _data_df( + {"feature": "setback", "source": 0}, + {"feature": "height", "source": 1}, + {"feature": "noise", "source": 1}, + ) + + source_inds = _get_source_inds(data_df, 3) + + assert list(source_inds) == [0, 1] + + +@pytest.mark.parametrize( + "data_df,num_docs,match", + [ + (_data_df({"feature": "setback"}), 2, "column not found"), + ( + _data_df({"feature": "setback", "source": "one"}), + 2, + "non-integer values", + ), + ( + _data_df({"feature": "setback", "source": 2}), + 2, + "out-of-bounds indices", + ), + ], +) +def test_get_source_inds_raises_for_invalid_source_values( + data_df, num_docs, match +): + """Helper should reject missing, invalid, and out-of-range sources""" + + with pytest.raises(COMPASSRuntimeError, match=match): + _get_source_inds(data_df, num_docs) 
+ + +@pytest.mark.asyncio +async def test_fill_out_multi_file_sources_maps_valid_source_indices(): + """Helper should map per-row source indices back to document metadata""" + + context = FakeExtractionContext( + [FakeDoc("doc-one", 2021), FakeDoc("doc-two", 2024)] + ) + data_df = _data_df( + {"feature": "setback", "source": 0}, + {"feature": "height", "source": 1}, + ) + + filled = await _fill_out_multi_file_sources(data_df, context, "County") + + assert list(filled["source"]) == ["doc-one", "doc-two"] + assert list(filled["year"]) == [2021, 2024] + assert context.marked_sources == [ + ("doc-one", "County_1"), + ("doc-two", "County_2"), + ] + + +@pytest.mark.asyncio +async def test_fill_in_all_sources_reports_full_context_when_needed(): + """Fallback helper should report all documents when row sources fail""" + + context = FakeExtractionContext( + [FakeDoc("doc-one", 2020), FakeDoc("doc-two", 2024)] + ) + data_df = _data_df({"feature": "setback", "value": 100}) + + filled = await _fill_in_all_sources(data_df, context, "County") + + assert filled.iloc[0]["source"] == "doc-one ;\ndoc-two" + assert filled.iloc[0]["year"] == 2024 + assert context.marked_sources == [ + ("doc-one", "County_1"), + ("doc-two", "County_2"), + ] + + +def test_feature_key_normalizes_values_and_handles_missing(): + """Feature-key helper should normalize strings and preserve missing""" + + assert _feature_key(" Prohibitions ") == "prohibitions" + assert _feature_key(pd.NA) is None + + +def test_has_prohibitions_requires_ordinance_content(): + """Prohibition helper should only flag rows with actual ordinance data""" + + with_prohibition = _data_df( + {"feature": "Prohibitions", "summary": "Wind is prohibited."} + ) + without_prohibition = _data_df( + {"feature": "Prohibitions", "summary": None, "value": None} + ) + + assert _has_prohibitions(with_prohibition) + assert not _has_prohibitions(without_prohibition) + + +def test_filter_to_prohibition_candidates_only_when_present(): + """Candidate 
helper should narrow to prohibition-bearing documents""" + + candidates = [ + { + "data_df": _data_df( + {"feature": "setback", "summary": "Regular standard"} + ) + }, + { + "data_df": _data_df( + { + "feature": "prohibitions", + "summary": "Wind systems are prohibited.", + } + ) + }, + ] + + filtered = _filter_to_prohibition_cands_if_needed(candidates) + + assert filtered == [candidates[1]] + + +def test_prioritize_candidates_prefers_latest_year_then_row_count(): + """Priority helper should sort by year when every candidate has one""" + + candidates = [ + {"year": 2021, "row_count": 5}, + {"year": 2024, "row_count": 1}, + {"year": 2024, "row_count": 3}, + ] + + prioritized = _prioritize_candidates(candidates) + + assert prioritized == [candidates[2], candidates[1], candidates[0]] + + +def test_prioritize_candidates_falls_back_to_row_count_without_years(): + """Priority helper should ignore year sorting when any year is unknown""" + + candidates = [ + {"year": 2024, "row_count": 1}, + {"year": None, "row_count": 3}, + {"year": 2021, "row_count": 2}, + ] + + prioritized = _prioritize_candidates(candidates) + + assert prioritized == [candidates[1], candidates[2], candidates[0]] + + +@pytest.mark.asyncio +async def test_merge_candidates_keeps_first_feature_and_marks_sources(): + """Merge helper should keep first-seen features by candidate priority""" + + context = FakeExtractionContext([FakeDoc("older"), FakeDoc("newer")]) + candidates = [ + { + "data_df": _data_df( + {"feature": "setback", "value": 200, "source": "newer"}, + {"feature": "height", "value": 80, "source": "newer"}, + ), + "doc": context[1], + "doc_ind": 2, + }, + { + "data_df": _data_df( + {"feature": "setback", "value": 100, "source": "older"}, + {"feature": "noise", "value": 45, "source": "older"}, + ), + "doc": context[0], + "doc_ind": 1, + }, + ] + + merged = await _merge_candidates(candidates, context, "County") + + assert set(merged["feature"].str.casefold()) == { + "setback", + "height", + 
"noise", + } + setback = merged.loc[merged["feature"].str.casefold() == "setback"] + assert setback.iloc[0]["value"] == 200 + assert context.marked_sources == [ + ("newer", "County_2"), + ("older", "County_1"), + ] + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) From 7d4fcd095c62ab51812df42d313998d3330c8531 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 6 May 2026 17:46:27 -0600 Subject: [PATCH 11/15] Move to enum Co-authored-by: Copilot --- compass/plugin/__init__.py | 1 + compass/plugin/one_shot/base.py | 30 +----- compass/plugin/ordinance.py | 98 +++++++++++++++---- .../unit/plugin/test_plugin_ordinances.py | 37 +++++++ 4 files changed, 122 insertions(+), 44 deletions(-) diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 5f50ee3f..42e00488 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -9,6 +9,7 @@ from .ordinance import ( BaseTextExtractor, BaseParser, + DocSelectionMethod, KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index 0ef69e47..ad6de218 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -11,6 +11,7 @@ NoOpHeuristic, NoOpTextCollector, NoOpTextExtractor, + DocSelectionMethod, PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceExtractionPlugin, @@ -176,7 +177,9 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): SCHEMA = config["schema"] """dict: Schema for the output of the text extraction step""" - DOC_SELECTION_METHOD = _doc_selection_method(config) + DOC_SELECTION_METHOD = DocSelectionMethod.normalize( + config.get("doc_selection_method", "single doc") + ) """str: Method for selecting documents for extraction context Allowed options: @@ -598,28 +601,3 @@ def _normalize_keyword_list(items): normalized.add(keyword) return list(normalized) - - -def _doc_selection_method(config): - """Parse 
and normalize the document selection method""" - allowed_methods = { - "single_doc", - "multi_doc_context", - "multi_doc_all", - "multi_doc_mixed", - } - og_doc_selection_method = config.get("doc_selection_method", "single doc") - doc_selection_method = ( - og_doc_selection_method.replace(" ", "_") - .replace("-", "_") - .strip() - .casefold() - ) - if doc_selection_method not in allowed_methods: - msg = ( - f"Invalid doc_selection_method: {og_doc_selection_method!r}. " - f"Allowed options are: {sorted(allowed_methods)}." - ) - raise COMPASSPluginConfigurationError(msg) - - return doc_selection_method diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 35a9a094..b335b4cc 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -3,6 +3,7 @@ import asyncio import logging import operator +from enum import StrEnum from warnings import warn from textwrap import dedent from itertools import chain @@ -62,6 +63,63 @@ } +class DocSelectionMethod(StrEnum): + """Document selection modes for structured extraction""" + + SINGLE_DOC = "single_doc" + """Evaluate candidate documents one at a time until data is found""" + MULTI_DOC_CONTEXT = "multi_doc_context" + """Combine multiple documents into one extraction context""" + MULTI_DOC_ALL = "multi_doc_all" + """Parse each document separately and keep all extracted rows""" + MULTI_DOC_MIXED = "multi_doc_mixed" + """Parse separately and merge rows so each feature appears once""" + + @classmethod + def normalize(cls, value): + """Normalize a config value into a selection mode + + Parameters + ---------- + value : str or DocSelectionMethod + Input selection mode from plugin configuration or an + existing enum value. + + Returns + ------- + DocSelectionMethod + Normalized document selection mode. + + Raises + ------ + COMPASSPluginConfigurationError + Raised if ``value`` is not a string or enum member, or if + it does not map to a supported selection mode. 
+ """ + if isinstance(value, cls): + return value + + if not isinstance(value, str): + msg = ( + "doc_selection_method must be a string or " + f"{cls.__name__} value." + ) + raise COMPASSPluginConfigurationError(msg) + + normalized = ( + value.replace(" ", "_").replace("-", "_").strip().casefold() + ) + try: + return cls(normalized) + except ValueError as err: + msg = ( + f"Invalid doc_selection_method: {value!r}. " + "Allowed options are: " + f"{sorted(method.value for method in cls)}." + ) + raise COMPASSPluginConfigurationError(msg) from err + + class BaseTextExtractor(BaseLLMCaller, ABC): """Extract succinct extraction text from input""" @@ -597,7 +655,7 @@ class OrdinanceExtractionPlugin(FilteredExtractionPlugin): methods as needed. """ - DOC_SELECTION_METHOD = "single doc" + DOC_SELECTION_METHOD = DocSelectionMethod.SINGLE_DOC """str: Only allow one document to be output""" @property @@ -702,27 +760,31 @@ async def parse_docs_for_structured_data(self, extraction_context): Context with extracted data/information stored in the ``.attrs`` dictionary, or ``None`` if no data was extracted. 
""" - if self.DOC_SELECTION_METHOD == "single_doc": - return await self.parse_single_doc_for_structured_data( - extraction_context - ) + match DocSelectionMethod.normalize(self.DOC_SELECTION_METHOD): + case DocSelectionMethod.SINGLE_DOC: + return await self.parse_single_doc_for_structured_data( + extraction_context + ) - if self.DOC_SELECTION_METHOD == "multi_doc_context": - return await self.parse_multi_doc_context_for_structured_data( - extraction_context - ) + case DocSelectionMethod.MULTI_DOC_CONTEXT: + return await self.parse_multi_doc_context_for_structured_data( + extraction_context + ) - if self.DOC_SELECTION_METHOD == "multi_doc_all": - return await self.parse_multi_doc_concat(extraction_context) + case DocSelectionMethod.MULTI_DOC_ALL: + return await self.parse_multi_doc_concat(extraction_context) - if self.DOC_SELECTION_METHOD == "multi_doc_mixed": - return await self.parse_multi_doc_merge(extraction_context) + case DocSelectionMethod.MULTI_DOC_MIXED: + return await self.parse_multi_doc_merge(extraction_context) - msg = ( - f"Invalid DOC_SELECTION_METHOD: {self.DOC_SELECTION_METHOD!r}. " - "Supported methods are: 'single_doc' and 'multi_doc_context'." - ) - raise COMPASSPluginConfigurationError(msg) + case _: + msg = ( + "Invalid DOC_SELECTION_METHOD: " + f"{self.DOC_SELECTION_METHOD!r}. " + "Supported methods are: " + f"{sorted(method.value for method in DocSelectionMethod)}." 
+ ) + raise COMPASSPluginConfigurationError(msg) async def parse_single_doc_for_structured_data(self, extraction_context): """Parse documents one at a time to extract structured data diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index e7e2bd55..e7a581d5 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -12,6 +12,7 @@ BaseTextCollector, BaseTextExtractor, BaseParser, + DocSelectionMethod, OrdinanceExtractionPlugin, _feature_key, _fill_in_all_sources, @@ -275,6 +276,42 @@ async def parse_docs_for_structured_data(self, extraction_context): MYPlugin(None, None, None).validate_plugin_configuration() +@pytest.mark.asyncio +async def test_parse_docs_for_structured_data_accepts_enum_value(): + """Enum-valued doc selection should dispatch correctly""" + + class MYPlugin(OrdinanceExtractionPlugin): + TEXT_COLLECTORS = [] + TEXT_EXTRACTORS = [] + PARSERS = [] + + IDENTIFIER = "test" + WEBSITE_KEYWORDS = ["test"] + QUERY_TEMPLATES = ["test"] + HEURISTIC = None + DOC_SELECTION_METHOD = DocSelectionMethod.MULTI_DOC_ALL + + async def parse_single_doc_for_structured_data( + self, extraction_context + ): + raise AssertionError("wrong dispatch") + + async def parse_multi_doc_context_for_structured_data( + self, extraction_context + ): + raise AssertionError("wrong dispatch") + + async def parse_multi_doc_concat(self, extraction_context): + return "concat" + + async def parse_multi_doc_merge(self, extraction_context): + raise AssertionError("wrong dispatch") + + plugin = MYPlugin(None, None, None) + + assert await plugin.parse_docs_for_structured_data(None) == "concat" + + @pytest.mark.asyncio async def test_merge_multi_doc_data_prefers_latest_year(merge_plugin): """Latest dated doc should win overlapping features""" From 25962c09251002bc0efaf5fce8abfe0165c6de0d Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:03:50 -0600 
Subject: [PATCH 12/15] Fix function --- compass/plugin/ordinance.py | 2 +- tests/python/unit/plugin/test_plugin_ordinances.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index b335b4cc..e9508f45 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -1298,7 +1298,7 @@ def _register_clean_file_names(self): def _valid_chunk(chunk): """True if chunk has content""" - return chunk and "no relevant text" not in chunk.lower() + return bool(chunk and "no relevant text" not in chunk.lower()) def _validate_in_out_keys(consumers, producers): diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index e7a581d5..6f0f2b03 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -500,7 +500,7 @@ async def _fake_parse_for_structured_data(doc): @pytest.mark.parametrize( "chunk,expected", - [("Useful text", True), ("No relevant text.", False), ("", "")], + [("Useful text", True), ("No relevant text.", False), ("", False)], ) def test_valid_chunk(chunk, expected): """Helper should reject empty and negative extraction responses""" From f21cafc6852a8e0776c4c616841c19fd3e3d16a6 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:03:55 -0600 Subject: [PATCH 13/15] Fix typo --- compass/plugin/one_shot/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index ad6de218..d7ca83ee 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -126,7 +126,7 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 data extraction step. - `doc_selection_method`: String defining the multi-doc selection option. 
Specifically, if multiple documents pass - the filter, this method detemines how the documents are + the filter, this method determines how the documents are submitted to the extraction context. Allowed options are: - "single doc": Use the first document that returns some From ebc472481a952676b9b34ba75fd0809a750363be Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:07:52 -0600 Subject: [PATCH 14/15] PR review update --- compass/plugin/ordinance.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index e9508f45..8e53180b 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -1461,19 +1461,21 @@ async def _merge_candidates(candidates, extraction_context, out_stem): merged_features = set() contributing_candidates = [] for candidate in candidates: - candidate_rows = [] - for _, row in candidate["data_df"].iterrows(): - feature_key = _feature_key(row.get("feature")) - if feature_key is None or feature_key in merged_features: - continue - - merged_features.add(feature_key) - candidate_rows.append(row.to_dict()) + data_df = candidate["data_df"] + if data_df is None or data_df.empty or "feature" not in data_df: + continue - if not candidate_rows: + feature_keys = data_df["feature"].map(_feature_key) + keep_mask = feature_keys.notna() + if merged_features: + keep_mask &= ~feature_keys.isin(merged_features) + keep_mask &= ~feature_keys.duplicated() + if not keep_mask.any(): continue - merged_rows.extend(candidate_rows) + selected_feature_keys = feature_keys.loc[keep_mask] + merged_features.update(selected_feature_keys.tolist()) + merged_rows.extend(data_df.loc[keep_mask].to_dict("records")) contributing_candidates.append(candidate) if not merged_rows: From 4890b00feeb6cefc152955e3cef0d7c00890ef42 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 7 May 2026 15:09:47 -0600 Subject: [PATCH 15/15] No docling in tox for now --- tox.ini | 4 +--- 1 
file changed, 1 insertion(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index cbe07c74..ff3ad93b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,7 @@ [tox] min_version = 4.26 envlist = - py{312,313,314}-cl{817}-docl{290}-lts{1}-nx{342}-nltk{391}-np{224}-oai{234}-pd{223}-pw{149}-pj{5168}-rich{1394}-toml{0102} + py{312,313,314}-cl{817}-lts{1}-nx{342}-nltk{391}-np{224}-oai{234}-pd{223}-pw{149}-pj{5168}-rich{1394}-toml{0102} [gh-actions] python = @@ -15,7 +15,6 @@ commands = pytest tests --dist loadscope {posargs} deps = cl817: click>=8.1.7,<9 c4ai063: crawl4ai>=0.6.3,<0.7 - docl290: docling>=2.90.0,<3 lts1: langchain-text-splitters>=1.0.0,<2 nx342: networkx>=3.4.2,<4 nltk391: nltk>=3.9.1,<4 @@ -38,7 +37,6 @@ description = minimum supported versions deps= click~=8.1.7 crawl4ai~=0.6.3 - docling~=2.90.0 langchain-text-splitters~=1.0.0 networkx~=3.4.2 nltk~=3.9.1