Skip to content
Merged
1 change: 1 addition & 0 deletions compass/plugin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .ordinance import (
BaseTextExtractor,
BaseParser,
DocSelectionMethod,
KeywordBasedHeuristic,
PromptBasedTextCollector,
PromptBasedTextExtractor,
Expand Down
49 changes: 41 additions & 8 deletions compass/plugin/one_shot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
NoOpHeuristic,
NoOpTextCollector,
NoOpTextExtractor,
DocSelectionMethod,
PromptBasedTextCollector,
PromptBasedTextExtractor,
OrdinanceExtractionPlugin,
Expand Down Expand Up @@ -123,11 +124,26 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901
may provide a custom system prompt if you want to provide
more specific instructions to the LLM for the structured
data extraction step.
- `allow_multi_doc_extraction`: Boolean flag indicating
whether to allow multiple documents to be used for the
extraction context simultaneously. By default, ``False``,
which means the first document that returns some extracted
data will be marked as the source.
- `doc_selection_method`: String defining the multi-doc
selection option. Specifically, if multiple documents pass
the filter, this method determines how the documents are
submitted to the extraction context. Allowed options are:

- "single doc": Use the first document that returns some
extracted data as the source document for the
extraction context.
- "multi doc context": Submit text from multiple
documents to the extraction context simultaneously.
- "multi doc all": Each document is extracted separately
and the results are concatenated. This may produce
duplicate feature results if the same feature is mentioned
in multiple documents.
- "multi doc mixed": Each document is extracted
separately and the results are merged together at the
end. In this approach, each feature is reported at
most once.

By default, ``"single doc"``.

tech : str
Technology identifier to use for the plugin (e.g., "wind",
Expand Down Expand Up @@ -161,10 +177,27 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin):
SCHEMA = config["schema"]
"""dict: Schema for the output of the text extraction step"""

ALLOW_MULTI_DOC_EXTRACTION = config.get(
"allow_multi_doc_extraction", False
DOC_SELECTION_METHOD = DocSelectionMethod.normalize(
config.get("doc_selection_method", "single doc")
)
"""bool: Whether to allow extraction over multiple documents"""
"""str: Method for selecting documents for extraction context

Allowed options:

- "single doc": Use the first document that returns some
extracted data as the source document for the extraction
context.
- "multi doc context": Submit text from multiple documents
to the extraction context simultaneously.
- "multi doc all": Each document is extracted separately
and the results are concatenated. This may produce
duplicate feature results if the same feature is mentioned
in multiple documents.
- "multi doc mixed": Each document is extracted separately
and the results are merged together at the end. In this
approach, each feature is reported at most once.

"""

IDENTIFIER = tech
"""str: Identifier for extraction task """
Expand Down
Loading
Loading