Skip to content

feat: add query rephraser options #560

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions docs/api_reference/document_search/retrieval/rephrasers.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Query Rephrasers

::: ragbits.document_search.retrieval.rephrasers.QueryRephraser
::: ragbits.document_search.retrieval.rephrasers.LLMQueryRephraser
::: ragbits.document_search.retrieval.rephrasers.MultiQueryRephraser
::: ragbits.document_search.retrieval.rephrasers.NoopQueryRephraser
::: ragbits.document_search.retrieval.rephrasers.base.QueryRephraserOptions

::: ragbits.document_search.retrieval.rephrasers.llm.LLMQueryRephraserOptions

::: ragbits.document_search.retrieval.rephrasers.base.QueryRephraser

::: ragbits.document_search.retrieval.rephrasers.llm.LLMQueryRephraser

::: ragbits.document_search.retrieval.rephrasers.noop.NoopQueryRephraser
17 changes: 11 additions & 6 deletions docs/how-to/document_search/search-documents.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ By default, the input query is provided directly to the embedding model. However
=== "Multi query"

```python
from ragbits.document_search.retrieval.rephrasers import MultiQueryRephraser
from ragbits.document_search.retrieval.rephrasers import LLMQueryRephraser, LLMQueryRephraserOptions
from ragbits.document_search import DocumentSearch

query_rephraser = MultiQueryRephraser(LiteLLM(model_name="gpt-3.5-turbo"), n=3)
query_rephraser = LLMQueryRephraser(LiteLLM(model_name="gpt-3.5-turbo"), default_options=LLMQueryRephraserOptions(n=3))
document_search = DocumentSearch(query_rephraser=query_rephraser, ...)

elements = await document_search.search("What is the capital of Poland?")
Expand All @@ -108,25 +108,28 @@ By default, the input query is provided directly to the embedding model. However
To define a new rephraser, extend the [`QueryRephraser`][ragbits.document_search.retrieval.rephrasers.base.QueryRephraser] class.

```python
from ragbits.document_search.retrieval.rephrasers import QueryRephraser
from ragbits.document_search.retrieval.rephrasers import QueryRephraser, QueryRephraserOptions


class CustomRephraser(QueryRephraser):
class CustomRephraser(QueryRephraser[QueryRephraserOptions]):
"""
Rephraser that uses an LLM to rephrase queries.
"""

async def rephrase(self, query: str) -> list[str]:
options_cls: type[QueryRephraserOptions] = QueryRephraserOptions

async def rephrase(self, query: str, options: QueryRephraserOptions | None = None) -> Iterable[str]:
"""
Rephrase a query using the LLM.

Args:
query: The query to be rephrased.
options: The options for rephrasing.

Returns:
List containing the rephrased query.
"""
responses = await llm.generate(QueryRephraserPrompt(...))
responses = await llm.generate(CustomRephraserPrompt(...))
...
return [...]
```
Expand Down Expand Up @@ -175,6 +178,8 @@ class CustomReranker(Reranker[RerankerOptions]):
Reranker that uses an LLM to rerank elements.
"""

options_cls: type[RerankerOptions] = RerankerOptions

async def rerank(
self,
elements: Sequence[Sequence[Element]],
Expand Down
13 changes: 9 additions & 4 deletions examples/document-search/configurable.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class to rephrase the query.

import asyncio

from ragbits.core.audit import set_trace_handlers
from ragbits.core.audit.traces import set_trace_handlers
from ragbits.document_search import DocumentSearch
from ragbits.document_search.documents.document import DocumentMeta

Expand Down Expand Up @@ -86,13 +86,12 @@ class to rephrase the query.
"model": "cohere/rerank-english-v3.0",
"default_options": {
"top_n": 3,
"max_chunks_per_doc": None,
},
},
},
"parser_router": {"txt": {"type": "TextDocumentParser"}},
"rephraser": {
"type": "LLMQueryRephraser",
"type": "ragbits.document_search.retrieval.rephrasers:LLMQueryRephraser",
"config": {
"llm": {
"type": "ragbits.core.llms.litellm:LiteLLM",
Expand All @@ -101,7 +100,13 @@ class to rephrase the query.
},
},
"prompt": {
"type": "QueryRephraserPrompt",
"type": "ragbits.document_search.retrieval.rephrasers:LLMQueryRephraserPrompt",
},
"default_options": {
"n": 2,
"llm_options": {
"temperature": 0.0,
},
},
},
},
Expand Down
1 change: 1 addition & 0 deletions packages/ragbits-document-search/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Unreleased

- Add query rephraser options (#560)
- Rename DocumentMeta create_text_document_from_literal to from_literal (#561)
- Update audit imports (#427)
- BREAKING CHANGE: Adjust document search configurable interface (#554)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,23 @@
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
from ragbits.document_search.ingestion.strategies import IngestStrategy, SequentialIngestStrategy
from ragbits.document_search.ingestion.strategies.base import IngestExecutionError, IngestExecutionResult
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptionsT
from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptionsT
from ragbits.document_search.retrieval.rerankers.noop import NoopReranker


class DocumentSearchOptions(Options, Generic[VectorStoreOptionsT, RerankerOptionsT]):
class DocumentSearchOptions(Options, Generic[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]):
"""
Object representing the options for the document search.

Attributes:
query_rephraser_options: The options for the query rephraser.
vector_store_options: The options for the vector store.
reranker_options: The options for the reranker.
"""

query_rephraser_options: QueryRephraserOptionsT | None | NotGiven = NOT_GIVEN
vector_store_options: VectorStoreOptionsT | None | NotGiven = NOT_GIVEN
reranker_options: RerankerOptionsT | None | NotGiven = NOT_GIVEN

Expand All @@ -57,7 +59,9 @@ class DocumentSearchConfig(BaseModel):
enricher_router: dict[str, ObjectConstructionConfig] = {}


class DocumentSearch(ConfigurableComponent[DocumentSearchOptions[VectorStoreOptionsT, RerankerOptionsT]]):
class DocumentSearch(
ConfigurableComponent[DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]]
):
"""
Main entrypoint to the document search functionality. It provides methods for document retrieval and ingestion.

Expand All @@ -80,9 +84,14 @@ def __init__(
self,
vector_store: VectorStore[VectorStoreOptionsT],
*,
query_rephraser: QueryRephraser | None = None,
query_rephraser: QueryRephraser[QueryRephraserOptionsT] | None = None,
reranker: Reranker[RerankerOptionsT] | None = None,
default_options: DocumentSearchOptions[VectorStoreOptionsT, RerankerOptionsT] | None = None,
default_options: DocumentSearchOptions[
QueryRephraserOptionsT,
VectorStoreOptionsT,
RerankerOptionsT,
]
| None = None,
ingest_strategy: IngestStrategy | None = None,
parser_router: DocumentParserRouter | None = None,
enricher_router: ElementEnricherRouter | None = None,
Expand Down Expand Up @@ -124,9 +133,9 @@ def from_config(cls, config: dict) -> Self:
"""
model = DocumentSearchConfig.model_validate(config)

query_rephraser = QueryRephraser.subclass_from_config(model.rephraser)
reranker: Reranker = Reranker.subclass_from_config(model.reranker)
query_rephraser: QueryRephraser = QueryRephraser.subclass_from_config(model.rephraser)
vector_store: VectorStore = VectorStore.subclass_from_config(model.vector_store)
reranker: Reranker = Reranker.subclass_from_config(model.reranker)

ingest_strategy = IngestStrategy.subclass_from_config(model.ingest_strategy)
parser_router = DocumentParserRouter.from_config(model.parser_router)
Expand Down Expand Up @@ -192,7 +201,7 @@ def preferred_subclass(
async def search(
self,
query: str,
options: DocumentSearchOptions[VectorStoreOptionsT, RerankerOptionsT] | None = None,
options: DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] | None = None,
) -> Sequence[Element]:
"""
Search for the most relevant chunks for a query.
Expand All @@ -205,11 +214,12 @@ async def search(
A list of chunks.
"""
merged_options = (self.default_options | options) if options else self.default_options
query_rephraser_options = merged_options.query_rephraser_options or None
vector_store_options = merged_options.vector_store_options or None
reranker_options = merged_options.reranker_options or None

with trace(query=query, options=merged_options) as outputs:
queries = await self.query_rephraser.rephrase(query)
queries = await self.query_rephraser.rephrase(query, query_rephraser_options)
elements = [
[
Element.from_vector_db_entry(result.entry, result.score)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser
from ragbits.document_search.retrieval.rephrasers.llm import LLMQueryRephraser
from ragbits.document_search.retrieval.rephrasers.multi import MultiQueryRephraser
from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser
from ragbits.document_search.retrieval.rephrasers.prompts import (
MultiQueryRephraserInput,
MultiQueryRephraserPrompt,
QueryRephraserInput,
QueryRephraserPrompt,
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions
from ragbits.document_search.retrieval.rephrasers.llm import (
LLMQueryRephraser,
LLMQueryRephraserOptions,
LLMQueryRephraserPrompt,
LLMQueryRephraserPromptInput,
)
from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser

__all__ = [
"LLMQueryRephraser",
"MultiQueryRephraser",
"MultiQueryRephraserInput",
"MultiQueryRephraserPrompt",
"LLMQueryRephraserOptions",
"LLMQueryRephraserPrompt",
"LLMQueryRephraserPromptInput",
"NoopQueryRephraser",
"QueryRephraser",
"QueryRephraserInput",
"QueryRephraserPrompt",
"QueryRephraserOptions",
]
Original file line number Diff line number Diff line change
@@ -1,25 +1,38 @@
from abc import ABC, abstractmethod
from typing import ClassVar
from collections.abc import Iterable
from typing import ClassVar, TypeVar

from ragbits.core.utils.config_handling import WithConstructionConfig
from ragbits.core.options import Options
from ragbits.core.utils.config_handling import ConfigurableComponent
from ragbits.document_search.retrieval import rephrasers


class QueryRephraser(WithConstructionConfig, ABC):
class QueryRephraserOptions(Options):
"""
Object representing the options for the rephraser.
"""


QueryRephraserOptionsT = TypeVar("QueryRephraserOptionsT", bound=QueryRephraserOptions)


class QueryRephraser(ConfigurableComponent[QueryRephraserOptionsT], ABC):
"""
Rephrases a query. Can provide multiple rephrased queries from one sentence / question.
"""

options_cls: type[QueryRephraserOptionsT]
default_module: ClassVar = rephrasers
configuration_key: ClassVar = "rephraser"

@abstractmethod
async def rephrase(self, query: str) -> list[str]:
async def rephrase(self, query: str, options: QueryRephraserOptionsT | None = None) -> Iterable[str]:
"""
Rephrase a query.

Args:
query: The query to rephrase.
options: The options for the rephraser.

Returns:
The rephrased queries.
Expand Down
Loading
Loading