diff --git a/Makefile b/Makefile index 5896e734..d9f978e7 100644 --- a/Makefile +++ b/Makefile @@ -75,10 +75,10 @@ test-opensearch: .PHONY: test test: - -$(run_es) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh elasticsearch:9200 && cd stac_fastapi/tests/ && pytest --cov=stac_fastapi --cov-report=term-missing' + -$(run_es) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh elasticsearch:9200 && cd stac_fastapi/tests/ && pytest -s --cov=stac_fastapi --cov-report=term-missing' docker compose down - -$(run_os) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest --cov=stac_fastapi --cov-report=term-missing' + -$(run_os) /bin/bash -c 'export && ./scripts/wait-for-it-es.sh opensearch:9202 && cd stac_fastapi/tests/ && pytest -s --cov=stac_fastapi --cov-report=term-missing' docker compose down .PHONY: run-database-es diff --git a/README.md b/README.md index 11619f86..71920ec2 100644 --- a/README.md +++ b/README.md @@ -201,29 +201,32 @@ There are two main ways to run the API locally: You can customize additional settings in your `.env` file: -| Variable | Description | Default | Required | -|------------------------------|--------------------------------------------------------------------------------------|--------------------------|---------------------------------------------------------------------------------------------| -| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | -| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS)| Optional | -| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `false` | Optional | -| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `false` | Optional | -| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | -| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | -| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional | -| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID | `stac-fastapi` | Optional | -| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional | -| `APP_PORT` | Server port. | `8080` | Optional | -| `ENVIRONMENT` | Runtime environment. | `local` | Optional | -| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional | -| `RELOAD` | Enable auto-reload for development. | `true` | Optional | -| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional | -| `BACKEND` | Tests-related variable | `elasticsearch` or `opensearch` based on the backend | Optional | -| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional | | -| `OPENSEARCH_VERSION` | OpenSearch version | `2.11.1` | Optional -| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation) | `false` | Optional -| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` Optional | -| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. 
If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. | `false` | Optional | -| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional | +| Variable | Description | Default | Required | +|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|---------------------------------------------------------------------------------------------| +| `ES_HOST` | Hostname for external Elasticsearch/OpenSearch. | `localhost` | Optional | +| `ES_PORT` | Port for Elasticsearch/OpenSearch. | `9200` (ES) / `9202` (OS) | Optional | +| `ES_USE_SSL` | Use SSL for connecting to Elasticsearch/OpenSearch. | `false` | Optional | +| `ES_VERIFY_CERTS` | Verify SSL certificates when connecting. | `false` | Optional | +| `STAC_FASTAPI_TITLE` | Title of the API in the documentation. | `stac-fastapi-` | Optional | +| `STAC_FASTAPI_DESCRIPTION` | Description of the API in the documentation. | N/A | Optional | +| `STAC_FASTAPI_VERSION` | API version. | `2.1` | Optional | +| `STAC_FASTAPI_LANDING_PAGE_ID` | Landing page ID. | `stac-fastapi` | Optional | +| `APP_HOST` | Server bind address. | `0.0.0.0` | Optional | +| `APP_PORT` | Server port. | `8080` | Optional | +| `ENVIRONMENT` | Runtime environment. | `local` | Optional | +| `WEB_CONCURRENCY` | Number of worker processes. | `10` | Optional | +| `RELOAD` | Enable auto-reload for development. | `true` | Optional | +| `STAC_FASTAPI_RATE_LIMIT` | API rate limit per client. | `200/minute` | Optional | +| `BACKEND` | Backend used by the test suite. | `elasticsearch` or `opensearch` based on the backend | Optional | +| `ELASTICSEARCH_VERSION` | Version of Elasticsearch to use. | `8.11.0` | Optional | +| `OPENSEARCH_VERSION` | Version of OpenSearch to use. | `2.11.1` | Optional | +| `ENABLE_DIRECT_RESPONSE` | Enable direct response for maximum performance (disables all FastAPI dependencies, including authentication, custom status codes, and validation). | `false` | Optional | +| `RAISE_ON_BULK_ERROR` | Controls whether bulk insert operations raise exceptions on errors. If set to `true`, the operation will stop and raise an exception when an error occurs. If set to `false`, errors will be logged, and the operation will continue. **Note:** STAC Item and ItemCollection validation errors will always raise, regardless of this flag. | `false` | Optional | +| `DATABASE_REFRESH` | Controls whether database operations refresh the index immediately after changes. If set to `true`, changes will be immediately searchable. If set to `false`, changes may not be immediately visible but can improve performance for bulk operations. If set to `wait_for`, changes will wait for the next refresh cycle to become visible. 
| `false` | Optional | +| `ENABLE_TRANSACTIONS_EXTENSIONS` | Enables or disables the Transactions and Bulk Transactions API extensions. If set to `false`, the POST `/collections` route and related transaction endpoints (including bulk transaction operations) will be unavailable in the API. This is useful for deployments where mutating the catalog via the API should be prevented. | `true` | Optional | +| `ENABLE_DATETIME_INDEX_FILTERING` | Enable datetime-based index selection using collection IDs. Requires indexes in format: STAC_ITEMS_INDEX_PREFIX_collection-id_start_year-start_month-start_day-end_year-end_month-end_day, e.g. items_sentinel-2-l2a_2025-06-06-2025-09-22. | `false` | Optional | +| `DATETIME_INDEX_MAX_SIZE_GB` | Maximum size limit in GB for datetime-based indexes. When an index exceeds this size, a new time-partitioned index will be created. Only applies when `ENABLE_DATETIME_INDEX_FILTERING` is enabled. | `20` | Optional | + > [!NOTE] > The variables `ES_HOST`, `ES_PORT`, `ES_USE_SSL`, and `ES_VERIFY_CERTS` apply to both Elasticsearch and OpenSearch backends, so there is no need to rename the key names to `OS_` even if you're using OpenSearch. diff --git a/compose.yml b/compose.yml index 240934d6..76693fbf 100644 --- a/compose.yml +++ b/compose.yml @@ -21,6 +21,7 @@ services: - ES_USE_SSL=false - ES_VERIFY_CERTS=false - BACKEND=elasticsearch + - ENABLE_DATETIME_INDEX_FILTERING=true ports: - "8080:8080" volumes: @@ -55,6 +56,7 @@ services: - ES_VERIFY_CERTS=false - BACKEND=opensearch - STAC_FASTAPI_RATE_LIMIT=200/minute + - ENABLE_DATETIME_INDEX_FILTERING=true ports: - "8082:8082" volumes: @@ -69,9 +71,11 @@ services: elasticsearch: container_name: es-container image: docker.elastic.co/elasticsearch/elasticsearch:${ELASTICSEARCH_VERSION:-8.11.0} + platform: linux/amd64 hostname: elasticsearch environment: ES_JAVA_OPTS: -Xms512m -Xmx1g + action.destructive_requires_name: false volumes: - ./elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml - ./elasticsearch/snapshots:/usr/share/elasticsearch/snapshots @@ -81,6 +85,7 @@ services: opensearch: container_name: os-container image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.11.1} + platform: linux/amd64 hostname: opensearch environment: - discovery.type=single-node diff --git a/stac_fastapi/core/stac_fastapi/core/core.py b/stac_fastapi/core/stac_fastapi/core/core.py index 866b429a..ee0c5fe1 100644 --- a/stac_fastapi/core/stac_fastapi/core/core.py +++ b/stac_fastapi/core/stac_fastapi/core/core.py @@ -31,6 +31,7 @@ BulkTransactionMethod, Items, ) +from stac_fastapi.sfeos_helpers.database import return_date from stac_fastapi.types import stac as stac_types from stac_fastapi.types.conformance import BASE_CONFORMANCE_CLASSES from stac_fastapi.types.core import AsyncBaseCoreClient, AsyncBaseTransactionsClient @@ -315,9 +316,10 @@ async def item_collection( search=search, collection_ids=[collection_id] ) + datetime_search = return_date(datetime) if datetime: search = self.database.apply_datetime_filter( - search=search, interval=datetime + search=search, datetime_search=datetime_search ) if bbox: @@ -333,6 +335,7 @@ async def item_collection( sort=None, token=token, collection_ids=[collection_id], + datetime_search=datetime_search, ) items = [ @@ -491,9 +494,10 @@ async def post_search( search=search, collection_ids=search_request.collections ) + datetime_search = return_date(search_request.datetime) if search_request.datetime: search = self.database.apply_datetime_filter( - search=search, 
interval=search_request.datetime + search=search, datetime_search=datetime_search ) if search_request.bbox: @@ -551,6 +555,7 @@ async def post_search( token=search_request.token, sort=sort, collection_ids=search_request.collections, + datetime_search=datetime_search, ) fields = ( diff --git a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py index d529ce01..542438db 100644 --- a/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py +++ b/stac_fastapi/elasticsearch/stac_fastapi/elasticsearch/database_logic.py @@ -22,6 +22,11 @@ ) from stac_fastapi.sfeos_helpers import filter from stac_fastapi.sfeos_helpers.database import ( + AsyncIndexInserter, + IndexInsertionFactory, + IndexSelectionStrategy, + IndexSelectorFactory, + SyncIndexInserter, apply_free_text_filter_shared, apply_intersects_filter_shared, create_index_templates_shared, @@ -30,7 +35,6 @@ index_alias_by_collection_id, index_by_collection_id, indices, - mk_actions, mk_item_id, populate_sort_shared, return_date, @@ -45,7 +49,6 @@ Geometry, ) from stac_fastapi.types.errors import ConflictError, NotFoundError -from stac_fastapi.types.rfc3339 import DateTimeType from stac_fastapi.types.stac import Collection, Item logger = logging.getLogger(__name__) @@ -123,6 +126,10 @@ class DatabaseLogic(BaseDatabaseLogic): sync_settings: SyncElasticsearchSettings = attr.ib( factory=SyncElasticsearchSettings ) + async_index_selector: IndexSelectionStrategy = attr.ib(init=False) + sync_index_selector: IndexSelectionStrategy = attr.ib(init=False) + async_index_inserter: AsyncIndexInserter = attr.ib(init=False) + sync_index_inserter: SyncIndexInserter = attr.ib(init=False) client = attr.ib(init=False) sync_client = attr.ib(init=False) @@ -131,6 +138,14 @@ def __attrs_post_init__(self): """Initialize clients after the class is instantiated.""" self.client = self.async_settings.create_client self.sync_client = self.sync_settings.create_client + self.async_index_inserter = IndexInsertionFactory.create_insertion_strategy( + self.client + ) + self.sync_index_inserter = IndexInsertionFactory.create_sync_insertion_strategy( + self.sync_client + ) + self.async_index_selector = IndexSelectorFactory.create_async_selector(self.client) + self.sync_index_selector = IndexSelectorFactory.create_sync_selector(self.sync_client) item_serializer: Type[ItemSerializer] = attr.ib(default=ItemSerializer) collection_serializer: Type[CollectionSerializer] = attr.ib( @@ -244,19 +259,18 @@ def apply_collections_filter(search: Search, collection_ids: List[str]): @staticmethod def apply_datetime_filter( - search: Search, interval: Optional[Union[DateTimeType, str]] + search: Search, datetime_search: Dict[str, Optional[str]] ): """Apply a filter to search on datetime, start_datetime, and end_datetime fields. Args: search (Search): The search object to filter. - interval: Optional[Union[DateTimeType, str]] + datetime_search: Dict[str, Optional[str]] Returns: Search: The filtered search object. 
""" should = [] - datetime_search = return_date(interval) # If the request is a single datetime return # items with datetimes equal to the requested datetime OR @@ -501,6 +515,7 @@ async def execute_search( token: Optional[str], sort: Optional[Dict[str, Dict[str, str]]], collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], ignore_unavailable: bool = True, ) -> Tuple[Iterable[Dict[str, Any]], Optional[int], Optional[str]]: """Execute a search query with limit and other optional parameters. @@ -511,6 +526,7 @@ async def execute_search( token (Optional[str]): The token used to return the next set of results. sort (Optional[Dict[str, Dict[str, str]]]): Specifies how the results should be sorted. collection_ids (Optional[List[str]]): The collection ids to search. + datetime_search (Dict[str, Optional[str]]): Datetime range used for index selection. ignore_unavailable (bool, optional): Whether to ignore unavailable collections. Defaults to True. Returns: @@ -531,7 +547,9 @@ async def execute_search( query = search.query.to_dict() if search.query else None - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) max_result_window = MAX_LIMIT @@ -595,6 +613,7 @@ async def aggregate( geometry_geohash_grid_precision: int, geometry_geotile_grid_precision: int, datetime_frequency_interval: str, + datetime_search, ignore_unavailable: Optional[bool] = True, ): """Return aggregations of STAC Items.""" @@ -630,7 +649,10 @@ def _fill_aggregation_parameters(name: str, agg: dict) -> dict: if k in aggregations } - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) + search_task = asyncio.create_task( self.client.search( index=index_param, @@ -828,9 +850,12 @@ async def create_item( item=item, base_url=base_url, exist_ok=exist_ok ) + target_index = await self.async_index_inserter.get_target_index( + collection_id, item + ) # Index the item in the database await self.client.index( - index=index_alias_by_collection_id(collection_id), + index=target_index, id=mk_item_id(item_id, collection_id), document=item, refresh=refresh, @@ -866,10 +891,16 @@ async def delete_item(self, item_id: str, collection_id: str, **kwargs: Any): try: # Perform the delete operation - await self.client.delete( + await self.client.delete_by_query( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), - refresh=refresh, + body={ + "query": { + "term": { + "_id": mk_item_id(item_id, collection_id) + } + } + }, + refresh=refresh ) except ESNotFoundError: # Raise a custom NotFoundError if the item does not exist @@ -937,8 +968,10 @@ async def create_collection(self, collection: Collection, **kwargs: Any): refresh=refresh, ) - # Create the item index for the collection - await create_item_index(collection_id) + if self.async_index_inserter.should_create_collection_index(): + await self.async_index_inserter.create_simple_index( + self.client, collection_id + ) async def find_collection(self, collection_id: str) -> Collection: """Find and return a collection from the database. 
@@ -1136,9 +1169,12 @@ async def bulk_async( # Perform the bulk insert raise_on_error = self.async_settings.raise_on_bulk_error + actions = await self.async_index_inserter.prepare_bulk_actions( + collection_id, processed_items + ) success, errors = await helpers.async_bulk( self.client, - mk_actions(collection_id, processed_items), + actions, refresh=refresh, raise_on_error=raise_on_error, ) @@ -1202,9 +1238,12 @@ def bulk_sync( # Perform the bulk insert raise_on_error = self.sync_settings.raise_on_bulk_error + actions = self.sync_index_inserter.prepare_bulk_actions( + collection_id, processed_items + ) success, errors = helpers.bulk( self.sync_client, - mk_actions(collection_id, processed_items), + actions, refresh=refresh, raise_on_error=raise_on_error, ) diff --git a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py index f93311f9..b60bf42a 100644 --- a/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py +++ b/stac_fastapi/opensearch/stac_fastapi/opensearch/database_logic.py @@ -5,7 +5,7 @@ import logging from base64 import urlsafe_b64decode, urlsafe_b64encode from copy import deepcopy -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type import attr from opensearchpy import exceptions, helpers @@ -22,6 +22,11 @@ from stac_fastapi.opensearch.config import OpensearchSettings as SyncSearchSettings from stac_fastapi.sfeos_helpers import filter from stac_fastapi.sfeos_helpers.database import ( + AsyncIndexInserter, + IndexInsertionFactory, + IndexSelectionStrategy, + IndexSelectorFactory, + SyncIndexInserter, apply_free_text_filter_shared, apply_intersects_filter_shared, create_index_templates_shared, @@ -30,7 +35,6 @@ index_alias_by_collection_id, index_by_collection_id, indices, - mk_actions, mk_item_id, populate_sort_shared, return_date, @@ -48,7 +52,6 @@ Geometry, ) from stac_fastapi.types.errors import ConflictError, NotFoundError -from stac_fastapi.types.rfc3339 import DateTimeType from stac_fastapi.types.stac import Collection, Item logger = logging.getLogger(__name__) @@ -89,33 +92,6 @@ async def create_collection_index() -> None: await client.close() -async def create_item_index(collection_id: str) -> None: - """ - Create the index for Items. The settings of the index template will be used implicitly. - - Args: - collection_id (str): Collection identifier. - - Returns: - None - - """ - client = AsyncSearchSettings().create_client - - index_name = f"{index_by_collection_id(collection_id)}-000001" - exists = await client.indices.exists(index=index_name) - if not exists: - await client.indices.create( - index=index_name, - body={ - "aliases": {index_alias_by_collection_id(collection_id): {}}, - "mappings": ES_ITEMS_MAPPINGS, - "settings": ES_ITEMS_SETTINGS, - }, - ) - await client.close() - - async def delete_item_index(collection_id: str) -> None: """Delete the index for items in a collection. 
@@ -136,7 +112,9 @@ class DatabaseLogic(BaseDatabaseLogic): async_settings: AsyncSearchSettings = attr.ib(factory=AsyncSearchSettings) sync_settings: SyncSearchSettings = attr.ib(factory=SyncSearchSettings) - + async_index_inserter: AsyncIndexInserter = attr.ib(init=False) + async_index_selector: IndexSelectionStrategy = attr.ib(init=False) + sync_index_selector: IndexSelectionStrategy = attr.ib(init=False) client = attr.ib(init=False) sync_client = attr.ib(init=False) @@ -144,6 +122,14 @@ def __attrs_post_init__(self): """Initialize clients after the class is instantiated.""" self.client = self.async_settings.create_client self.sync_client = self.sync_settings.create_client + self.async_index_inserter = IndexInsertionFactory.create_insertion_strategy( + self.client + ) + self.sync_index_inserter = IndexInsertionFactory.create_sync_insertion_strategy( + self.sync_client + ) + self.async_index_selector = IndexSelectorFactory.create_async_selector(self.client) + self.sync_index_selector = IndexSelectorFactory.create_sync_selector(self.sync_client) item_serializer: Type[ItemSerializer] = attr.ib(default=ItemSerializer) collection_serializer: Type[CollectionSerializer] = attr.ib( @@ -281,19 +267,18 @@ def apply_free_text_filter(search: Search, free_text_queries: Optional[List[str] @staticmethod def apply_datetime_filter( - search: Search, interval: Optional[Union[DateTimeType, str]] + search: Search, datetime_search: Dict[str, Optional[str]] ): """Apply a filter to search based on datetime field, start_datetime, and end_datetime fields. Args: search (Search): The search object to filter. - interval: Optional[Union[DateTimeType, str]] + datetime_search: Dict[str, Optional[str]] Returns: Search: The filtered search object. """ should = [] - datetime_search = return_date(interval) # If the request is a single datetime return # items with datetimes equal to the requested datetime OR @@ -520,6 +505,7 @@ async def execute_search( token: Optional[str], sort: Optional[Dict[str, Dict[str, str]]], collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], ignore_unavailable: bool = True, ) -> Tuple[Iterable[Dict[str, Any]], Optional[int], Optional[str]]: """Execute a search query with limit and other optional parameters. @@ -530,6 +516,7 @@ async def execute_search( token (Optional[str]): The token used to return the next set of results. sort (Optional[Dict[str, Dict[str, str]]]): Specifies how the results should be sorted. collection_ids (Optional[List[str]]): The collection ids to search. + datetime_search (Dict[str, Optional[str]]): Datetime range used for index selection. ignore_unavailable (bool, optional): Whether to ignore unavailable collections. Defaults to True. 
Returns: @@ -557,7 +544,9 @@ async def execute_search( search_body["sort"] = sort if sort else DEFAULT_SORT - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) max_result_window = MAX_LIMIT @@ -619,6 +608,7 @@ async def aggregate( geometry_geohash_grid_precision: int, geometry_geotile_grid_precision: int, datetime_frequency_interval: str, + datetime_search, ignore_unavailable: Optional[bool] = True, ): """Return aggregations of STAC Items.""" @@ -652,7 +642,10 @@ def _fill_aggregation_parameters(name: str, agg: dict) -> dict: if k in aggregations } - index_param = indices(collection_ids) + index_param = await self.async_index_selector.select_indexes( + collection_ids, datetime_search + ) + search_task = asyncio.create_task( self.client.search( index=index_param, @@ -845,8 +838,13 @@ async def create_item( item = await self.async_prep_create_item( item=item, base_url=base_url, exist_ok=exist_ok ) + + target_index = await self.async_index_inserter.get_target_index( + collection_id, item + ) + await self.client.index( - index=index_alias_by_collection_id(collection_id), + index=target_index, id=mk_item_id(item_id, collection_id), body=item, refresh=refresh, @@ -876,10 +874,16 @@ async def delete_item(self, item_id: str, collection_id: str, **kwargs: Any): ) try: - await self.client.delete( + await self.client.delete_by_query( index=index_alias_by_collection_id(collection_id), - id=mk_item_id(item_id, collection_id), - refresh=refresh, + body={ + "query": { + "term": { + "_id": mk_item_id(item_id, collection_id) + } + } + }, + refresh=refresh ) except exceptions.NotFoundError: raise NotFoundError( @@ -938,8 +942,10 @@ async def create_collection(self, collection: Collection, **kwargs: Any): body=collection, refresh=refresh, ) - - await create_item_index(collection_id) + if self.async_index_inserter.should_create_collection_index(): + await self.async_index_inserter.create_simple_index( + self.client, collection_id + ) async def find_collection(self, collection_id: str) -> Collection: """Find and return a collection from the database. 
@@ -1112,9 +1118,13 @@ async def bulk_async( return 0, [] raise_on_error = self.async_settings.raise_on_bulk_error + actions = await self.async_index_inserter.prepare_bulk_actions( + collection_id, processed_items + ) + success, errors = await helpers.async_bulk( self.client, - mk_actions(collection_id, processed_items), + actions, refresh=refresh, raise_on_error=raise_on_error, ) @@ -1175,9 +1185,13 @@ def bulk_sync( return 0, [] raise_on_error = self.sync_settings.raise_on_bulk_error + actions = self.sync_index_inserter.prepare_bulk_actions( + collection_id, processed_items + ) + success, errors = helpers.bulk( self.sync_client, - mk_actions(collection_id, processed_items), + actions, refresh=refresh, raise_on_error=raise_on_error, ) diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py index 1f335245..641c81f1 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/aggregation/client.py @@ -21,6 +21,7 @@ Aggregation, AggregationCollection, ) +from stac_fastapi.sfeos_helpers.database import return_date from stac_fastapi.types.rfc3339 import DateTimeType from .format import frequency_agg, metric_agg @@ -312,9 +313,10 @@ async def aggregate( search=search, item_ids=aggregate_request.ids ) + datetime_search = return_date(aggregate_request.datetime) if aggregate_request.datetime: search = self.database.apply_datetime_filter( - search=search, interval=aggregate_request.datetime + search=search, datetime_search=datetime_search ) if aggregate_request.bbox: @@ -414,6 +416,7 @@ async def aggregate( geometry_geohash_grid_precision, geometry_geotile_grid_precision, datetime_frequency_interval, + datetime_search, ) except Exception as error: if not isinstance(error, IndexError): diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py index 31bf28d8..4d1904e7 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/__init__.py @@ -30,15 +30,36 @@ """ # Re-export all functions for backward compatibility -from .datetime import return_date -from .document import mk_actions, mk_item_id +from .datetime import extract_date, return_date, extract_first_date_from_index +from .document import mk_item_id from .index import ( create_index_templates_shared, delete_item_index_shared, + filter_indexes_by_datetime, index_alias_by_collection_id, index_by_collection_id, indices, ) +from .index_insertion_strategies import ( + AsyncIndexInserter, + AsyncSimpleIndexInsertion, + BaseIndexInserter, + ElasticsearchAdapter, + IndexInsertionFactory, + OpenSearchAdapter, + SearchEngineAdapter, + SearchEngineAdapterFactory, + SearchEngineType, + SyncIndexInserter, + SyncSimpleIndexInsertion, +) +from .index_selection_strategies import ( + AsyncDatetimeBasedIndexSelector, + IndexSelectionStrategy, + IndexSelectorFactory, + SyncDatetimeBasedIndexSelector, + UnfilteredIndexSelector, +) from .mapping import get_queryables_mapping_shared from .query import ( apply_free_text_filter_shared, @@ -53,7 +74,26 @@ "delete_item_index_shared", "index_alias_by_collection_id", "index_by_collection_id", + "filter_indexes_by_datetime", "indices", + # Index selection strategies + "IndexSelectionStrategy", + "IndexSelectorFactory", + 
"UnfilteredIndexSelector", + "SyncDatetimeBasedIndexSelector", + "AsyncDatetimeBasedIndexSelector", + # Index insertion strategies + "AsyncIndexInserter", + "AsyncSimpleIndexInsertion", + "BaseIndexInserter", + "ElasticsearchAdapter", + "IndexInsertionFactory", + "OpenSearchAdapter", + "SearchEngineAdapter", + "SearchEngineAdapterFactory", + "SearchEngineType", + "SyncIndexInserter", + "SyncSimpleIndexInsertion", # Query operations "apply_free_text_filter_shared", "apply_intersects_filter_shared", @@ -62,10 +102,11 @@ "get_queryables_mapping_shared", # Document operations "mk_item_id", - "mk_actions", # Utility functions "validate_refresh", "get_bool_env", # Datetime utilities "return_date", -] + "extract_date", + "extract_first_date_from_index", +] \ No newline at end of file diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py index 352ed4b5..bed1d77f 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/datetime.py @@ -3,7 +3,8 @@ This module provides datetime utility functions specifically designed for Elasticsearch and OpenSearch query formatting. """ - +import re +from datetime import date from datetime import datetime as datetime_type from typing import Dict, Optional, Union @@ -39,8 +40,10 @@ def return_date( if isinstance(interval, str): if "/" in interval: parts = interval.split("/") - result["gte"] = parts[0] if parts[0] != ".." else None - result["lte"] = parts[1] if len(parts) > 1 and parts[1] != ".." else None + result["gte"] = parts[0] if parts[0] != ".." else datetime_type.min + result["lte"] = ( + parts[1] if len(parts) > 1 and parts[1] != ".." else datetime_type.max + ) else: converted_time = interval if interval != ".." else None result["gte"] = result["lte"] = converted_time @@ -58,3 +61,34 @@ def return_date( result["lte"] = end.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" return result + + +def extract_date(date_str: str) -> date: + """Extract date from ISO format string. + + Args: + date_str: ISO format date string + + Returns: + A date object extracted from the input string. + """ + date_str = date_str.replace('Z', '+00:00') + return datetime_type.fromisoformat(date_str).date() + + +def extract_first_date_from_index(index_name: str) -> date: + """Extract the first date from an index name containing date patterns. + + Searches for date patterns (YYYY-MM-DD) within the index name string + and returns the first found date as a date object. + + Args: + index_name: Index name containing date patterns. + + Returns: + A date object extracted from the first date pattern found in the index name. + + """ + date_pattern = r'\d{4}-\d{2}-\d{2}' + match = re.search(date_pattern, index_name) + return datetime_type.strptime(match.group(0), "%Y-%m-%d").date() \ No newline at end of file diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/document.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/document.py index 0ba0e025..05c40f64 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/document.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/document.py @@ -21,28 +21,3 @@ def mk_item_id(item_id: str, collection_id: str) -> str: str: The document id for the Item, combining the Item id and the Collection id, separated by a `|` character. 
""" return f"{item_id}|{collection_id}" - - -def mk_actions(collection_id: str, processed_items: List[Item]) -> List[Dict[str, Any]]: - """Create Elasticsearch bulk actions for a list of processed items. - - Args: - collection_id (str): The identifier for the collection the items belong to. - processed_items (List[Item]): The list of processed items to be bulk indexed. - - Returns: - List[Dict[str, Union[str, Dict]]]: The list of bulk actions to be executed, - each action being a dictionary with the following keys: - - `_index`: the index to store the document in. - - `_id`: the document's identifier. - - `_source`: the source of the document. - """ - index_alias = index_alias_by_collection_id(collection_id) - return [ - { - "_index": index_alias, - "_id": mk_item_id(item["id"], item["collection"]), - "_source": item, - } - for item in processed_items - ] diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py index 3305f50f..d94a8d0b 100644 --- a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index.py @@ -2,7 +2,8 @@ This module provides functions for creating and managing indices in Elasticsearch/OpenSearch. """ - +import re +from datetime import datetime from functools import lru_cache from typing import Any, List, Optional @@ -66,6 +67,55 @@ def indices(collection_ids: Optional[List[str]]) -> str: ) +def filter_indexes_by_datetime( + indexes: List[str], gte: Optional[str], lte: Optional[str] +) -> List[str]: + """Filter indexes based on datetime range extracted from index names. + + Args: + indexes: List of index names containing dates + gte: Greater than or equal date filter (ISO format, optional 'Z' suffix) + lte: Less than or equal date filter (ISO format, optional 'Z' suffix) + + Returns: + List of filtered index names + """ + + def parse_datetime(dt_str: str) -> datetime: + """Parse datetime string, handling both with and without 'Z' suffix.""" + return datetime.fromisoformat(dt_str.rstrip('Z')) + + def extract_date_range_from_index(index_name: str) -> tuple: + """Extract start and end dates from index name.""" + date_pattern = r"(\d{4}-\d{2}-\d{2})" + dates = re.findall(date_pattern, index_name) + + if len(dates) == 1: + start_date = datetime.strptime(dates[0], "%Y-%m-%d") + max_date = datetime.max.replace(microsecond=0) + return start_date, max_date + else: + start_date = datetime.strptime(dates[0], "%Y-%m-%d") + end_date = datetime.strptime(dates[1], "%Y-%m-%d") + return start_date, end_date + + def is_index_in_range(start_date: datetime, end_date: datetime, gte_dt: datetime, lte_dt: datetime) -> bool: + """Check if index date range overlaps with filter range.""" + return not (end_date.date() < gte_dt.date() or start_date.date() > lte_dt.date()) + + gte_dt = parse_datetime(gte) if gte else datetime.min.replace(microsecond=0) + lte_dt = parse_datetime(lte) if lte else datetime.max.replace(microsecond=0) + + filtered_indexes = [] + + for index in indexes: + start_date, end_date = extract_date_range_from_index(index) + if is_index_in_range(start_date, end_date, gte_dt, lte_dt): + filtered_indexes.append(index) + + return filtered_indexes + + async def create_index_templates_shared(settings: Any) -> None: """Create index templates for Elasticsearch/OpenSearch Collection and Item indices. 
diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index_insertion_strategies.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index_insertion_strategies.py new file mode 100644 index 00000000..6b8755d9 --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index_insertion_strategies.py @@ -0,0 +1,602 @@ +"""Index insertion strategies for Elasticsearch/OpenSearch. + +This module provides strategies for choosing or creating the target index when +writing STAC Items, with optional datetime-based partitioning. +""" +from __future__ import annotations + +import os +from abc import ABC, abstractmethod +from datetime import timedelta +from enum import Enum +from typing import Any, Dict, List, Union + +from fastapi import HTTPException + +from stac_fastapi.core.utilities import get_bool_env +from stac_fastapi.sfeos_helpers.database import ( + extract_date, + extract_first_date_from_index, + index_alias_by_collection_id, + index_by_collection_id, + mk_item_id, +) +from stac_fastapi.sfeos_helpers.mappings import ( + _ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE, + ES_ITEMS_MAPPINGS, + ES_ITEMS_SETTINGS, + ITEMS_INDEX_PREFIX, +) + +from .index_selection_strategies import ( + AsyncDatetimeBasedIndexSelector, + SyncDatetimeBasedIndexSelector, +) + + +class SearchEngineType(Enum): + """Supported search engine backends.""" + + ELASTICSEARCH = "elasticsearch" + OPENSEARCH = "opensearch" + + +class SearchEngineAdapter(ABC): + """Abstract adapter for engine-specific index and alias operations.""" + + @abstractmethod + async def create_simple_index(self, client: Any, collection_id: str) -> str: + pass + + @abstractmethod + async def create_datetime_index( + self, client: Any, collection_id: str, start_date: str + ) -> str: + pass + + @abstractmethod + def create_simple_index_sync(self, sync_client: Any, collection_id: str) -> str: + pass + + @abstractmethod + def create_datetime_index_sync( + self, sync_client: Any, collection_id: str, start_date: str + ) -> str: + pass + + async def update_index_alias(self, client: Any, end_date: str, old_alias: str): + index = ITEMS_INDEX_PREFIX + old_alias + new_alias = self.alias_by_index_and_end_date(old_alias, end_date) + await client.indices.update_aliases(body={ + "actions": [ + {"remove": {"index": index, "alias": old_alias}}, + {"add": {"index": index, "alias": new_alias}} + ] + }) + return new_alias + + def update_index_alias_sync(self, client: Any, end_date: str, old_alias: str): + index = ITEMS_INDEX_PREFIX + old_alias + new_alias = self.alias_by_index_and_end_date(old_alias, end_date) + client.indices.update_aliases(body={ + "actions": [ + {"remove": {"index": index, "alias": old_alias}}, + {"add": {"index": index, "alias": new_alias}} + ] + }) + return new_alias + + @staticmethod + def index_by_collection_id_and_date(collection_id: str, start_date: str) -> str: + cleaned = collection_id.translate(_ES_INDEX_NAME_UNSUPPORTED_CHARS_TABLE) + return f"{ITEMS_INDEX_PREFIX}{cleaned.lower()}_{start_date}" + + @staticmethod + def alias_by_index_and_end_date(index: str, end_date: str) -> str: + return f"{index}-{end_date}" + + +class ElasticsearchAdapter(SearchEngineAdapter): + """Elasticsearch implementation of the search engine adapter.""" + + async def create_simple_index(self, client: Any, collection_id: str) -> str: + index_name = f"{index_by_collection_id(collection_id)}-000001" + await client.options(ignore_status=400).indices.create( + index=index_name, + body={"aliases": {index_alias_by_collection_id(collection_id): {}}}, + ) + return index_name + + async def create_datetime_index( + self, client: Any, collection_id: str, start_date: str + ) -> str: + index_name = self.index_by_collection_id_and_date(collection_id, start_date) + alias_name = index_name.removeprefix(ITEMS_INDEX_PREFIX) + await client.options(ignore_status=400).indices.create( + index=index_name, 
body={"aliases": {index_alias_by_collection_id(collection_id): {}, alias_name: {}}}, + ) + return alias_name + + def create_simple_index_sync(self, sync_client: Any, collection_id: str) -> str: + index_name = f"{index_by_collection_id(collection_id)}-000001" + sync_client.options(ignore_status=400).indices.create( + index=index_name, + body={"aliases": {index_alias_by_collection_id(collection_id): {}}}, + ) + return index_name + + def create_datetime_index_sync( + self, sync_client: Any, collection_id: str, start_date: str + ) -> str: + index_name = self.index_by_collection_id_and_date(collection_id, start_date) + alias_name = index_name.removeprefix(ITEMS_INDEX_PREFIX) + sync_client.options(ignore_status=400).indices.create( + index=index_name, + body={"aliases": {index_alias_by_collection_id(collection_id): {}, alias_name: {}}}, + ) + return alias_name + + +class OpenSearchAdapter(SearchEngineAdapter): + async def create_simple_index(self, client: Any, collection_id: str) -> str: + index_name = f"{index_by_collection_id(collection_id)}-000001" + exists = await client.indices.exists(index=index_name) + if not exists: + await client.indices.create( + index=index_name, + body={ + "aliases": {index_alias_by_collection_id(collection_id): {}}, + "mappings": ES_ITEMS_MAPPINGS, + "settings": ES_ITEMS_SETTINGS, + }, + ) + return index_name + + async def create_datetime_index( + self, client: Any, collection_id: str, start_date: str + ) -> str: + index_name = self.index_by_collection_id_and_date(collection_id, start_date) + alias_name = index_name.removeprefix(ITEMS_INDEX_PREFIX) + await client.indices.create( + index=index_name, + body={ + "aliases": {index_alias_by_collection_id(collection_id): {}, alias_name: {}}, + "mappings": ES_ITEMS_MAPPINGS, + "settings": ES_ITEMS_SETTINGS, + }, + ) + return alias_name + + def create_simple_index_sync(self, sync_client: Any, collection_id: str) -> str: + index_name = f"{index_by_collection_id(collection_id)}-000001" + exists = sync_client.indices.exists(index=index_name) + if not exists: + sync_client.indices.create( + index=index_name, + body={ + "aliases": {index_alias_by_collection_id(collection_id): {}}, + "mappings": ES_ITEMS_MAPPINGS, + "settings": ES_ITEMS_SETTINGS, + }, + ) + return index_name + + def create_datetime_index_sync( + self, sync_client: Any, collection_id: str, start_date: str + ) -> str: + index_name = self.index_by_collection_id_and_date(collection_id, start_date) + alias_name = index_name.removeprefix(ITEMS_INDEX_PREFIX) + sync_client.indices.create( + index=index_name, + body={ + "aliases": {index_alias_by_collection_id(collection_id): {}, alias_name: {}}, + "mappings": ES_ITEMS_MAPPINGS, + "settings": ES_ITEMS_SETTINGS, + }, + ) + return alias_name + + +class SearchEngineAdapterFactory: + @staticmethod + def create_adapter(engine_type: SearchEngineType) -> SearchEngineAdapter: + return ( + ElasticsearchAdapter() + if engine_type == SearchEngineType.ELASTICSEARCH + else OpenSearchAdapter() + ) + + @staticmethod + def detect_engine_type(client: Any) -> SearchEngineType: + return ( + SearchEngineType.OPENSEARCH + if "opensearch" in str(client.__class__) + else SearchEngineType.ELASTICSEARCH + ) + + +class BaseIndexInserter(ABC): + @abstractmethod + def get_target_index(self, collection_id: str, product: Dict[str, Any]): + pass + + @abstractmethod + def prepare_bulk_actions(self, collection_id: str, items: List[Dict[str, Any]]): + pass + + @abstractmethod + def should_create_collection_index(self) -> bool: + pass + + +class 
AsyncIndexInserter(BaseIndexInserter): + def __init__(self, client, search_adapter: SearchEngineAdapter): + self.client = client + self.search_adapter = search_adapter + + def should_create_collection_index(self) -> bool: + return False + + async def create_simple_index(self, client: Any, collection_id: str): + return await self.search_adapter.create_simple_index(client, collection_id) + + async def get_index_size_in_gb(self, index_name: str) -> float: + data = await self.client.indices.stats(index=index_name) + return data["_all"]["primaries"]["store"]["size_in_bytes"] / 1e9 + + async def _get_target_index_base( + self, + index_selector, + collection_id: str, + product: Dict[str, Any], + check_size: bool = True, + ) -> str: + product_datetime = product["properties"]["datetime"] + + if not product_datetime: + raise HTTPException( + status_code=400, + detail="Product datetime is required for indexing" + ) + + datetime_range = {"gte": product_datetime, "lte": product_datetime} + target_index = await index_selector.select_indexes( + [collection_id], datetime_range + ) + all_indexes = await index_selector.get_collection_indexes(collection_id) + + if not all_indexes: + target_index = await self.search_adapter.create_datetime_index( + self.client, collection_id, extract_date(product_datetime) + ) + await index_selector.refresh_cache() + return target_index + + all_indexes.sort() + + if (start_date := extract_date(product_datetime)) < (end_date := extract_first_date_from_index(all_indexes[0])): + target_index = await self.search_adapter.create_datetime_index( + self.client, collection_id, str(start_date) + ) + alias = await self.search_adapter.update_index_alias( + self.client, str(end_date - timedelta(days=1)), target_index + ) + await index_selector.refresh_cache() + return alias + + if target_index != all_indexes[-1]: + return target_index + + if check_size: + index_size_gb = await self.get_index_size_in_gb(target_index) + max_size_gb = float(os.getenv("DATETIME_INDEX_MAX_SIZE_GB", 20)) + + if index_size_gb > max_size_gb: + end_date = extract_date(product_datetime) + if end_date != extract_first_date_from_index(all_indexes[-1]): + await self.search_adapter.update_index_alias( + self.client, str(end_date), target_index + ) + target_index = await self.search_adapter.create_datetime_index( + self.client, collection_id, (end_date + timedelta(days=1)) + ) + await index_selector.refresh_cache() + + return target_index + + async def get_target_index( + self, collection_id: str, product: Dict[str, Any] + ) -> str: + index_selector = AsyncDatetimeBasedIndexSelector(self.client) + return await self._get_target_index_base( + index_selector, collection_id, product, check_size=True + ) + + async def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + index_selector = AsyncDatetimeBasedIndexSelector(self.client) + all_indexes = await index_selector.get_collection_indexes(collection_id) + + if not all_indexes: + first_item = items[0] + await self.search_adapter.create_datetime_index( + self.client, + collection_id, + extract_date(first_item["properties"]["datetime"]), + ) + await index_selector.refresh_cache() + all_indexes = await index_selector.get_collection_indexes(collection_id) + + all_indexes.sort() + latest_index = all_indexes[-1] + + first_item = items[0] + first_item_index = await self._get_target_index_base( + index_selector, collection_id, first_item, check_size=False + ) + + split_date = None + new_index = None + + if first_item_index == 
latest_index: + index_size_gb = await self.get_index_size_in_gb(first_item_index) + max_size_gb = float(os.getenv("DATETIME_INDEX_MAX_SIZE_GB", 20)) + + if index_size_gb > max_size_gb: + current_index_end_date = extract_first_date_from_index(first_item_index) + first_item_date = extract_date(first_item["properties"]["datetime"]) + + if first_item_date != current_index_end_date: + await self.search_adapter.update_index_alias( + self.client, str(current_index_end_date), latest_index + ) + next_day_start = current_index_end_date + timedelta(days=1) + new_index = await self.search_adapter.create_datetime_index( + self.client, collection_id, next_day_start + ) + split_date = current_index_end_date + + actions = [] + for item in items: + item_date = extract_date(item["properties"]["datetime"]) + + if split_date and item_date > split_date: + target_index = new_index + else: + target_index = await self._get_target_index_base( + index_selector, collection_id, item, check_size=False + ) + + actions.append( + { + "_index": target_index, + "_id": mk_item_id(item["id"], item["collection"]), + "_source": item, + } + ) + + return actions + + +class SyncIndexInserter(BaseIndexInserter): + def __init__(self, sync_client, search_adapter: SearchEngineAdapter): + self.sync_client = sync_client + self.search_adapter = search_adapter + + def should_create_collection_index(self) -> bool: + return False + + def get_index_size_in_gb(self, index_name: str) -> float: + data = self.sync_client.indices.stats(index=index_name) + return data["_all"]["primaries"]["store"]["size_in_bytes"] / 1e9 + + def _get_target_index_base( + self, + index_selector, + collection_id: str, + product: Dict[str, Any], + check_size: bool = True, + ) -> str: + product_datetime = product["properties"]["datetime"] + + if not product_datetime: + raise HTTPException( + status_code=400, + detail="Product datetime is required for indexing" + ) + + datetime_range = {"gte": product_datetime, "lte": product_datetime} + target_index = index_selector.select_indexes([collection_id], datetime_range) + all_indexes = index_selector.get_collection_indexes(collection_id) + + if not all_indexes: + target_index = self.search_adapter.create_datetime_index_sync( + self.sync_client, collection_id, extract_date(product_datetime) + ) + index_selector.refresh_cache() + return target_index + + # Sort before touching all_indexes[0]/[-1], mirroring the async version. + all_indexes.sort() + + if (start_date := extract_date(product_datetime)) < (end_date := extract_first_date_from_index(all_indexes[0])): + target_index = self.search_adapter.create_datetime_index_sync( + self.sync_client, collection_id, str(start_date) + ) + alias = self.search_adapter.update_index_alias_sync( + self.sync_client, str(end_date - timedelta(days=1)), target_index + ) + index_selector.refresh_cache() + return alias + + if target_index != all_indexes[-1]: + return target_index + + if check_size: + index_size_gb = self.get_index_size_in_gb(target_index) + max_size_gb = float(os.getenv("DATETIME_INDEX_MAX_SIZE_GB", 20)) + + if index_size_gb > max_size_gb: + end_date = extract_date(product_datetime) + if end_date != extract_first_date_from_index(all_indexes[-1]): + # Use the sync alias update; the async variant would return + # an un-awaited coroutine in this synchronous path. + self.search_adapter.update_index_alias_sync( + self.sync_client, str(end_date), target_index + ) + target_index = self.search_adapter.create_datetime_index_sync( + self.sync_client, collection_id, (end_date + timedelta(days=1)) + ) + index_selector.refresh_cache() + + return target_index + + def get_target_index(self, collection_id: str, product: Dict[str, Any]) -> str: + index_selector = 
SyncDatetimeBasedIndexSelector(self.sync_client) + return self._get_target_index_base( + index_selector, collection_id, product, check_size=True + ) + + def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + index_selector = SyncDatetimeBasedIndexSelector(self.sync_client) + all_indexes = index_selector.get_collection_indexes(collection_id) + + if not all_indexes: + first_item = items[0] + self.search_adapter.create_datetime_index_sync( + self.sync_client, + collection_id, + extract_date(first_item["properties"]["datetime"]), + ) + index_selector.refresh_cache() + all_indexes = index_selector.get_collection_indexes(collection_id) + + all_indexes.sort() + latest_index = all_indexes[-1] + + first_item = items[0] + first_item_index = self._get_target_index_base( + index_selector, collection_id, first_item, check_size=False + ) + + split_date = None + new_index = None + + if first_item_index == latest_index: + index_size_gb = self.get_index_size_in_gb(first_item_index) + max_size_gb = float(os.getenv("DATETIME_INDEX_MAX_SIZE_GB", 20)) + if index_size_gb > max_size_gb: + current_index_end_date = extract_first_date_from_index(first_item_index) + first_item_date = extract_date(first_item["properties"]["datetime"]) + + if first_item_date != current_index_end_date: + self.search_adapter.update_index_alias_sync( + self.sync_client, str(current_index_end_date), latest_index + ) + next_day_start = current_index_end_date + timedelta(days=1) + new_index = self.search_adapter.create_datetime_index_sync( + self.sync_client, collection_id, next_day_start + ) + split_date = current_index_end_date + + actions = [] + for item in items: + item_date = extract_date(item["properties"]["datetime"]) + + if split_date and item_date > split_date: + target_index = new_index + else: + target_index = self._get_target_index_base( + index_selector, collection_id, item, check_size=False + ) + + actions.append( + { + "_index": target_index, + "_id": mk_item_id(item["id"], item["collection"]), + "_source": item, + } + ) + + return actions + + +class AsyncSimpleIndexInsertion(BaseIndexInserter): + def __init__(self, search_adapter: SearchEngineAdapter, client): + self.search_adapter = search_adapter + self.client = client + + def should_create_collection_index(self) -> bool: + return True + + async def create_simple_index(self, client: Any, collection_id: str): + return await self.search_adapter.create_simple_index(client, collection_id) + + async def get_target_index( + self, collection_id: str, item_data: Union[Dict[str, Any], List[Dict[str, Any]]] + ) -> str: + return index_alias_by_collection_id(collection_id) + + async def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + target_index = index_alias_by_collection_id(collection_id) + return [ + { + "_index": target_index, + "_id": mk_item_id(item["id"], item["collection"]), + "_source": item, + } + for item in items + ] + + +class SyncSimpleIndexInsertion(BaseIndexInserter): + def __init__(self, search_adapter: SearchEngineAdapter, client): + self.search_adapter = search_adapter + self.client = client + + def should_create_collection_index(self) -> bool: + return True + + def create_simple_index(self, client: Any, collection_id: str): + # Call the sync adapter variant; the async create_simple_index would + # hand back an un-awaited coroutine in this synchronous path. + return self.search_adapter.create_simple_index_sync(client, collection_id) + + def get_target_index( + self, collection_id: str, item_data: Union[Dict[str, Any], List[Dict[str, Any]]] + ) -> str: + return 
index_alias_by_collection_id(collection_id) + + def prepare_bulk_actions( + self, collection_id: str, items: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + target_index = index_alias_by_collection_id(collection_id) + return [ + { + "_index": target_index, + "_id": mk_item_id(item["id"], item["collection"]), + "_source": item, + } + for item in items + ] + + +class IndexInsertionFactory: + @staticmethod + def create_insertion_strategy(client) -> BaseIndexInserter: + engine_type = SearchEngineAdapterFactory.detect_engine_type(client) + search_adapter = SearchEngineAdapterFactory.create_adapter(engine_type) + + use_datetime_partitioning = get_bool_env( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + return ( + AsyncIndexInserter(client, search_adapter) + if use_datetime_partitioning + else AsyncSimpleIndexInsertion(search_adapter, client) + ) + + @staticmethod + def create_sync_insertion_strategy(sync_client) -> BaseIndexInserter: + engine_type = SearchEngineAdapterFactory.detect_engine_type(sync_client) + search_adapter = SearchEngineAdapterFactory.create_adapter(engine_type) + + use_datetime_partitioning = get_bool_env( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + return ( + SyncIndexInserter(sync_client, search_adapter) + if use_datetime_partitioning + else SyncSimpleIndexInsertion(search_adapter, sync_client) + ) diff --git a/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index_selection_strategies.py b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index_selection_strategies.py new file mode 100644 index 00000000..cb989f5f --- /dev/null +++ b/stac_fastapi/sfeos_helpers/stac_fastapi/sfeos_helpers/database/index_selection_strategies.py @@ -0,0 +1,387 @@ +"""Index selection strategies for Elasticsearch/OpenSearch. + +This module provides strategies for selecting appropriate indices when querying +Elasticsearch/OpenSearch, with support for datetime-based filtering and caching. +""" + +from __future__ import annotations + +import time +from abc import ABC, abstractmethod +from typing import Awaitable, Dict, List, Optional, Union + +from stac_fastapi.core.utilities import get_bool_env +from stac_fastapi.sfeos_helpers.database import ( + filter_indexes_by_datetime, + index_alias_by_collection_id, + index_by_collection_id, + indices, +) +from stac_fastapi.sfeos_helpers.mappings import ITEM_INDICES, ITEMS_INDEX_PREFIX + + +class IndexSelectionStrategy(ABC): + """Abstract base class for index selection strategies.""" + + @abstractmethod + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> Union[str, Awaitable[str]]: + """Select appropriate indexes based on collection IDs and datetime criteria. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + If None, all collections are considered. + datetime_search (Dict[str, Optional[str]]): Dictionary containing datetime + search criteria with 'gte' and 'lte' keys for range filtering. + + Returns: + Union[str, Awaitable[str]]: Comma-separated string of selected index names + or awaitable that resolves to such string. + """ + pass + + +class UnfilteredIndexSelector(IndexSelectionStrategy): + """Index selector that returns all available indices without filtering.""" + + async def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select all indices for given collections without datetime filtering. 
+ + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + If None, all collections are considered. + datetime_search (Dict[str, Optional[str]]): Datetime search criteria + (ignored by this implementation). + + Returns: + str: Comma-separated string of all available index names for the collections. + """ + return indices(collection_ids) + + +class AsyncDatetimeBasedIndexSelector(IndexSelectionStrategy): + """Asynchronous index selector that filters indices based on datetime criteria with caching.""" + + _instance = None + + def __new__(cls, client): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self, client): + """Initialize the datetime-based index selector. + + Args: + client: Elasticsearch/OpenSearch client instance used for querying + index aliases and metadata. + """ + if not hasattr(self, '_initialized'): + self.client = client + self._aliases_cache: Optional[Dict[str, List[str]]] = None + self._cache_timestamp: float = 0 + self._initialized = True + + @property + def _cache_expired(self) -> bool: + """Check if the aliases cache has expired. + + Returns: + bool: True if cache is older than 1 hour (3600 seconds), False otherwise. + """ + return time.time() - self._cache_timestamp > 3600 + + async def _load_aliases_cache(self) -> Dict[str, List[str]]: + """Load and cache index aliases from Elasticsearch/OpenSearch. + + Retrieves all aliases for indices matching the items index prefix pattern + and organizes them by base collection alias. + + Returns: + Dict[str, List[str]]: Dictionary mapping base collection aliases to + lists of their corresponding item index aliases. + """ + response = await self.client.indices.get_alias(index=f"{ITEMS_INDEX_PREFIX}*") + result = {} + for index_info in response.values(): + aliases = index_info.get("aliases", {}) + base_alias = None + items_aliases = [] + + for alias_name in aliases.keys(): + if not alias_name.startswith(ITEMS_INDEX_PREFIX): + items_aliases.append(alias_name) + else: + base_alias = alias_name + + if base_alias and items_aliases: + result.setdefault(base_alias, []).extend(items_aliases) + + self._aliases_cache = result + self._cache_timestamp = time.time() + return result + + async def refresh_cache(self) -> Dict[str, List[str]]: + """Force refresh of the aliases cache. + + Returns: + Dict[str, List[str]]: Refreshed dictionary mapping base collection aliases + to lists of their corresponding item index aliases. + """ + return await self._load_aliases_cache() + + async def get_aliases_cache(self) -> Dict[str, List[str]]: + """Get the current aliases cache, refreshing if expired or empty. + + Returns: + Dict[str, List[str]]: Dictionary mapping base collection aliases to + lists of their corresponding item index aliases. + """ + if self._aliases_cache is None or self._cache_expired: + return await self._load_aliases_cache() + return self._aliases_cache + + async def get_collection_indexes(self, collection_id: str) -> List[str]: + """Get all index aliases for a specific collection. + + Args: + collection_id (str): The ID of the collection to retrieve indexes for. + + Returns: + List[str]: List of index aliases associated with the collection. + Returns empty list if collection is not found in cache. 
+        """
+        if collection_ids:
+            selected_indexes = []
+            for collection_id in collection_ids:
+                collection_indexes = await self.get_collection_indexes(collection_id)
+                filtered_indexes = filter_indexes_by_datetime(
+                    collection_indexes,
+                    datetime_search.get("gte"),
+                    datetime_search.get("lte"),
+                )
+                selected_indexes.extend(filtered_indexes)
+
+            return ",".join(selected_indexes) if selected_indexes else ""
+
+        return ITEM_INDICES
+
+
+class SyncDatetimeBasedIndexSelector(IndexSelectionStrategy):
+    """Synchronous index selector that filters indices based on datetime criteria with caching."""
+
+    _instance = None
+
+    def __new__(cls, client):
+        """Return the shared selector instance, creating it on first use."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, sync_client):
+        """Initialize the datetime-based index selector.
+
+        Args:
+            sync_client: Synchronous Elasticsearch/OpenSearch client instance used for querying
+                index aliases and metadata.
+        """
+        if not hasattr(self, "_initialized"):
+            self.sync_client = sync_client
+            self._aliases_cache: Optional[Dict[str, List[str]]] = None
+            self._cache_timestamp: float = 0
+            self._initialized = True
+
+    @property
+    def _cache_expired(self) -> bool:
+        """Check if the aliases cache has expired.
+
+        Returns:
+            bool: True if cache is older than 1 hour (3600 seconds), False otherwise.
+        """
+        return time.time() - self._cache_timestamp > 3600
+
+    def _load_aliases_cache(self) -> Dict[str, List[str]]:
+        """Load and cache index aliases from Elasticsearch/OpenSearch.
+
+        Retrieves all aliases for indices matching the items index prefix pattern
+        and organizes them by base collection alias.
+
+        Returns:
+            Dict[str, List[str]]: Dictionary mapping base collection aliases to
+                lists of their corresponding item index aliases.
+        """
+        response = self.sync_client.indices.get_alias(index=f"{ITEMS_INDEX_PREFIX}*")
+        result = {}
+
+        for index_info in response.values():
+            aliases = index_info.get("aliases", {})
+            base_alias = None
+            items_aliases = []
+
+            for alias_name in aliases.keys():
+                if not alias_name.startswith(ITEMS_INDEX_PREFIX):
+                    items_aliases.append(alias_name)
+                else:
+                    base_alias = alias_name
+
+            if base_alias and items_aliases:
+                result.setdefault(base_alias, []).extend(items_aliases)
+
+        self._aliases_cache = result
+        self._cache_timestamp = time.time()
+        return result
+
+    def refresh_cache(self) -> Dict[str, List[str]]:
+        """Force refresh of the aliases cache.
+
+        Returns:
+            Dict[str, List[str]]: Refreshed dictionary mapping base collection aliases
+                to lists of their corresponding item index aliases.
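+
+        Example:
+            A minimal sketch; the test fixtures call this after wiping item
+            indexes so stale aliases are dropped::
+
+                SyncDatetimeBasedIndexSelector(sync_client).refresh_cache()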
+ """ + return self._load_aliases_cache() + + def get_aliases_cache(self) -> Dict[str, List[str]]: + """Get the current aliases cache, refreshing if expired or empty. + + Returns: + Dict[str, List[str]]: Dictionary mapping base collection aliases to + lists of their corresponding item index aliases. + """ + if self._aliases_cache is None or self._cache_expired: + return self._load_aliases_cache() + return self._aliases_cache + + def get_collection_indexes(self, collection_id: str) -> List[str]: + """Get all index aliases for a specific collection. + + Args: + collection_id (str): The ID of the collection to retrieve indexes for. + + Returns: + List[str]: List of index aliases associated with the collection. + Returns empty list if collection is not found in cache. + """ + cache = self.get_aliases_cache() + return cache.get(index_alias_by_collection_id(collection_id), []) + + def select_indexes( + self, + collection_ids: Optional[List[str]], + datetime_search: Dict[str, Optional[str]], + ) -> str: + """Select indexes filtered by collection IDs and datetime criteria. + + For each specified collection, retrieves its associated indexes and filters + them based on datetime range. If no collection IDs are provided, returns + all item indices. + + Args: + collection_ids (Optional[List[str]]): List of collection IDs to filter by. + If None or empty, returns all item indices. + datetime_search (Dict[str, Optional[str]]): Dictionary containing datetime + search criteria with 'gte' and 'lte' keys for range filtering. + + Returns: + str: Comma-separated string of selected index names that match the + collection and datetime criteria. Returns empty string if no + indexes match the criteria. + """ + if collection_ids: + selected_indexes = [] + for collection_id in collection_ids: + collection_indexes = self.get_collection_indexes(collection_id) + filtered_indexes = filter_indexes_by_datetime( + collection_indexes, + datetime_search.get("gte"), + datetime_search.get("lte"), + ) + selected_indexes.extend(filtered_indexes) + + return ",".join(selected_indexes) if selected_indexes else "" + + return ITEM_INDICES + + +class IndexSelectorFactory: + """Factory class for creating index selector instances.""" + + @staticmethod + def create_async_selector(client) -> IndexSelectionStrategy: + """Create an appropriate asynchronous index selector based on environment configuration. + + Checks the ENABLE_DATETIME_INDEX_FILTERING environment variable to determine + whether to use datetime-based filtering or return all available indices. + + Args: + client: Asynchronous Elasticsearch/OpenSearch client instance, used only if datetime + filtering is enabled. + + Returns: + IndexSelectionStrategy: Either an AsyncDatetimeBasedIndexSelector if datetime + filtering is enabled, or an UnfilteredIndexSelector otherwise. + """ + use_datetime_filtering = get_bool_env( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + return ( + AsyncDatetimeBasedIndexSelector(client) + if use_datetime_filtering + else UnfilteredIndexSelector() + ) + + @staticmethod + def create_sync_selector(sync_client) -> IndexSelectionStrategy: + """Create an appropriate synchronous index selector based on environment configuration. + + Checks the ENABLE_DATETIME_INDEX_FILTERING environment variable to determine + whether to use datetime-based filtering or return all available indices. + + Args: + sync_client: Synchronous Elasticsearch/OpenSearch client instance, used only if datetime + filtering is enabled. 
+
+        Returns:
+            IndexSelectionStrategy: Either a SyncDatetimeBasedIndexSelector if datetime
+                filtering is enabled, or an UnfilteredIndexSelector otherwise.
+        """
+        use_datetime_filtering = get_bool_env(
+            "ENABLE_DATETIME_INDEX_FILTERING", default="false"
+        )
+
+        return (
+            SyncDatetimeBasedIndexSelector(sync_client)
+            if use_datetime_filtering
+            else UnfilteredIndexSelector()
+        )
diff --git a/stac_fastapi/tests/api/test_api.py b/stac_fastapi/tests/api/test_api.py
index 807da5e4..465f0b97 100644
--- a/stac_fastapi/tests/api/test_api.py
+++ b/stac_fastapi/tests/api/test_api.py
@@ -1,7 +1,10 @@
+import os
+
 import random
 import uuid
 from copy import deepcopy
 from datetime import datetime, timedelta
+from unittest.mock import patch
 
 import pytest
 
@@ -424,6 +427,9 @@ async def test_search_point_does_not_intersect(app_client, ctx):
 
 @pytest.mark.asyncio
 async def test_datetime_response_format(app_client, txn_client, ctx):
+    if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
     first_item = dict(ctx.item)
     second_item = deepcopy(first_item)
 
@@ -461,6 +467,9 @@ async def test_datetime_response_format(app_client, txn_client, ctx):
 
 @pytest.mark.asyncio
 async def test_datetime_non_interval(app_client, txn_client, ctx):
+    if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
     first_item = dict(ctx.item)
     second_item = deepcopy(first_item)
 
@@ -497,6 +506,9 @@ async def test_datetime_non_interval(app_client, txn_client, ctx):
 
 @pytest.mark.asyncio
 async def test_datetime_interval(app_client, txn_client, ctx):
+    if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
     first_item = dict(ctx.item)
     second_item = deepcopy(first_item)
 
@@ -533,6 +545,9 @@ async def test_datetime_interval(app_client, txn_client, ctx):
 
 @pytest.mark.asyncio
 async def test_datetime_bad_non_interval(app_client, txn_client, ctx):
+    if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
     first_item = dict(ctx.item)
     second_item = deepcopy(first_item)
 
@@ -569,6 +584,9 @@ async def test_datetime_bad_non_interval(app_client, txn_client, ctx):
 
 @pytest.mark.asyncio
 async def test_datetime_bad_interval(app_client, txn_client, ctx):
+    if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
     first_item = dict(ctx.item)
     second_item = deepcopy(first_item)
 
@@ -698,3 +716,204 @@
     results = {x["properties"][attr] for x in resp_json["features"]}
     assert len(results) == expected
     assert results == {value}
+
+
+@pytest.mark.asyncio
+async def test_create_item_in_past_date_creates_separate_index(app_client, ctx, load_test_data, txn_client):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
+    item = load_test_data("test_item.json")
+    item["id"] = str(uuid.uuid4())
+    item["properties"]["datetime"] = "2012-02-12T12:30:22Z"
+
+    response = await app_client.post(f"/collections/{item['collection']}/items", json=item)
+
+    assert response.status_code == 201
+
+    indices = await txn_client.database.client.indices.get_alias(index="*")
+    expected_indices = ["items_test-collection_2012-02-12", "items_test-collection_2020-02-12"]
+
+    for expected_index in expected_indices:
+        assert expected_index in indices.keys()
+
+
+@pytest.mark.asyncio
+async def test_create_item_uses_existing_datetime_index(app_client, ctx, load_test_data, txn_client):
+    if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
+    item = load_test_data("test_item.json")
+    item["id"] = str(uuid.uuid4())
+
+    response = await app_client.post(f"/collections/{item['collection']}/items", json=item)
app_client.post(f"/collections/{item['collection']}/items", json=item) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + assert 'items_test-collection_2020-02-12' in indices.keys() + + +async def test_create_item_with_different_date_same_index(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2022-02-12T12:30:22Z" + + response = await app_client.post(f"/collections/{item['collection']}/items", json=item) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + assert 'items_test-collection_2020-02-12' in indices.keys() + + +@pytest.mark.asyncio +async def test_create_new_index_when_size_limit_exceeded(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = load_test_data("test_item.json") + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = "2024-02-12T12:30:22Z" + + with patch('stac_fastapi.sfeos_helpers.database.AsyncIndexInserter.get_index_size_in_gb') as mock_get_size: + mock_get_size.return_value = 21.0 + response = await app_client.post(f"/collections/{item['collection']}/items", json=item) + + assert response.status_code == 201 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_indices = ['items_test-collection_2020-02-12', 'items_test-collection_2024-02-13'] + + for expected_index in expected_indices: + assert expected_index in indices.keys() + + +@pytest.mark.asyncio +async def test_bulk_create_item_fails_without_datetime(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + item = deepcopy(load_test_data) + item["id"] = "second-item" + item["properties"]["datetime"] = None + response = await app_client.post(f"/collections/{item['collection']}/items", json=item) + assert response.status_code == 400 + + +@pytest.mark.asyncio +async def test_bulk_create_items_with_same_date_range(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + base_item = load_test_data("test_item.json") + items_dict = {} + + for i in range(10): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = f"2020-02-{12 + i}T12:30:22Z" + items_dict[item["id"]] = item + + payload = { + "items": items_dict, + "method": "insert" + } + + response = await app_client.post( + f"/collections/{base_item['collection']}/bulk_items", + json=payload + ) + + assert response.status_code == 200 + + indices = await txn_client.database.client.indices.get_alias(index="*") + assert 'items_test-collection_2020-02-12' in indices.keys() + + +@pytest.mark.asyncio +async def test_bulk_create_items_with_different_date_ranges(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + base_item = load_test_data("test_item.json") + items_dict = {} + + for i in range(3): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = f"2020-02-{12 + i}T12:30:22Z" + items_dict[item["id"]] = item + + for i in range(2): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = f"2010-02-{10 + i}T12:30:22Z" + 
items_dict[item["id"]] = item + + payload = { + "items": items_dict, + "method": "insert" + } + + response = await app_client.post( + f"/collections/{base_item['collection']}/bulk_items", + json=payload + ) + + assert response.status_code == 200 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_indices = ['items_test-collection_2020-02-12', 'items_test-collection_2010-02-10'] + + for expected_index in expected_indices: + assert expected_index in indices.keys() + + +@pytest.mark.asyncio +async def test_bulk_create_items_with_size_limit_exceeded(app_client, load_test_data, txn_client, ctx): + if not os.getenv("ENABLE_DATETIME_INDEX_FILTERING"): + pytest.skip() + + base_item = load_test_data("test_item.json") + items_dict = {} + + for i in range(3): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = f"2019-02-{15 + i}T12:30:22Z" + items_dict[item["id"]] = item + + for i in range(2): + item = deepcopy(base_item) + item["id"] = str(uuid.uuid4()) + item["properties"]["datetime"] = f"2010-02-{10 + i}T12:30:22Z" + items_dict[item["id"]] = item + + payload = { + "items": items_dict, + "method": "insert" + } + + with patch('stac_fastapi.sfeos_helpers.database.SyncIndexInserter.get_index_size_in_gb') as mock_get_size: + mock_get_size.return_value = 21.0 + response = await app_client.post( + f"/collections/{base_item['collection']}/bulk_items", + json=payload + ) + + assert response.status_code == 200 + + indices = await txn_client.database.client.indices.get_alias(index="*") + expected_indices = [ + 'items_test-collection_2010-02-10', + 'items_test-collection_2019-02-15', + 'items_test-collection_2020-02-12' + ] + + for expected_index in expected_indices: + assert expected_index in indices.keys() diff --git a/stac_fastapi/tests/conftest.py b/stac_fastapi/tests/conftest.py index afb9ac9b..cc416c1c 100644 --- a/stac_fastapi/tests/conftest.py +++ b/stac_fastapi/tests/conftest.py @@ -28,6 +28,7 @@ from stac_fastapi.core.route_dependencies import get_route_dependencies from stac_fastapi.core.utilities import get_bool_env from stac_fastapi.sfeos_helpers.aggregation import EsAsyncBaseAggregationClient +from stac_fastapi.sfeos_helpers.mappings import ITEMS_INDEX_PREFIX if os.getenv("BACKEND", "elasticsearch").lower() == "opensearch": from stac_fastapi.opensearch.config import AsyncOpensearchSettings as AsyncSettings @@ -57,6 +58,7 @@ TokenPaginationExtension, TransactionExtension, ) +from stac_fastapi.extensions.third_party import BulkTransactionExtension from stac_fastapi.types.config import Settings DATA_DIR = os.path.join(os.path.dirname(__file__), "data") @@ -153,6 +155,9 @@ async def delete_collections_and_items(txn_client: TransactionsClient) -> None: await refresh_indices(txn_client) await txn_client.database.delete_items() await txn_client.database.delete_collections() + await txn_client.database.client.indices.delete(index=f"{ITEMS_INDEX_PREFIX}*") + await txn_client.database.async_index_selector.refresh_cache() + txn_client.database.sync_index_selector.refresh_cache() async def refresh_indices(txn_client: TransactionsClient) -> None: @@ -213,6 +218,13 @@ async def app(): ), settings=settings, ), + BulkTransactionExtension( + client=BulkTransactionsClient( + database=database, + session=None, + settings=settings, + ) + ), SortExtension(), FieldsExtension(), QueryExtension(), diff --git a/stac_fastapi/tests/database/test_database.py b/stac_fastapi/tests/database/test_database.py index 86611235..67897c15 100644 --- 
--- a/stac_fastapi/tests/database/test_database.py
+++ b/stac_fastapi/tests/database/test_database.py
@@ -1,3 +1,4 @@
+import os
 import uuid
 
 import pytest
@@ -27,6 +28,9 @@ async def test_index_mapping_collections(ctx):
 
 @pytest.mark.asyncio
 async def test_index_mapping_items(txn_client, load_test_data):
+    if os.getenv("ENABLE_DATETIME_INDEX_FILTERING"):
+        pytest.skip()
+
     collection = load_test_data("test_collection.json")
     collection["id"] = str(uuid.uuid4())
     await txn_client.create_collection(
diff --git a/stac_fastapi/tests/index/__init__.py b/stac_fastapi/tests/index/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/stac_fastapi/tests/index/test_index_selection.py b/stac_fastapi/tests/index/test_index_selection.py
new file mode 100644
index 00000000..47721a1d
--- /dev/null
+++ b/stac_fastapi/tests/index/test_index_selection.py
@@ -0,0 +1,232 @@
+import time
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from stac_fastapi.sfeos_helpers.database import (
+    AsyncDatetimeBasedIndexSelector,
+    IndexSelectorFactory,
+    UnfilteredIndexSelector,
+)
+
+
+@pytest.fixture(autouse=True)
+def reset_selector_singleton():
+    """Reset the singleton between tests.
+
+    AsyncDatetimeBasedIndexSelector caches a single instance, so a stale
+    instance (and its mocked client) would otherwise leak across tests.
+    """
+    AsyncDatetimeBasedIndexSelector._instance = None
+    yield
+    AsyncDatetimeBasedIndexSelector._instance = None
+
+
+def test_datetime_selector_cache_expired_when_empty():
+    client = AsyncMock()
+    selector = AsyncDatetimeBasedIndexSelector(client)
+    assert selector._cache_expired
+
+
+def test_datetime_selector_cache_expired_when_old():
+    client = AsyncMock()
+    selector = AsyncDatetimeBasedIndexSelector(client)
+    selector._cache_timestamp = time.time() - 3700
+    assert selector._cache_expired
+
+
+def test_datetime_selector_cache_not_expired_when_fresh():
+    client = AsyncMock()
+    selector = AsyncDatetimeBasedIndexSelector(client)
+    selector._cache_timestamp = time.time() - 1800
+    assert not selector._cache_expired
+
+
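+# Alias layout assumed by these mocks (mirroring _load_aliases_cache): each
+# index carries one ITEMS_INDEX_PREFIX-prefixed alias (the collection-wide
+# alias, used as the cache key) plus unprefixed datetime-specific aliases.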
"items_collection1_2023_01": {}} + } + } + client.indices.get_alias.return_value = mock_response + + result = await selector.refresh_cache() + assert result == {"collection1": ["items_collection1_2023_01"]} + client.indices.get_alias.assert_called_once() + + +@pytest.mark.asyncio +async def test_datetime_selector_get_collection_indexes(): + client = AsyncMock() + selector = AsyncDatetimeBasedIndexSelector(client) + selector._aliases_cache = { + "collection1": ["index1", "index2"], + "collection2": ["index3"], + } + selector._cache_timestamp = time.time() + + result = await selector.get_collection_indexes("collection1") + assert result == ["index1", "index2"] + + result = await selector.get_collection_indexes("nonexistent") + assert result == [] + + +@pytest.mark.asyncio +async def test_datetime_selector_select_indexes_with_collections(): + client = AsyncMock() + selector = AsyncDatetimeBasedIndexSelector(client) + selector._aliases_cache = { + "collection1": ["items_collection1_2023_01", "items_collection1_2023_02"], + "collection2": ["items_collection2_2023_01"], + } + selector._cache_timestamp = time.time() + with patch( + "stac_fastapi.sfeos_helpers.database.index_selection_strategies.filter_indexes_by_datetime" + ) as mock_filter: + mock_filter.side_effect = [ + ["items_collection1_2023_01"], + ["items_collection2_2023_01"], + ] + result = await selector.select_indexes( + ["collection1", "collection2"], {"gte": "2023-01-01", "lte": "2023-01-31"} + ) + assert result == "items_collection1_2023_01,items_collection2_2023_01" + assert mock_filter.call_count == 2 + + +@pytest.mark.asyncio +async def test_datetime_selector_select_indexes_without_collections(): + client = AsyncMock() + selector = AsyncDatetimeBasedIndexSelector(client) + result = await selector.select_indexes( + None, {"gte": "2023-01-01", "lte": "2023-01-31"} + ) + assert result == "items_*,-*kibana*,-collections*" + + +@pytest.mark.asyncio +async def test_datetime_selector_select_indexes_no_matches(): + client = AsyncMock() + selector = AsyncDatetimeBasedIndexSelector(client) + selector._aliases_cache = {"collection1": ["items_collection1_2023_01"]} + selector._cache_timestamp = time.time() + + with patch( + "stac_fastapi.sfeos_helpers.database.index_selection_strategies.filter_indexes_by_datetime" + ) as mock_filter: + mock_filter.return_value = [] + + result = await selector.select_indexes( + ["collection1"], {"gte": "2024-01-01", "lte": "2024-01-31"} + ) + assert result == "" + + +def test_index_selector_factory_creates_datetime_selector(): + client = AsyncMock() + + with patch( + "stac_fastapi.sfeos_helpers.database.index_selection_strategies.get_bool_env" + ) as mock_env: + mock_env.return_value = True + + selector = IndexSelectorFactory.create_async_selector(client) + + assert isinstance(selector, AsyncDatetimeBasedIndexSelector) + assert selector.client == client + mock_env.assert_called_once_with( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + +def test_index_selector_factory_creates_unfiltered_selector(): + client = AsyncMock() + + with patch( + "stac_fastapi.sfeos_helpers.database.index_selection_strategies.get_bool_env" + ) as mock_env: + mock_env.return_value = False + + selector = IndexSelectorFactory.create_async_selector(client) + + assert isinstance(selector, UnfilteredIndexSelector) + mock_env.assert_called_once_with( + "ENABLE_DATETIME_INDEX_FILTERING", default="false" + ) + + +def test_index_selector_factory_with_env_var(monkeypatch): + client = AsyncMock() + 
monkeypatch.setenv("ENABLE_DATETIME_INDEX_FILTERING", "true") + + selector = IndexSelectorFactory.create_async_selector(client) + + assert isinstance(selector, AsyncDatetimeBasedIndexSelector) + + +def test_index_selector_factory_without_env_var(monkeypatch): + client = AsyncMock() + monkeypatch.delenv("ENABLE_DATETIME_INDEX_FILTERING", raising=False) + + selector = IndexSelectorFactory.create_async_selector(client) + + assert isinstance(selector, UnfilteredIndexSelector)