diff --git a/.env_file.example b/.env_file.example index b63810a3..2da853e8 100644 --- a/.env_file.example +++ b/.env_file.example @@ -1,5 +1,5 @@ -IMAGE_MODEL=pdq -ELASTICSEARCH_URL=http://elasticsearch:9200 +IMAGE_MODEL=phash +OPENSEARCH_URL=http://opensearch:9200 REDIS_HOST=redis REDIS_PORT=6379 REDIS_DATABASE=0 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6aa849fb..2901a9b2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,6 +40,12 @@ deploy_qa: - pip install ecs-deploy==1.14.0 - pip install awscli==1.29.59 - aws ssm get-parameters-by-path --region $AWS_DEFAULT_REGION --path /qa/alegre/ --recursive --with-decryption --output text --query "Parameters[].[Name]" | sed -E 's#/qa/alegre/##' > env.qa.names + - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-migration $NAME /qa/alegre/$NAME " >> qa-alegre-migration.env.args; done + - ecs update qa-alegre-migration --image qa-alegre-migration $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e qa-alegre-migration APP alegre -e qa-alegre-migration DEPLOY_ENV qa -e qa-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e qa-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-migration.env.args` + - taskArn=$(aws ecs run-task --cluster ecs-qa --task-definition qa-alegre-migration --query 'tasks[].taskArn' --output text) + - echo "Migration task started - $taskArn" + - aws ecs wait tasks-stopped --cluster ecs-qa --tasks $taskArn + - echo "Migration task finished." - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-c $NAME /qa/alegre/$NAME " >> qa-alegre-c.env.args; done - ecs deploy ecs-qa qa-alegre --diff --image qa-alegre-c $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e qa-alegre-c APP alegre -e qa-alegre-c PERSISTENT_DISK_PATH /mnt/models/video -e qa-alegre-c DEPLOY_ENV qa -e qa-alegre-c ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-c.env.args` - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-indiansbert $NAME /qa/alegre/$NAME " >> qa-alegre-indiansbert.env.args; done @@ -92,6 +98,12 @@ deploy_live: - pip install ecs-deploy==1.14.0 - pip install awscli==1.29.59 - aws ssm get-parameters-by-path --region $AWS_DEFAULT_REGION --path /live/alegre/ --recursive --with-decryption --output text --query "Parameters[].[Name]" | sed -E 's#/live/alegre/##' > env.live.names + - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-migration $NAME /live/alegre/$NAME " >> live-alegre-migration.env.args; done + - ecs update live-alegre-migration --image live-alegre-migration $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e live-alegre-migration APP alegre -e live-alegre-migration DEPLOY_ENV live -e live-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e live-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-migration.env.args` + - taskArn=$(aws ecs run-task --cluster ecs-live --task-definition live-alegre-migration --query 'tasks[].taskArn' --output text) + - echo "Migration task started - $taskArn" + - aws ecs wait tasks-stopped --cluster ecs-live --tasks $taskArn + - echo "Migration task finished." - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-c $NAME /live/alegre/$NAME " >> live-alegre-c.env.args; done - ecs deploy ecs-live live-alegre --image live-alegre-c $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e live-alegre-c APP alegre -e live-alegre-c PERSISTENT_DISK_PATH /mnt/models/video -e live-alegre-c DEPLOY_ENV live -e live-alegre-c ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-c.env.args` - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-indiansbert $NAME /live/alegre/$NAME " >> live-alegre-indiansbert.env.args; done diff --git a/.travis.yml b/.travis.yml index 54058018..ad04c5ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ before_script: - docker-compose build --pull - docker-compose -f docker-compose.yml -f docker-test.yml up -d - docker-compose logs -t -f & -- echo "Waiting for Elasticsearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done +- echo "Waiting for OpenSearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done - until curl --silent --fail -I "http://localhost:3100"; do sleep 1; done - echo "Waiting for model servers..." && while [[ ! '2' =~ $(redis-cli -n 1 SCARD 'SharedModel') ]]; do sleep 1; done #comment until fix timeout curl: (28) Operation timed out diff --git a/Makefile b/Makefile index 9a687196..6d76b7b5 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,13 @@ .PHONY: run test wait -run: wait +migration: wait python manage.py init_perl_functions python manage.py init python manage.py db stamp head python manage.py db upgrade + echo "Migrations complete." + +run: wait python manage.py run # The model and worker entry points run repeatedly to @@ -28,7 +31,7 @@ test: wait coverage run --source=app/main/ manage.py test wait: - until curl --silent -XGET --fail $(ELASTICSEARCH_URL); do printf '.'; sleep 1; done + until curl --silent -XGET --fail $(OPENSEARCH_URL); do printf '.'; sleep 1; done contract_testing: wait curl -vvv -X POST "http://alegre:3100/image/similarity/" -H "Content-Type: application/json" -d '{"url":"https://i.pinimg.com/564x/0f/73/57/0f7357637b2b203e9f32e73c24d126d7.jpg","threshold":0.9,"context":{}}' diff --git a/README.md b/README.md index 1d7c330a..bf327423 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,7 @@ A media analysis service. Part of the [Check platform](https://meedan.com/check) The Alegre API Swagger UI unfortunately [does not support sending body payloads to GET methods](https://github.com/swagger-api/swagger-ui/issues/2136). To test those API methods, you can still fill in your arguments, and click "Execute" - Swagger will fail, but show you a `curl` command that you can use in your console. -- Open http://localhost:5601 for the Kibana UI -- Open http://localhost:9200 for the Elasticsearch API +- Open http://localhost:9200 for the OpenSearch API - `docker-compose exec alegre flask shell` to get inside a Python shell in docker container with the loaded app ## Testing @@ -30,7 +29,7 @@ To test individual modules: ## Troubleshooting -- If you're having trouble starting Elasticsearch on macOS, with the error `container_name exited with code 137`, you will need to adjust your Docker settings, as per https://www.petefreitag.com/item/848.cfm +- If you're having trouble starting OpenSearch on macOS, with the error `container_name exited with code 137`, you will need to adjust your Docker settings, as per https://www.petefreitag.com/item/848.cfm - Note that the alegre docker service definitions in the `alegre` repo may not align with the alegre service definitions in the `check` repository, so different variations of the service may be spun up depending on the directory where `docker-compose up` is executed. diff --git a/app/main/config.py b/app/main/config.py index 39c8ac92..4a88d360 100644 --- a/app/main/config.py +++ b/app/main/config.py @@ -6,8 +6,8 @@ class Config: SECRET_KEY = os.getenv('SECRET_KEY', 'my_precious_secret_key') DEBUG = False - ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL', 'http://elasticsearch:9200') - ELASTICSEARCH_SIMILARITY = 'alegre_similarity' + OPENSEARCH_URL = os.getenv('OPENSEARCH_URL', 'http://opensearch:9200') + OPENSEARCH_SIMILARITY = 'alegre_similarity' REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = os.getenv('REDIS_PORT', 6379) REDIS_DATABASE = os.getenv('REDIS_DATABASE', 0) @@ -53,7 +53,7 @@ class TestingConfig(Config): DEBUG = True TESTING = True PRESERVE_CONTEXT_ON_EXCEPTION = False - ELASTICSEARCH_SIMILARITY = 'alegre_similarity_test' + OPENSEARCH_SIMILARITY = 'alegre_similarity_test' REDIS_DATABASE = os.getenv('REDIS_DATABASE', 1) SQLALCHEMY_DATABASE_URI = 'postgresql+psycopg2://%(user)s:%(password)s@%(host)s/%(dbname)s?client_encoding=utf8' % { 'user': os.getenv('DATABASE_USER', 'postgres'), diff --git a/app/main/controller/bulk_similarity_controller.py b/app/main/controller/bulk_similarity_controller.py index 151793e4..15b4a3ae 100644 --- a/app/main/controller/bulk_similarity_controller.py +++ b/app/main/controller/bulk_similarity_controller.py @@ -26,7 +26,7 @@ class BulkSimilarityResource(Resource): def get_bulk_write_object(self, doc_id, body, op_type="index"): return { "_op_type": op_type, - '_index': app.config['ELASTICSEARCH_SIMILARITY'], + '_index': app.config['OPENSEARCH_SIMILARITY'], '_id': doc_id, '_source': body } @@ -46,7 +46,7 @@ def get_bodies_for_request(self): return doc_ids, bodies def submit_bulk_request(self, doc_ids, bodies, op_type="index"): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) writables = [] for doc_body_set in each_slice(list(zip(doc_ids, bodies)), 8000): to_write = [] diff --git a/app/main/controller/bulk_update_similarity_controller.py b/app/main/controller/bulk_update_similarity_controller.py index 025f9690..0b59a3e5 100644 --- a/app/main/controller/bulk_update_similarity_controller.py +++ b/app/main/controller/bulk_update_similarity_controller.py @@ -7,7 +7,7 @@ from app.main.controller.bulk_similarity_controller import BulkSimilarityResource from app.main.lib import similarity from app.main.lib.text_similarity import get_document_body -from app.main.lib.elasticsearch import merge_contexts +from app.main.lib.opensearch import merge_contexts def get_documents_by_ids(index, ids, es): query = { "query": { @@ -71,9 +71,9 @@ def get_cases(params, existing_docs, updateable=True): class BulkUpdateSimilarityResource(Resource): # Assumes less than 10k documents at a time. def get_writeable_data_for_request(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) params = request.json - existing_docs = get_documents_by_ids(app.config['ELASTICSEARCH_SIMILARITY'], [e.get("doc_id") for e in params.get("documents", [])], es) + existing_docs = get_documents_by_ids(app.config['OPENSEARCH_SIMILARITY'], [e.get("doc_id") for e in params.get("documents", [])], es) updated_cases = get_cases(params, existing_docs) new_cases = get_cases(params, existing_docs, False) return updated_cases, new_cases diff --git a/app/main/controller/healthcheck_controller.py b/app/main/controller/healthcheck_controller.py index e5715458..2d5de1b9 100644 --- a/app/main/controller/healthcheck_controller.py +++ b/app/main/controller/healthcheck_controller.py @@ -15,8 +15,8 @@ class HealthcheckResource(Resource): @api.doc('Make a healthcheck query') def get(self): result = { - 'ELASTICSEARCH': False, - 'ELASTICSEARCH_SIMILARITY': False, + 'OPENSEARCH': False, + 'OPENSEARCH_SIMILARITY': False, 'REDIS': False, 'DATABASE': False, 'LANGID': False @@ -24,15 +24,15 @@ def get(self): # Elasticsearch try: - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=10, max_retries=3, retry_on_timeout=True) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=10, max_retries=3, retry_on_timeout=True) except Exception as e: - result['ELASTICSEARCH'] = str(e) + result['OPENSEARCH'] = str(e) else: - result['ELASTICSEARCH'] = True - result['ELASTICSEARCH_SIMILARITY'] = True if es.indices.exists( - index=[app.config['ELASTICSEARCH_SIMILARITY']] - ) else 'Index not found `%s`' % app.config['ELASTICSEARCH_SIMILARITY'] + result['OPENSEARCH'] = True + result['OPENSEARCH_SIMILARITY'] = True if es.indices.exists( + index=[app.config['OPENSEARCH_SIMILARITY']] + ) else 'Index not found `%s`' % app.config['OPENSEARCH_SIMILARITY'] # Redis try: diff --git a/app/main/lib/graph_writer.py b/app/main/lib/graph_writer.py index 2787b52a..1fbdf365 100644 --- a/app/main/lib/graph_writer.py +++ b/app/main/lib/graph_writer.py @@ -1,4 +1,4 @@ -from app.main.lib.elasticsearch import get_all_documents_matching_context +from app.main.lib.opensearch import get_all_documents_matching_context from app.main.lib import text_similarity from app.main.lib import image_similarity from flask import current_app as app diff --git a/app/main/lib/language_analyzers.py b/app/main/lib/language_analyzers.py index 8859ffed..b4726e9e 100644 --- a/app/main/lib/language_analyzers.py +++ b/app/main/lib/language_analyzers.py @@ -2,7 +2,6 @@ from opensearchpy import OpenSearch from flask import request, current_app as app SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn", "pt-br", "ar", "fr", "de", "cjk", "id"] -#via https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer SETTINGS_BY_LANGUAGE = { "en": { "analysis": { @@ -304,17 +303,17 @@ } def init_indices(): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) indices = es.cat.indices(h='index', s='index').split() for lang in SUPPORTED_LANGUAGES: - index_name = app.config['ELASTICSEARCH_SIMILARITY']+"_"+lang + index_name = app.config['OPENSEARCH_SIMILARITY']+"_"+lang if index_name not in indices: es.indices.create(index=index_name) else: es.indices.delete(index=index_name) es.indices.create(index=index_name) es.indices.close(index=index_name) - mapping = json.load(open('./elasticsearch/alegre_similarity_base.json')) + mapping = json.load(open('./opensearch/alegre_similarity_base.json')) mapping["properties"]["content"]["analyzer"] = "rebuilt_"+lang es.indices.put_settings( body=SETTINGS_BY_LANGUAGE[lang], diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/opensearch.py similarity index 88% rename from app/main/lib/elasticsearch.py rename to app/main/lib/opensearch.py index 18846fe2..7afc3245 100644 --- a/app/main/lib/elasticsearch.py +++ b/app/main/lib/opensearch.py @@ -1,4 +1,4 @@ -# Elasticsearch helpers +# OpenSearch helpers import opensearchpy from opensearchpy import OpenSearch @@ -13,7 +13,7 @@ def get_all_documents_matching_context(context): matches, clause_count = generate_matches(context) - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) conditions = [{ 'nested': { 'score_mode': 'none', @@ -36,7 +36,7 @@ def get_all_documents_matching_context(context): docs = scan(es, size=10000, query=body, - index=app.config['ELASTICSEARCH_SIMILARITY'], + index=app.config['OPENSEARCH_SIMILARITY'], ) for hit in docs: yield hit @@ -75,7 +75,7 @@ def merge_contexts(body, found_doc): return body def update_or_create_document(body, doc_id, index): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) result = None if doc_id: try: @@ -107,7 +107,7 @@ def update_or_create_document(body, doc_id, index): return result def store_document(body, doc_id, language=None): - indices = [app.config['ELASTICSEARCH_SIMILARITY']] + indices = [app.config['OPENSEARCH_SIMILARITY']] # 'auto' indicates we should try to guess the appropriate language if language == 'auto': text = body['content'] @@ -118,7 +118,7 @@ def store_document(body, doc_id, language=None): if (language is not None) and (language in SUPPORTED_LANGUAGES): # also cache in the language-specific index - indices.append(app.config['ELASTICSEARCH_SIMILARITY']+"_"+language) + indices.append(app.config['OPENSEARCH_SIMILARITY']+"_"+language) results = [] for index in indices: @@ -138,25 +138,25 @@ def store_document(body, doc_id, language=None): def delete_context_from_found_doc(context, found_doc, doc_id): found_doc["contexts"] = [row for row in found_doc.get("contexts", []) if context != row] - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) result = es.update( id=doc_id, body={"doc": found_doc}, - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) return result def delete_document(doc_id, context, quiet): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) try: - found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id) + found_doc = es.get(index=app.config['OPENSEARCH_SIMILARITY'], id=doc_id) except opensearchpy.exceptions.NotFoundError: found_doc = None try: if found_doc and context in found_doc.get("contexts", []) and len(found_doc.get("contexts", [])) > 1: return delete_context_from_found_doc(context, found_doc, doc_id) else: - return es.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id) + return es.delete(index=app.config['OPENSEARCH_SIMILARITY'], id=doc_id) except: if quiet: return { diff --git a/app/main/lib/reindex_analyzers.py b/app/main/lib/reindex_analyzers.py index dc13859c..4e507b30 100644 --- a/app/main/lib/reindex_analyzers.py +++ b/app/main/lib/reindex_analyzers.py @@ -1,7 +1,7 @@ import json import opensearchpy from opensearchpy import OpenSearch -from app.main.lib.elasticsearch import get_all_documents_matching_context, update_or_create_document +from app.main.lib.opensearch import get_all_documents_matching_context, update_or_create_document from app.main.lib.error_log import ErrorLog from opensearchpy.helpers import scan @@ -10,11 +10,11 @@ from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES import cld3 def get_all_documents(): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) try: docs = scan(es, size=10000, - index=app.config['ELASTICSEARCH_SIMILARITY'], + index=app.config['OPENSEARCH_SIMILARITY'], ) for hit in docs: yield hit @@ -23,7 +23,7 @@ def get_all_documents(): return [] def get_docs_to_transform(team_id, language=None): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) docs_to_transform = {} for doc in get_all_documents_matching_context({"team_id": team_id}): if not language: @@ -44,12 +44,12 @@ def get_cached_docs_to_transform(team_id, language=None): return get_docs_to_transform(team_id, language) def store_updated_docs(docs_to_transform): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) for doc_id, language in docs_to_transform.items(): try: - already_done = es.get(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+language, id=doc_id) + already_done = es.get(index=app.config['OPENSEARCH_SIMILARITY']+"_"+language, id=doc_id) except opensearchpy.exceptions.NotFoundError: - found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id) + found_doc = es.get(index=app.config['OPENSEARCH_SIMILARITY'], id=doc_id) if found_doc: source = found_doc["_source"] keys_to_pop = [e for e in source.keys() if 'vector' in e or 'model_' in e] @@ -59,7 +59,7 @@ def store_updated_docs(docs_to_transform): finished = False while not finished and fail_count < 5: try: - update_or_create_document(source, doc_id, app.config['ELASTICSEARCH_SIMILARITY']+"_"+language) + update_or_create_document(source, doc_id, app.config['OPENSEARCH_SIMILARITY']+"_"+language) finished = True except opensearchpy.exceptions.ConnectionError: fail_count += 1 @@ -68,4 +68,4 @@ def run(team_id, language=None): if language is not None and language not in SUPPORTED_LANGUAGES: raise Exception(f"Unsupported language: {language} is not a supported language.") docs_to_transform = get_cached_docs_to_transform(team_id, language) - store_updated_docs(docs_to_transform) \ No newline at end of file + store_updated_docs(docs_to_transform) diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index 5daa52bf..00543974 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -1,13 +1,13 @@ from flask import current_app as app from opensearchpy import OpenSearch -from app.main.lib.elasticsearch import generate_matches, truncate_query, store_document, delete_document +from app.main.lib.opensearch import generate_matches, truncate_query, store_document, delete_document from app.main.lib.error_log import ErrorLog from app.main.lib.shared_models.shared_model import SharedModel from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES #from app.main.lib.langid import Cld3LangidProvider as LangidProvider from app.main.lib.langid import GoogleLangidProvider as LangidProvider from app.main.lib.openai import retrieve_openai_embeddings, PREFIX_OPENAI -ELASTICSEARCH_DEFAULT_LIMIT = 10000 +OPENSEARCH_DEFAULT_LIMIT = 10000 def delete_text(doc_id, context, quiet): return delete_document(doc_id, context, quiet) @@ -73,7 +73,7 @@ def get_body_from_conditions(conditions): body = conditions return body -def get_elasticsearch_base_conditions(search_params, clause_count, threshold): +def get_opensearch_base_conditions(search_params, clause_count, threshold): conditions = [ { 'match': { @@ -163,17 +163,17 @@ def search_text_by_model(search_params): model_key, threshold = get_model_and_threshold(search_params) app.logger.info( f"[Alegre Similarity] search_text_by_model:model_key {model_key}, threshold:{threshold}") - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) conditions = [] matches = [] clause_count = 0 - search_indices = [app.config['ELASTICSEARCH_SIMILARITY']] + search_indices = [app.config['OPENSEARCH_SIMILARITY']] if 'context' in search_params: matches, clause_count = generate_matches(search_params['context']) if clause_count >= app.config['MAX_CLAUSE_COUNT']: return {'error': "Too many clauses specified! Text search will fail if another clause is added. Current clause count: "+str(clause_count)} if model_key.lower() == 'elasticsearch': - conditions = get_elasticsearch_base_conditions(search_params, clause_count, threshold) + conditions = get_opensearch_base_conditions(search_params, clause_count, threshold) language = search_params.get("language") if language == 'None': language = None @@ -185,7 +185,7 @@ def search_text_by_model(search_params): app.logger.warning('Detected language in query text {} is not explicitly supported for indexing, defaulting to "none"'.format(language)) language = None if language in SUPPORTED_LANGUAGES: - search_indices.append(app.config['ELASTICSEARCH_SIMILARITY']+"_"+language) + search_indices.append(app.config['OPENSEARCH_SIMILARITY']+"_"+language) elif language: error_text = f"[Alegre Similarity] [Similarity type: text] Language parameter value of {language} for text similarity search asserted, but not in SUPPORTED_LANGUAGES" app.logger.info(error_text) @@ -214,7 +214,7 @@ def search_text_by_model(search_params): body = get_body_from_conditions(conditions) app.logger.info(f"Sending OpenSearch query: {body}") result = es.search( - size=limit or ELASTICSEARCH_DEFAULT_LIMIT, #NOTE a default limit is given in similarity.py + size=limit or OPENSEARCH_DEFAULT_LIMIT, #NOTE a default limit is given in similarity.py body=body, index=search_indices ) diff --git a/app/test/test_bulk_similarity.py b/app/test/test_bulk_similarity.py index 3c97c56c..f127d1b0 100644 --- a/app/test/test_bulk_similarity.py +++ b/app/test/test_bulk_similarity.py @@ -15,22 +15,22 @@ class TestBulkSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) def test_similarity_mapping(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) mapping = es.indices.get_mapping( - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) self.assertDictEqual( json.load(open('./elasticsearch/alegre_similarity.json')), - mapping[app.config['ELASTICSEARCH_SIMILARITY']]['mappings'] + mapping[app.config['OPENSEARCH_SIMILARITY']]['mappings'] ) def test_elasticsearch_insert_text_with_doc_id(self): diff --git a/app/test/test_bulk_update_similarity.py b/app/test/test_bulk_update_similarity.py index 968ea712..0e016285 100644 --- a/app/test/test_bulk_update_similarity.py +++ b/app/test/test_bulk_update_similarity.py @@ -20,12 +20,12 @@ class TestBulkUpdateSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) r = redis_client.get_client() r.delete(SharedModelStub.model_key) @@ -33,13 +33,13 @@ def setUp(self): r.srem('SharedModel', SharedModelStub.model_key) def test_similarity_mapping(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) mapping = es.indices.get_mapping( - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) self.assertDictEqual( json.load(open('./elasticsearch/alegre_similarity.json')), - mapping[app.config['ELASTICSEARCH_SIMILARITY']]['mappings'] + mapping[app.config['OPENSEARCH_SIMILARITY']]['mappings'] ) def test_elasticsearch_insert_text_with_doc_id(self): diff --git a/app/test/test_healthcheck.py b/app/test/test_healthcheck.py index d9d790cb..bf3f9a3e 100644 --- a/app/test/test_healthcheck.py +++ b/app/test/test_healthcheck.py @@ -45,7 +45,7 @@ def test_healthcheck_api_with_wrong_server(self): def test_healthcheck_api_elasticsearch_exception(self): with app.app_context(): - app.config['ELASTICSEARCH_URL']= '' + app.config['OPENSEARCH_URL']= '' response = self.client.get('/healthcheck/') self.assertEqual('application/json', response.content_type) self.assertEqual(500, response.status_code) @@ -59,7 +59,7 @@ def test_healthcheck_api_redis_error_connection(self): def test_healthcheck_api_with_bad_config(self): with app.app_context(): - app.config['ELASTICSEARCH_URL']= 'bad' + app.config['OPENSEARCH_URL']= 'bad' app.config['REDIS_HOST']= 'bad' app.config['SQLALCHEMY_DATABASE_URI']= 'bad' response = self.client.get('/healthcheck/') diff --git a/app/test/test_similarity.py b/app/test/test_similarity.py index f6683b80..24d76e6a 100644 --- a/app/test/test_similarity.py +++ b/app/test/test_similarity.py @@ -16,22 +16,22 @@ class TestSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) def test_similarity_mapping(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) mapping = es.indices.get_mapping( - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) self.assertDictEqual( json.load(open('./elasticsearch/alegre_similarity.json')), - mapping[app.config['ELASTICSEARCH_SIMILARITY']]['mappings'] + mapping[app.config['OPENSEARCH_SIMILARITY']]['mappings'] ) def test_elasticsearch_similarity_english(self): @@ -43,8 +43,8 @@ def test_elasticsearch_similarity_english(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.post( '/text/similarity/search/', @@ -157,8 +157,8 @@ def test_elasticsearch_similarity_english_models_specified(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.post( '/text/similarity/search/', @@ -285,14 +285,14 @@ def test_elasticsearch_update_text_listed_context(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': [54, 55] } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] term2 = { 'text': 'how to slice a pizza', 'model': 'elasticsearch', 'context': { 'dbid': [54, 55] }, 'doc_id': doc["_id"]} post_response2 = self.client.post('/text/similarity/', data=json.dumps(term2), content_type='application/json') - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if doc["_id"] == e["_id"]][0] self.assertEqual(term2['text'], doc['_source']['content']) @@ -300,8 +300,8 @@ def test_elasticsearch_performs_correct_fuzzy_search(self): with self.client: term = { 'text': 'what even is a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) lookup = { 'text': 'what even is a bananna', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') lookup["fuzzy"] = True @@ -315,14 +315,14 @@ def test_elasticsearch_update_text(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] term2 = { 'text': 'how to slice a pizza', 'model': 'elasticsearch', 'context': { 'dbid': 54 }, 'doc_id': doc["_id"]} post_response2 = self.client.post('/text/similarity/', data=json.dumps(term2), content_type='application/json') - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if doc["_id"] == e["_id"]][0] self.assertEqual(term2['text'], doc['_source']['content']) @@ -330,14 +330,14 @@ def test_elasticsearch_update_text_with_doc_id(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 }, 'doc_id': "123456" } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] term2 = { 'text': 'how to slice a pizza', 'model': 'elasticsearch', 'context': { 'dbid': 54 }, 'doc_id': "123456"} post_response2 = self.client.post('/text/similarity/', data=json.dumps(term2), content_type='application/json') - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if doc["_id"] == e["_id"]][0] self.assertEqual(term2['text'], doc['_source']['content']) @@ -364,12 +364,12 @@ def test_elasticsearch_delete_text(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) result = json.loads(post_response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] delete_response = self.client.delete( '/text/similarity/', @@ -383,12 +383,12 @@ def test_elasticsearch_delete_text(self): post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') term = { 'doc_id': '123', 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 55 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) result = json.loads(post_response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] delete_response = self.client.delete( '/text/similarity/', @@ -408,8 +408,8 @@ def test_elasticsearch_similarity_hindi(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.post( '/text/similarity/search/', data=json.dumps({ @@ -440,8 +440,8 @@ def test_model_similarity(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.post( '/text/similarity/search/', data=json.dumps({ @@ -514,8 +514,8 @@ def test_wrong_model_key(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.post( '/text/similarity/search/', @@ -559,8 +559,8 @@ def test_model_similarity_with_vector(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) model = SharedModel.get_client(TestSimilarityBlueprint.use_model_key) vector = model.get_shared_model_response('how to delete an invoice') @@ -588,8 +588,8 @@ def test_min_es_search(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.post( '/text/similarity/search/', @@ -610,4 +610,4 @@ def test_min_es_search(self): self.assertEqual(0, len(result['result'])) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/app/test/test_similarity_lang_analyzers.py b/app/test/test_similarity_lang_analyzers.py index b6817750..1b01ec03 100644 --- a/app/test/test_similarity_lang_analyzers.py +++ b/app/test/test_similarity_lang_analyzers.py @@ -17,12 +17,12 @@ class TestSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) # also make sure all the language specific indices have been dropped and recreated # (this is slow and runs before each test) @@ -36,8 +36,8 @@ def test_all_analyzers(self): response = self.client.post('/text/similarity/', data=json.dumps(example), content_type='application/json') result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']+"_"+example['language']) response = self.client.post( '/text/similarity/search/', data=json.dumps({ @@ -48,7 +48,7 @@ def test_all_analyzers(self): content_type='application/json' ) result = json.loads(response.data.decode()) - self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']]) + self.assertTrue(app.config['OPENSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']]) def test_auto_language_id(self): # language examples as input to language classifier @@ -67,11 +67,11 @@ def test_auto_language_id(self): response = self.client.post('/text/similarity/', data=json.dumps(example), content_type='application/json') result = json.loads(response.data.decode()) # we are feeding in 'auto' expected correct id back self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) if expected_lang is None: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) else: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang) response = self.client.post( '/text/similarity/search/', data=json.dumps({ @@ -83,9 +83,9 @@ def test_auto_language_id(self): ) result = json.loads(response.data.decode()) # indirectly checking classification by confirming which index was included in result - index_alias = app.config['ELASTICSEARCH_SIMILARITY'] + index_alias = app.config['OPENSEARCH_SIMILARITY'] if expected_lang is not None: - index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang + index_alias = app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang self.assertTrue(index_alias in [e['_index'] for e in result['result']]) def test_auto_language_query(self): @@ -105,11 +105,11 @@ def test_auto_language_query(self): response = self.client.post('/text/similarity/', data=json.dumps(example), content_type='application/json') result = json.loads(response.data.decode()) # we are feeding in 'auto' expected correct id back self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) if expected_lang is None: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) else: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang) response = self.client.post( '/text/similarity/search/', data=json.dumps({ @@ -121,11 +121,11 @@ def test_auto_language_query(self): ) result = json.loads(response.data.decode()) # indirectly checking classification by confirming which index was included in result - index_alias = app.config['ELASTICSEARCH_SIMILARITY'] + index_alias = app.config['OPENSEARCH_SIMILARITY'] if expected_lang is not None: - index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang + index_alias = app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang self.assertTrue(index_alias in [e['_index'] for e in result['result']]) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/app/test/test_text_similarity.py b/app/test/test_text_similarity.py index e6661117..c3ae3002 100644 --- a/app/test/test_text_similarity.py +++ b/app/test/test_text_similarity.py @@ -11,12 +11,12 @@ class TestTextSimilarity(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) def test_get_vector_model_base_conditions(self): @@ -57,4 +57,4 @@ def test_get_document_body(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/docker-compose.yml b/docker-compose.yml index 22083971..39c5cbee 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,27 +1,20 @@ version: '2' volumes: - elasticsearch: + opensearch: redis: postgres: services: - elasticsearch: - build: ./elasticsearch + opensearch: + build: ./opensearch ports: - "9200:9200" environment: discovery.type: single-node transport.host: 127.0.0.1 - xpack.security.enabled: "false" + plugins.security.disabled: "true" volumes: - - "elasticsearch:/usr/share/elasticsearch/data" - kibana: - image: docker.elastic.co/kibana/kibana:7.9.2 - ports: - - "5601:5601" - depends_on: - - elasticsearch - environment: - ELASTICSEARCH_URL: http://elasticsearch:9200 + - "./opensearch.yml:/usr/share/opensearch/config/opensearch.yml" + - "opensearch:/usr/share/opensearch/data" redis: image: redis:6.2 ports: @@ -141,10 +134,10 @@ services: - ".:/app" depends_on: - postgres - - kibana - redis + # - kibana # - video # - xlm_r_bert_base_nli_stsb_mean_tokens # - indian_sbert env_file: - - .env_file \ No newline at end of file + - .env_file diff --git a/elasticsearch/Dockerfile b/elasticsearch/Dockerfile deleted file mode 100644 index 7497f36c..00000000 --- a/elasticsearch/Dockerfile +++ /dev/null @@ -1,3 +0,0 @@ -FROM docker.elastic.co/elasticsearch/elasticsearch:7.9.2 -RUN echo y | bin/elasticsearch-plugin install analysis-icu -RUN echo y | bin/elasticsearch-plugin install repository-s3 diff --git a/elasticsearch/alegre_similarity.json b/elasticsearch/alegre_similarity.json deleted file mode 100644 index 96b80f79..00000000 --- a/elasticsearch/alegre_similarity.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "properties": { - "vector_768": { - "type": "dense_vector", - "dims": 768 - }, - "vector_xlm-r-bert-base-nli-stsb-mean-tokens": { - "type": "dense_vector", - "dims": 768 - }, - "vector_paraphrase-filipino-mpnet-base-v2": { - "type": "dense_vector", - "dims": 768 - }, - "vector_indian-sbert": { - "type": "dense_vector", - "dims": 768 - }, - "vector_paraphrase-multilingual-mpnet-base-v2": { - "type": "dense_vector", - "dims": 768 - }, - "vector_openai-text-embedding-ada-002": { - "type": "dense_vector", - "dims": 1536 - }, - "content": { - "type": "text" - }, - "context": { - "type": "nested" - }, - "vector": { - "type": "double" - }, - "model": { - "type": "keyword" - } - } -} diff --git a/elasticsearch/alegre_similarity_settings.json b/elasticsearch/alegre_similarity_settings.json deleted file mode 100644 index 4608723b..00000000 --- a/elasticsearch/alegre_similarity_settings.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "similarity": { - "scripted_tfidf": { - "type": "scripted", - "script": { - "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;" - } - }, - "lm_jelinek_mercer" : { - "type" : "LMJelinekMercer" - } - } -} diff --git a/manage.py b/manage.py index 61ece43f..16577b39 100644 --- a/manage.py +++ b/manage.py @@ -10,6 +10,8 @@ from sqlalchemy.schema import DDL from sqlalchemy_utils import database_exists, create_database import json_logging +import logging +import sys import redis from rq import Connection, Worker @@ -24,6 +26,9 @@ # (by upgrading to tensorflow 2.2 or higher) import tensorflow as tf +alegre_index_name = os.getenv('ALEGRE_INDEX', 'alegre_similarity') +alegre_init_index_name = os.getenv('ALEGRE_INIT_INDEX', 'alegre_similarity') + config_name = os.getenv('BOILERPLATE_ENV', 'dev') app = create_app(config_name) app.register_blueprint(blueprint) @@ -55,6 +60,11 @@ def test_simple_perl_function(): @manager.command def init_simple_perl_function(): with app.app_context(): + json_logging.init_non_web(enable_json=True) + logger = logging.getLogger("init") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.info("Starting init_simple_perl_function ...") sqlalchemy.event.listen( db.metadata, 'before_create', @@ -115,6 +125,11 @@ def init_simple_perl_function(): @manager.command def init_perl_functions(): with app.app_context(): + json_logging.init_non_web(enable_json=True) + logger = logging.getLogger("init") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.info("Starting init_perl_functions ...") sqlalchemy.event.listen( db.metadata, 'before_create', @@ -269,25 +284,32 @@ def run_video_matcher(): @manager.command def init(): """Initializes the service.""" + json_logging.init_non_web(enable_json=True) + logger = logging.getLogger("init") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.info("Starting init ...") # Create ES indexes. - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + logger.info("Creating indices with init index name: " + alegre_init_index_name) + es = OpenSearch(app.config['OPENSEARCH_URL']) try: if config_name == 'test': - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.delete(index=alegre_init_index_name, ignore=[400, 404]) + es.indices.create(alegre_init_index_name) except TransportError as e: # ignore already existing index if e.error == 'resource_already_exists_exception': pass else: raise - es.indices.put_mapping( - body=json.load(open('./elasticsearch/alegre_similarity.json')), - # include_type_name=True, - index=app.config['ELASTICSEARCH_SIMILARITY'] - ) + # For now, omit mapping updates. + #es.indices.put_mapping( + # body=json.load(open('./opensearch/alegre_similarity.json')), + # index=alegre_init_index_name + #) init_indices() # Create database. + logger.info("Creating database ...") with app.app_context(): if not database_exists(db.engine.url): create_database(db.engine.url) diff --git a/opensearch.yml b/opensearch.yml new file mode 100644 index 00000000..17f5fbc2 --- /dev/null +++ b/opensearch.yml @@ -0,0 +1,39 @@ +network.host: 0.0.0.0 +plugins.security.disabled: true + +path: + logs: /usr/share/opensearch/logs + data: /usr/share/opensearch/data + repo: /usr/share/opensearch/snapshots +cluster: + name: ${HOSTNAME}-cluster +node: + name: ${HOSTNAME} +http: + cors: + enabled: true + allow-origin: '*' + +######## Start OpenSearch Security Demo Configuration ######## +# WARNING: revise all the lines below before you go into production +plugins.security.ssl.transport.pemcert_filepath: esnode.pem +plugins.security.ssl.transport.pemkey_filepath: esnode-key.pem +plugins.security.ssl.transport.pemtrustedcas_filepath: root-ca.pem +plugins.security.ssl.transport.enforce_hostname_verification: false +plugins.security.ssl.http.enabled: true +plugins.security.ssl.http.pemcert_filepath: esnode.pem +plugins.security.ssl.http.pemkey_filepath: esnode-key.pem +plugins.security.ssl.http.pemtrustedcas_filepath: root-ca.pem +plugins.security.allow_unsafe_democertificates: true +plugins.security.allow_default_init_securityindex: true +plugins.security.authcz.admin_dn: + - CN=kirk,OU=client,O=client,L=test, C=de + +plugins.security.audit.type: internal_opensearch +plugins.security.enable_snapshot_restore_privilege: true +plugins.security.check_snapshot_restore_write_privileges: true +plugins.security.restapi.roles_enabled: ["all_access", "security_rest_api_access"] +plugins.security.system_indices.enabled: true +plugins.security.system_indices.indices: [".plugins-ml-config", ".plugins-ml-connector", ".plugins-ml-model-group", ".plugins-ml-model", ".plugins-ml-task", ".plugins-ml-conversation-meta", ".plugins-ml-conversation-interactions", ".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opensearch-notifications-*", ".opensearch-notebooks", ".opensearch-observability", ".ql-datasources", ".opendistro-asynchronous-search-response*", ".replication-metadata-store", ".opensearch-knn-models", ".geospatial-ip2geo-data*"] +node.max_local_storage_nodes: 3 +######## End OpenSearch Security Demo Configuration ######## diff --git a/opensearch/alegre_similarity.json b/opensearch/alegre_similarity.json index 1fb35c73..49296ab8 100644 --- a/opensearch/alegre_similarity.json +++ b/opensearch/alegre_similarity.json @@ -1,6 +1,18 @@ { "mappings": { "properties": { + "content": { + "type": "text" + }, + "context": { + "type": "nested" + }, + "vector": { + "type": "double" + }, + "model": { + "type": "keyword" + }, "vector_768": { "type": "knn_vector", "dimension": 768 @@ -10,9 +22,9 @@ "dimension": 768 }, "vector_openai-text-embedding-ada-002": { - "type": "dense_vector", - "dims": 1536 - }, + "type": "knn_vector", + "dimension": 1536 + }, "vector_paraphrase-filipino-mpnet-base-v2": { "type": "knn_vector", "dimension": 768 @@ -20,18 +32,6 @@ "vector_indian-sbert": { "type": "knn_vector", "dimension": 768 - }, - "content": { - "type": "text" - }, - "context": { - "type": "nested" - }, - "vector": { - "type": "double" - }, - "model": { - "type": "keyword" } } } diff --git a/elasticsearch/alegre_similarity_base.json b/opensearch/alegre_similarity_base.json similarity index 100% rename from elasticsearch/alegre_similarity_base.json rename to opensearch/alegre_similarity_base.json diff --git a/production/bin/migrate_entrypoint.sh b/production/bin/migrate_entrypoint.sh new file mode 100755 index 00000000..9205186b --- /dev/null +++ b/production/bin/migrate_entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Begin container entrypoint..." + +# Redirect filehandles +ln -sf /proc/$$/fd/1 /var/log/entrypoint-stdout.log +ln -sf /proc/$$/fd/2 /var/log/entrypoint-stderr.log + +echo "Executing into target..." + +# exec into target process +>/log/stdout.log 2>/log/stderr.log exec /opt/bin/run_migrations.sh diff --git a/production/bin/run_migrations.sh b/production/bin/run_migrations.sh new file mode 100755 index 00000000..ca971eeb --- /dev/null +++ b/production/bin/run_migrations.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +echo "Starting migrations..." +cd /app + +echo "Calling init_perl_functions ..." +python manage.py init_perl_functions + +echo "Initializing db stamp head ..." +python manage.py db stamp head + +echo "Initializing db upgrade ..." +python manage.py db upgrade + +echo "Initializing search ..." +python manage.py init + +echo "Migrations complete."