diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..b9bd98b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,4 @@
+# Elastic Beanstalk Files
+.elasticbeanstalk/*
+.git
+.gitignore
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 34708e6..d16a5a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -135,3 +135,9 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Elastic Beanstalk Files
+.elasticbeanstalk/*
+!.elasticbeanstalk/*.cfg.yml
+!.elasticbeanstalk/*.global.yml
+.idea/
diff --git a/Dockerfile b/Dockerfile
index a8ad302..2fdbe43 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,20 +1,39 @@
-FROM ubuntu:20.04
+FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-inference:1.5.1-gpu-py36-cu101-ubuntu16.04
+
+RUN conda install -y -c anaconda tensorflow
+RUN python -m pip install transformers==3.5.1
+RUN conda install -y -c conda-forge uvicorn aiofiles fastapi elasticsearch==7.13.1
+RUN conda install -y -c conda-forge flask spacy plac==0.9.6
+RUN python -m pip install numpy==1.19.2 scipy==1.4.1 Keras-Preprocessing==1.1.1
+RUN conda install -y -c conda-forge boto3 requests pandas scikit-learn
+RUN python -m pip install gremlinpython requests_aws4auth
+RUN python -m pip install "uvicorn[standard]" websockets
+RUN python -m pip install "thinc[tensorflow,torch]" --pre
+RUN conda install -y -c conda-forge cudatoolkit
+RUN python -m pip install tensorflow==2.3.0
+#RUN apt-get install -y nvidia-headless-495 nvidia-modprobe
+# Refresh the index and install in the same layer so a cached, stale index is never reused.
+RUN apt-get update -y --allow-unauthenticated && \
+    apt-get install -y --allow-unauthenticated nvidia-headless-495 nvidia-modprobe && \
+    rm -rf /var/lib/apt/lists/*
+
+
+COPY Dockerfile LICENSE README.md docker-compose.yml notes.md requirements.txt setup.cfg setup.py /root/neuralqa/
+#COPY config.yaml /root/neuralqa/
+COPY docs/ /root/neuralqa/docs
+COPY neuralqa/ /root/neuralqa/neuralqa
+COPY tests/ /root/neuralqa/tests
+WORKDIR /root/neuralqa
+RUN ls && python setup.py install
+
+COPY neuralqa/config_default.yaml /root/config_default.yaml
+ENV NEURALQA_CONFIG_PATH=/root/config_default.yaml
+ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat:/usr/local/cuda/lib:/usr/local/cuda/lib64
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV CUDA_VISIBLE_DEVICES=0,1
 
-COPY . .
-RUN apt-get update && \
-    apt-get -y upgrade && \
-    apt-get -y install python3 && \
-    apt-get -y install python3-pip && \
-    pip3 install neuralqa && \
-    apt-get -y install wget && \
-    wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb && \
-    wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb.sha512 && \
-    shasum -a 512 -c elasticsearch-7.8.0-amd64.deb.sha512 && \
-    dpkg -i elasticsearch-7.8.0-amd64.deb && \
-    service elasticsearch start && \
-    sleep 30 && \
-
 
 EXPOSE 80
 
-CMD ["neuralqa", "--host", "0.0.0.0", "--port", "80"]
\ No newline at end of file
+CMD ["neuralqa", "ui", "--host", "0.0.0.0", "--port", "80"]
\ No newline at end of file
diff --git a/Dockerfile.old b/Dockerfile.old
new file mode 100644
index 0000000..5e7fe6a
--- /dev/null
+++ b/Dockerfile.old
@@ -0,0 +1,37 @@
+FROM continuumio/miniconda3
+
+RUN conda install -c anaconda python=3.6
+RUN conda install pip
+RUN conda install pytorch==1.5.1 torchvision==0.6.1 cpuonly -c pytorch &&\
+    conda install -c anaconda tensorflow &&\
+    python -m pip install transformers==3.5.1 &&\
+    conda install -c conda-forge uvicorn aiofiles fastapi elasticsearch==7.13.1 pyyaml spacy &&\
+    python -m pip install numpy==1.18.5 scipy==1.4.1 Keras-Preprocessing==1.1.1
+RUN conda install -c conda-forge boto3 pandas requests scikit-learn scipy flask &&\
+    python -m pip install gremlinpython requests_aws4auth
+
+RUN python -m pip install uvicorn[standard] websockets
+# RUN python -m pip install websockets
+
+ADD Dockerfile /root/neuralqa/
+ADD LICENSE /root/neuralqa/
+ADD README.md /root/neuralqa/
+#ADD config.yaml /root/neuralqa/
+ADD docker-compose.yml /root/neuralqa/
+ADD docs/ /root/neuralqa/docs
+ADD neuralqa/ /root/neuralqa/neuralqa
+ADD notes.md /root/neuralqa/
+ADD Dockerfile /root/neuralqa/
+ADD requirements.txt /root/neuralqa/
+ADD setup.cfg /root/neuralqa/
+ADD setup.py /root/neuralqa/
+ADD tests/ /root/neuralqa/tests
+WORKDIR /root/neuralqa
+RUN ls && python setup.py install
+
+COPY neuralqa/config_default.yaml /root/config_default.yaml
+ENV NEURALQA_CONFIG_PATH=/root/config_default.yaml
+
+EXPOSE 80
+
+CMD ["neuralqa", "ui", "--host", "0.0.0.0", "--port", "80"]
diff --git a/docker-compose.yml b/docker-compose.yml
index f6aaa68..ae79586 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,6 +1,8 @@
 version: "3"
 services:
-  neuralqa_docker:
+  neuralqa:
     build: .
+    ports:
+      - "80:80"
     expose:
-      - 80
+      - 80
\ No newline at end of file
diff --git a/neuralqa/config_default.yaml b/neuralqa/config_default.yaml
index 012abd1..369207a 100644
--- a/neuralqa/config_default.yaml
+++ b/neuralqa/config_default.yaml
@@ -61,34 +61,20 @@ retriever:
     - name: None
       value: "none"
       type: "none"
-
-    # - name: Case Law
-    #   value: cases
-    #   type: elasticsearch
-    #   connection:
-    #     host: localhost
-    #     port: 9200
-    #     username: ""
-    #     password: ""
-    #     body_field: "casebody.data.opinions.text"
-    # - name: Medical
-    #   value: medical
-    #   host: localhost
-    #   port: 9200
-    #   username: None
-    #   password: None
-    #   type: elasticsearch
-    #   fields:
-    #     body_field: context
-    # - name: Supreme Court
-    #   value: supremecourt
-    #   host: localhost
-    #   port: 9200
-    #   username: None
-    #   password: None
-    #   type: elasticsearch
-    #   fields:
-    #     body_field: casebody
+    - name: Abstracts
+      value: pubmed_abstracts
+      type: elasticsearch
+      connection:
+        host: vpc-neptune-es-opxf6xkhk6ra7sfhybnkvxydtu.us-east-2.es.amazonaws.com
+        port: 443
+        body_field: "abstract"
+    - name: Orpheus
+      value: orpheus
+      type: elasticsearch
+      connection:
+        host: vpc-neptune-es-opxf6xkhk6ra7sfhybnkvxydtu.us-east-2.es.amazonaws.com
+        port: 443
+        body_field: "text"
   readtopn: 0
 
 relsnip:
@@ -106,13 +92,10 @@ server: # webserver host and port defaults
 
 reader:
   title: Reader
-  selected: twmkn9/distilbert-base-uncased-squad2
+  selected: ktrapeznikov/biobert_v1.1_pubmed_squad_v2
   options:
-    - name: DistilBERT SQUAD2
-      value: twmkn9/distilbert-base-uncased-squad2
-      type: distilbert
-    - name: BERT SQUAD2
-      value: deepset/bert-base-cased-squad2
+    - name: BioBERT Pubmed SQUAD2
+      value: ktrapeznikov/biobert_v1.1_pubmed_squad_v2
       type: bert
     # - name: Medical BERT SQUAD2
     #   value: /Users/victordibia/Downloads/meddistilbert
diff --git a/neuralqa/retriever/__init__.py b/neuralqa/retriever/__init__.py
index fa07e6f..24f4a54 100644
--- a/neuralqa/retriever/__init__.py
+++ b/neuralqa/retriever/__init__.py
@@ -1,4 +1,5 @@
 from .retriever import *
 from .elasticsearchretriever import *
+from .awselasticsearchretriever import *
 from .solrretriever import *
 from .retrieverpool import *
diff --git a/neuralqa/retriever/awselasticsearchretriever.py b/neuralqa/retriever/awselasticsearchretriever.py
new file mode 100644
index 0000000..75bfd60
--- /dev/null
+++ b/neuralqa/retriever/awselasticsearchretriever.py
@@ -0,0 +1,139 @@
+import boto3
+from elasticsearch.exceptions import AuthorizationException
+from requests_aws4auth import AWS4Auth
+import copy
+from neuralqa.retriever import Retriever, ElasticSearchRetriever
+from neuralqa.utils import parse_field_content
+from elasticsearch import Elasticsearch, ConnectionError, NotFoundError, RequestsHttpConnection
+import logging
+
+import traceback
+
+from neuralqa.utils.decorators import retry_on_exception
+
+logger = logging.getLogger(__name__)
+region = 'us-east-2'
+service = 'es'
+
+
+class AWSElasticSearchRetriever(ElasticSearchRetriever):
+    """Elasticsearch retriever that signs requests with AWS SigV4 credentials from boto3."""
+
+    def __init__(self, host, index_type="elasticsearch", port=443, **kwargs):
+        """Store connection settings, apply optional overrides and connect.
+
+        ``kwargs`` may only name attributes that already exist on the instance
+        at that point (``body_field``, ``search_fields``, ``return_fields``,
+        ``remove_body_field``, ...).
+
+        Raises:
+            ValueError: if ``kwargs`` contains any other key.
+        """
+        Retriever.__init__(self, index_type)
+
+        self.body_field = ""
+        self.search_fields = []
+        self.return_fields = []
+        self.remove_body_field = True
+        self.host = host
+        self.port = port
+        allowed_keys = list(self.__dict__.keys())
+
+        # Reject unknown settings before any credential lookup or network call.
+        rejected_keys = set(kwargs.keys()) - set(allowed_keys)
+        if rejected_keys:
+            raise ValueError(
+                "Invalid arguments in AWSElasticSearchRetriever constructor:{}".format(rejected_keys))
+
+        self.__dict__.update(kwargs)
+        # assert self.body_field in self.return_fields
+        # assert any(self.body_field in f for f in self.search_fields)
+        self.construct_es_instance()
+
+    def construct_es_instance(self):
+        """(Re)build the signed client and record in ``isAvailable`` whether the cluster answers a ping.
+
+        Raises:
+            RuntimeError: if boto3 cannot find any AWS credentials.
+        """
+        credentials = boto3.Session().get_credentials()
+        if credentials is None:
+            raise RuntimeError("No AWS credentials found to sign Elasticsearch requests")
+        # Snapshot once so key, secret and token all come from the same refresh.
+        frozen = credentials.get_frozen_credentials()
+        awsauth = AWS4Auth(frozen.access_key, frozen.secret_key, region, service,
+                           session_token=frozen.token)
+        self.es = Elasticsearch(
+            hosts=[{"host": self.host, "port": self.port}],
+            http_auth=awsauth,
+            use_ssl=True,
+            verify_certs=True,
+            connection_class=RequestsHttpConnection,
+        )
+        self.isAvailable = self.es.ping()
+
+    @retry_on_exception(exception=AuthorizationException)
+    def run_query(self, index_name, search_query, max_documents=5, fragment_size=100, relsnip=True, num_fragments=5,
+                  highlight_tags=True):
+        """Run a ``multi_match`` search against ``index_name``.
+
+        Returns a dict that always has ``status``; on success it also has
+        ``took``, ``highlights``, ``docs`` and ``source``, on failure ``errormsg``.
+        An ``AuthorizationException`` rebuilds the client and is re-raised so
+        that the ``retry_on_exception`` decorator runs the query once more.
+        """
+        tags = {"pre_tags": [""], "post_tags": [""]} if not highlight_tags else {}
+        highlight_params = {
+            "fragment_size": fragment_size,
+            "fields": {
+                self.body_field: tags
+            },
+            "number_of_fragments": num_fragments
+        }
+
+        query_body = {
+            "_source": self.return_fields,
+            "query": {
+                "multi_match": {
+                    "query": search_query,
+                    "fields": self.search_fields
+                }
+            },
+            "size": max_documents
+        }
+
+        status = True
+        results = {}
+
+        if relsnip:
+            # query_body["_source"] = {"includes": [""]}
+            query_body["highlight"] = highlight_params
+        # else:
+        #     query_body["_source"] = {"includes": [self.body_field]}
+
+        try:
+            query_result = self.es.search(index=index_name, body=query_body)
+
+            # RelSnip: for each document, we concatenate all
+            # fragments in each document and return as the document.
+            highlights = [" ".join(hit["highlight"][self.body_field])
+                          for hit in query_result["hits"]["hits"] if "highlight" in hit]
+            docs = [parse_field_content(self.body_field, hit["_source"])
+                    for hit in query_result["hits"]["hits"] if "_source" in hit]
+            source = copy.deepcopy(query_result)
+            if self.remove_body_field:
+                for hit in source["hits"]["hits"]:
+                    if "_source" in hit:
+                        # pop() instead of del: a hit may not carry the body field
+                        hit['_source'].pop(self.body_field, None)
+            took = query_result["took"]
+            results = {"took": took, "highlights": highlights, "docs": docs, "source": source}
+        except AuthorizationException:
+            self.construct_es_instance()
+            raise
+        except Exception as e:
+            status = False
+            results["errormsg"] = str(e)
+
+        results["status"] = status
+        return results
diff --git a/neuralqa/retriever/elasticsearchretriever.py b/neuralqa/retriever/elasticsearchretriever.py
index 35fa95f..bf5c28c 100644
--- a/neuralqa/retriever/elasticsearchretriever.py
+++ b/neuralqa/retriever/elasticsearchretriever.py
@@ -25,7 +25,7 @@ def __init__(self, index_type="elasticsearch", host="localhost", port=9200, user
         # [{'host': self.host, 'port': self.port,
         #   "username": self.username, "password": self.password}])
         self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}],
-                                http_auth=(self.username, self.password))
+                                http_auth=(self.username, self.password), scheme='https')
         self.isAvailable = self.es.ping()
 
         rejected_keys = set(kwargs.keys()) - set(allowed_keys)
diff --git a/neuralqa/retriever/retrieverpool.py b/neuralqa/retriever/retrieverpool.py
index 06ba9a3..8985e8f 100644
--- a/neuralqa/retriever/retrieverpool.py
+++ b/neuralqa/retriever/retrieverpool.py
@@ -1,5 +1,5 @@
 
-from neuralqa.retriever import ElasticSearchRetriever
+from neuralqa.retriever import AWSElasticSearchRetriever
 import logging
 
 logger = logging.getLogger(__name__)
@@ -15,7 +15,7 @@ def __init__(self, retrievers):
                     "Duplicate retriever value : {} ".format(retriever["value"]))
 
             if (retriever["type"] == "elasticsearch"):
-                self.retriever_pool[retriever["value"]] = ElasticSearchRetriever(
+                self.retriever_pool[retriever["value"]] = AWSElasticSearchRetriever(
                     **retriever["connection"])
             if (retriever["type"] == "solr"):
                 logger.info("We do not yet support Solr retrievers")
diff --git a/neuralqa/server/routehandlers.py b/neuralqa/server/routehandlers.py
index 755716d..9b21960 100644
--- a/neuralqa/server/routehandlers.py
+++ b/neuralqa/server/routehandlers.py
@@ -51,8 +51,7 @@ async def get_answers(params: Answer):
 
             else:
                 # add query expansion terms to query if any
-                retriever_query = params.query + \
-                    " ".join(params.expansionterms)
+                retriever_query = " ".join([params.query, *(params.expansionterms or [])])
                 num_fragments = 5
                 query_results = self.retriever_pool.retriever.run_query(params.retriever, retriever_query,
                                                                         max_documents=params.max_documents, fragment_size=params.fragment_size,
diff --git a/neuralqa/server/routemodels.py b/neuralqa/server/routemodels.py
index 8ead518..73f178f 100644
--- a/neuralqa/server/routemodels.py
+++ b/neuralqa/server/routemodels.py
@@ -23,7 +23,7 @@ class Answer(BaseModel):
     reader: str = None
     relsnip: bool = True
     expander: Optional[str] = None
-    expansionterms: Optional[list] = None
+    expansionterms: Optional[list] = []
     retriever: Optional[str] = "manual"
 
 
diff --git a/neuralqa/server/serve.py b/neuralqa/server/serve.py
index 1d57ac3..1627bff 100644
--- a/neuralqa/server/serve.py
+++ b/neuralqa/server/serve.py
@@ -2,7 +2,7 @@
 
 from neuralqa.reader import BERTReader, ReaderPool
 from neuralqa.server.routehandlers import Handler
-from neuralqa.retriever import ElasticSearchRetriever, RetrieverPool
+from neuralqa.retriever import AWSElasticSearchRetriever, RetrieverPool
 from neuralqa.utils import ConfigParser
 from neuralqa.expander import ExpanderPool
 
diff --git a/neuralqa/utils/decorators.py b/neuralqa/utils/decorators.py
new file mode 100644
index 0000000..f9ba522
--- /dev/null
+++ b/neuralqa/utils/decorators.py
@@ -0,0 +1,18 @@
+import functools
+
+
+def retry_on_exception(exception):
+    """Return a decorator that retries the wrapped call once when it raises ``exception``.
+
+    An exception raised by the second attempt propagates to the caller.
+    """
+    def actual_decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except exception:
+                return func(*args, **kwargs)
+        return wrapper
+
+    return actual_decorator
diff --git a/requirements.txt b/requirements.txt
index 33448b4..3d9827b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,8 @@ transformers==3.0.2
 uvicorn
 aiofiles
 fastapi
-elasticsearch==7.7.1
-pyyaml==3.13
-spacy
\ No newline at end of file
+elasticsearch<=7.13.4
+pyyaml>=3.13
+spacy
+requests-aws4auth
+boto3
diff --git a/setup.py b/setup.py
index 4712fe4..e53c350 100644
--- a/setup.py
+++ b/setup.py
@@ -36,6 +36,9 @@ def package_files(directory):
         'aiofiles',
         'uvicorn',
         'numpy',
+        'plac==0.9.6',
+        'boto3',
+        'requests-aws4auth',
         'tensorflow>=2.1.0',
         'torch',
         'torchvision',
diff --git a/tests/retriever/test_retriever.py b/tests/retriever/test_retriever.py
index 5dc9f15..b854031 100644
--- a/tests/retriever/test_retriever.py
+++ b/tests/retriever/test_retriever.py
@@ -1,11 +1,11 @@
-from neuralqa.retriever import ElasticSearchRetriever
+from neuralqa.retriever import AWSElasticSearchRetriever
 from neuralqa.utils import ConfigParser
 
 
 def test_elasticserch_retriever():
     app_config = ConfigParser("config.yaml")
     rkwargs = app_config.config["retriever"]["options"][1]["connection"]
-    retriever = ElasticSearchRetriever(**rkwargs)
+    retriever = AWSElasticSearchRetriever(**rkwargs)
     results = retriever.run_query(
         "cases", "what is the punishment for arson crime")
     assert results != None