victordibia · Bogdanfl935 · Feb 22, 2021 · Apr 22, 2021 · Apr 23, 2021 · Apr 23, 2021
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,4 @@
+# Elastic Beanstalk Files
+.elasticbeanstalk/*
+.git
+.gitignore
diff --git a/.gitignore b/.gitignore
@@ -135,3 +135,9 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Elastic Beanstalk Files
+.elasticbeanstalk/*
+!.elasticbeanstalk/*.cfg.yml
+!.elasticbeanstalk/*.global.yml
+.idea/
diff --git a/Dockerfile b/Dockerfile
@@ -1,20 +1,44 @@
-FROM ubuntu:20.04
+FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-inference:1.5.1-gpu-py36-cu101-ubuntu16.04
+
+RUN conda install -c anaconda tensorflow
+RUN python -m pip install transformers==3.5.1
+RUN conda install -c conda-forge uvicorn aiofiles fastapi elasticsearch==7.13.1
+RUN conda install -c conda-forge flask spacy plac==0.9.6
+RUN python -m pip install numpy==1.19.2 scipy==1.4.1 Keras-Preprocessing==1.1.1
+RUN conda install -c conda-forge boto3 requests pandas scikit-learn
+RUN python -m pip install gremlinpython requests_aws4auth
+RUN python -m pip install uvicorn[standard] websockets
+RUN python -m pip install thinc[tensorflow,torch] --pre
+RUN conda install -c conda-forge cudatoolkit
+RUN python -m pip install tensorflow==2.3.0
+
+# Refresh the package index and install in ONE layer so the install never runs
+# against a stale cached index; drop the index afterwards to keep the layer small.
+# NOTE(review): --allow-unauthenticated skips APT signature checks - confirm it is still needed.
+RUN apt-get update -y --allow-unauthenticated && \
+    apt-get install -y --allow-unauthenticated nvidia-headless-495 nvidia-modprobe && \
+    rm -rf /var/lib/apt/lists/*
+
+# Project sources. COPY is used instead of ADD: these are plain local files, so
+# ADD's archive-extraction / URL-fetch behaviour is not wanted.
+COPY Dockerfile LICENSE README.md docker-compose.yml notes.md \
+     requirements.txt setup.cfg setup.py /root/neuralqa/
+#COPY config.yaml /root/neuralqa/
+COPY docs/ /root/neuralqa/docs
+COPY neuralqa/ /root/neuralqa/neuralqa
+COPY tests/ /root/neuralqa/tests
+WORKDIR /root/neuralqa
+RUN ls && python setup.py install
+
+# Default configuration shipped with the image; NEURALQA_CONFIG_PATH points at it.
+COPY neuralqa/config_default.yaml /root/config_default.yaml
+ENV NEURALQA_CONFIG_PATH=/root/config_default.yaml
+ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat:/usr/local/cuda/lib:/usr/local/cuda/lib64
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    CUDA_VISIBLE_DEVICES=0,1
 
-COPY . . 
 
-RUN apt-get update && \
-    apt-get -y upgrade && \
-    apt-get -y install python3 && \
-    apt-get -y install python3-pip && \
-    pip3 install neuralqa && \
-    apt-get -y install wget && \
-    wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb && \
-    wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb.sha512 && \
-    shasum -a 512 -c elasticsearch-7.8.0-amd64.deb.sha512 && \
-    dpkg -i elasticsearch-7.8.0-amd64.deb && \
-    service elasticsearch start && \
-    sleep 30 && \
-
 EXPOSE 80
 
-CMD ["neuralqa", "--host", "0.0.0.0", "--port", "80"]
+CMD ["neuralqa", "ui", "--host", "0.0.0.0", "--port", "80"]
diff --git a/Dockerfile.old b/Dockerfile.old
@@ -0,0 +1,34 @@
+# CPU-only image: PyTorch is installed together with the "cpuonly" package.
+# NOTE(review): the base image is untagged, so builds track whatever "latest" is - consider pinning.
+FROM continuumio/miniconda3
+
+RUN conda install -c anaconda python=3.6
+RUN conda install pip
+RUN conda install pytorch==1.5.1 torchvision==0.6.1 cpuonly -c pytorch &&\
+    conda install -c anaconda tensorflow &&\
+    python -m pip install transformers==3.5.1 &&\
+    conda install -c conda-forge uvicorn aiofiles fastapi elasticsearch==7.13.1 pyyaml spacy &&\
+    python -m pip install numpy==1.18.5 scipy==1.4.1 Keras-Preprocessing==1.1.1
+RUN conda install -c conda-forge boto3 pandas requests scikit-learn scipy flask &&\
+    python -m pip install gremlinpython requests_aws4auth
+
+RUN python -m pip install uvicorn[standard] websockets
+# RUN python -m pip install websockets
+
+# Project sources. COPY is used instead of ADD: these are plain local files, so
+# ADD's archive-extraction / URL-fetch behaviour is not wanted.
+COPY Dockerfile LICENSE README.md docker-compose.yml notes.md \
+     requirements.txt setup.cfg setup.py /root/neuralqa/
+#COPY config.yaml /root/neuralqa/
+COPY docs/ /root/neuralqa/docs
+COPY neuralqa/ /root/neuralqa/neuralqa
+COPY tests/ /root/neuralqa/tests
+WORKDIR /root/neuralqa
+RUN ls && python setup.py install
+
+COPY neuralqa/config_default.yaml /root/config_default.yaml
+ENV NEURALQA_CONFIG_PATH=/root/config_default.yaml
+
+EXPOSE 80
+
+CMD ["neuralqa", "ui", "--host", "0.0.0.0", "--port", "80"]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,6 +1,8 @@
 version: "3"
 services:
-  neuralqa_docker:
+  neuralqa:
     build: .
+    ports:
+      - "80:80"
     expose:
-      - 80
+      - 80
diff --git a/neuralqa/config_default.yaml b/neuralqa/config_default.yaml
@@ -61,34 +61,20 @@ retriever:
     - name: None
       value: "none"
       type: "none"
-
-    # - name: Case Law
-    #   value: cases
-    #   type: elasticsearch
-    #   connection:
-    #     host: localhost
-    #     port: 9200
-    #     username: ""
-    #     password: ""
-    #     body_field: "casebody.data.opinions.text"
-    # - name: Medical
-    #   value: medical
-    #   host: localhost
-    #   port: 9200
-    #   username: None
-    #   password: None
-    #   type: elasticsearch
-    #   fields:
-    #     body_field: context
-    # - name: Supreme Court
-    #   value: supremecourt
-    #   host: localhost
-    #   port: 9200
-    #   username: None
-    #   password: None
-    #   type: elasticsearch
-    #   fields:
-    #     body_field: casebody
+    - name: Abstracts
+      value: pubmed_abstracts
+      type: elasticsearch
+      connection:
+        host: vpc-neptune-es-opxf6xkhk6ra7sfhybnkvxydtu.us-east-2.es.amazonaws.com
+        port: 443
+        body_field: "abstract"
+    - name: Orpheus
+      value: orpheus
+      type: elasticsearch
+      connection:
+        host: vpc-neptune-es-opxf6xkhk6ra7sfhybnkvxydtu.us-east-2.es.amazonaws.com
+        port: 443
+        body_field: "text"
   readtopn: 0
 
 relsnip:
@@ -106,13 +92,10 @@ server: # webserver host and port defaults
 
 reader:
   title: Reader
-  selected: twmkn9/distilbert-base-uncased-squad2
+  selected: ktrapeznikov/biobert_v1.1_pubmed_squad_v2
   options:
-    - name: DistilBERT SQUAD2
-      value: twmkn9/distilbert-base-uncased-squad2
-      type: distilbert
-    - name: BERT SQUAD2
-      value: deepset/bert-base-cased-squad2
+    - name: BioBERT Pubmed SQUAD2
+      value: ktrapeznikov/biobert_v1.1_pubmed_squad_v2
       type: bert
     # - name: Medical BERT SQUAD2
     #   value: /Users/victordibia/Downloads/meddistilbert

diff --git a/neuralqa/retriever/__init__.py b/neuralqa/retriever/__init__.py
@@ -1,4 +1,5 @@
 from .retriever import *
 from .elasticsearchretriever import *
+from .awselasticsearchretriever import *
 from .solrretriever import *
 from .retrieverpool import *
diff --git a/neuralqa/retriever/awselasticsearchretriever.py b/neuralqa/retriever/awselasticsearchretriever.py
@@ -0,0 +1,162 @@
+import boto3
+from elasticsearch.exceptions import AuthorizationException
+from requests_aws4auth import AWS4Auth
+import copy
+from neuralqa.retriever import Retriever, ElasticSearchRetriever
+from neuralqa.utils import parse_field_content
+from elasticsearch import Elasticsearch, ConnectionError, NotFoundError, RequestsHttpConnection
+import logging
+
+import traceback
+
+from neuralqa.utils.decorators import retry_on_exception
+
+logger = logging.getLogger(__name__)
+region = 'us-east-2'
+service = 'es'
+
+
+class AWSElasticSearchRetriever(ElasticSearchRetriever):
+    """Elasticsearch retriever whose requests are signed with AWS SigV4.
+
+    Requests are authenticated with ``AWS4Auth`` built from the default
+    ``boto3`` session credentials and the module-level ``region``/``service``.
+    """
+
+    def __init__(self, host, index_type="elasticsearch", port=443, **kwargs):
+        """Configure the retriever and open the Elasticsearch connection.
+
+        Args:
+            host: Hostname of the Elasticsearch endpoint.
+            index_type: Forwarded to ``Retriever.__init__``.
+            port: Port of the endpoint.
+            **kwargs: Overrides for attributes that already exist on the
+                instance (for example ``body_field`` or ``search_fields``).
+
+        Raises:
+            ValueError: If ``kwargs`` names something that is not an existing
+                instance attribute.
+        """
+        Retriever.__init__(self, index_type)
+
+        self.body_field = ""
+        self.search_fields = []
+        self.return_fields = []
+        self.remove_body_field = True
+        self.host = host
+        self.port = port
+
+        # Validate before touching the network so a configuration typo fails
+        # fast instead of after a credential lookup and a ping round trip.
+        allowed_keys = set(self.__dict__.keys())
+        rejected_keys = set(kwargs.keys()) - allowed_keys
+        if rejected_keys:
+            raise ValueError(
+                "Invalid arguments in AWSElasticSearchRetriever constructor:{}".format(rejected_keys))
+
+        self.__dict__.update(kwargs)
+        self.construct_es_instance()
+
+    def construct_es_instance(self):
+        """Create ``self.es`` from current AWS credentials and ping it.
+
+        The ping result is stored in ``self.isAvailable``.
+
+        Raises:
+            RuntimeError: If the boto3 session cannot find any credentials.
+        """
+        credentials = boto3.Session().get_credentials()
+        if credentials is None:
+            raise RuntimeError(
+                "No AWS credentials found; cannot sign requests to {}".format(self.host))
+        # Snapshot key, secret and token together so a credential refresh
+        # between attribute reads cannot yield a mismatched set.
+        frozen = credentials.get_frozen_credentials()
+        awsauth = AWS4Auth(frozen.access_key, frozen.secret_key, region, service,
+                           session_token=frozen.token)
+        self.es = Elasticsearch(
+            hosts=[{"host": self.host, "port": self.port}],
+            http_auth=awsauth,
+            use_ssl=True,
+            verify_certs=True,
+            connection_class=RequestsHttpConnection,
+        )
+        self.isAvailable = self.es.ping()
+
+    @retry_on_exception(exception=AuthorizationException)
+    def run_query(self, index_name, search_query, max_documents=5, fragment_size=100, relsnip=True, num_fragments=5,
+                  highlight_tags=True):
+        """Run a ``multi_match`` search and package the hits.
+
+        Args:
+            index_name: Index to search.
+            search_query: Query text.
+            max_documents: Number of hits requested (``size``).
+            fragment_size: Highlight ``fragment_size``.
+            relsnip: When true, request highlights on ``body_field``.
+            num_fragments: Highlight ``number_of_fragments``.
+            highlight_tags: When false, the highlight pre/post tags are set
+                to empty strings; when true no tags are specified.
+
+        Returns:
+            dict: Always has ``status``. On success also ``took``,
+            ``highlights``, ``docs`` and ``source``; on failure ``errormsg``.
+
+        Raises:
+            AuthorizationException: Re-raised after the client is rebuilt so
+                that ``retry_on_exception`` can repeat the call.
+        """
+        tags = {} if highlight_tags else {"pre_tags": [""], "post_tags": [""]}
+
+        # Built under its own name so the ``search_query`` text is not shadowed.
+        request_body = {
+            "_source": self.return_fields,
+            "query": {
+                "multi_match": {
+                    "query": search_query,
+                    "fields": self.search_fields
+                }
+            },
+            "size": max_documents
+        }
+        if relsnip:
+            request_body["highlight"] = {
+                "fragment_size": fragment_size,
+                "fields": {
+                    self.body_field: tags
+                },
+                "number_of_fragments": num_fragments
+            }
+
+        status = True
+        results = {}
+
+        try:
+            query_result = self.es.search(index=index_name, body=request_body)
+            hits = query_result["hits"]["hits"]
+
+            # RelSnip: for each document, concatenate all of its highlight
+            # fragments and return the result as the document.
+            highlights = [" ".join(hit["highlight"][self.body_field])
+                          for hit in hits if "highlight" in hit]
+            docs = [parse_field_content(self.body_field, hit["_source"])
+                    for hit in hits if "_source" in hit]
+            # Work on a deep copy so ``query_result`` itself is left untouched.
+            source = copy.deepcopy(query_result)
+            if self.remove_body_field:
+                for hit in source["hits"]["hits"]:
+                    if "_source" in hit:
+                        # pop() instead of del: a hit without a top-level key
+                        # equal to body_field must not fail the whole query.
+                        hit["_source"].pop(self.body_field, None)
+            results = {"took": query_result["took"], "highlights": highlights, "docs": docs, "source": source}
+        except AuthorizationException:
+            # Rebuild the signed client, then re-raise so the decorator retries.
+            self.construct_es_instance()
+            raise
+        except Exception as e:
+            status = False
+            results["errormsg"] = str(e)
+
+        results["status"] = status
+        return results
diff --git a/neuralqa/retriever/elasticsearchretriever.py b/neuralqa/retriever/elasticsearchretriever.py
@@ -25,7 +25,7 @@ def __init__(self, index_type="elasticsearch", host="localhost", port=9200, user
         #     [{'host': self.host, 'port': self.port,
         #       "username": self.username, "password": self.password}])
         self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}],
-                                http_auth=(self.username, self.password))
+                                http_auth=(self.username, self.password), scheme='https')
         self.isAvailable = self.es.ping()
 
         rejected_keys = set(kwargs.keys()) - set(allowed_keys)

diff --git a/neuralqa/retriever/retrieverpool.py b/neuralqa/retriever/retrieverpool.py
@@ -1,5 +1,5 @@
 
-from neuralqa.retriever import ElasticSearchRetriever
+from neuralqa.retriever import AWSElasticSearchRetriever
 import logging
 
 logger = logging.getLogger(__name__)
@@ -15,7 +15,7 @@ def __init__(self, retrievers):
                     "Duplicate retriever value : {} ".format(retriever["value"]))
 
             if (retriever["type"] == "elasticsearch"):
-                self.retriever_pool[retriever["value"]] = ElasticSearchRetriever(
+                self.retriever_pool[retriever["value"]] = AWSElasticSearchRetriever(
                     **retriever["connection"])
             if (retriever["type"] == "solr"):
                 logger.info("We do not yet support Solr retrievers")

diff --git a/neuralqa/server/routehandlers.py b/neuralqa/server/routehandlers.py
@@ -51,8 +51,7 @@ async def get_answers(params: Answer):
 
             else:
                 # add query expansion terms to query if any
-                retriever_query = params.query + \
-                    " ".join(params.expansionterms)
+                retriever_query = params.query + " ".join(params.expansionterms)
                 num_fragments = 5
                 query_results = self.retriever_pool.retriever.run_query(params.retriever, retriever_query,
                                                                         max_documents=params.max_documents, fragment_size=params.fragment_size,

diff --git a/neuralqa/server/routemodels.py b/neuralqa/server/routemodels.py
@@ -23,7 +23,7 @@ class Answer(BaseModel):
     reader: str = None
     relsnip: bool = True
     expander: Optional[str] = None
-    expansionterms: Optional[list] = None
+    expansionterms: Optional[list] = []
     retriever: Optional[str] = "manual"
 
 

diff --git a/neuralqa/server/serve.py b/neuralqa/server/serve.py
@@ -2,7 +2,7 @@
 
 from neuralqa.reader import BERTReader, ReaderPool
 from neuralqa.server.routehandlers import Handler
-from neuralqa.retriever import ElasticSearchRetriever, RetrieverPool
+from neuralqa.retriever import AWSElasticSearchRetriever, RetrieverPool
 from neuralqa.utils import ConfigParser
 from neuralqa.expander import ExpanderPool
 

diff --git a/neuralqa/utils/decorators.py b/neuralqa/utils/decorators.py
@@ -0,0 +1,30 @@
+import functools
+
+
+def retry_on_exception(exception, retries=1):
+    """Build a decorator that re-invokes a function when it raises ``exception``.
+
+    Args:
+        exception: Exception class (or tuple of classes) that triggers a retry.
+        retries: Number of additional attempts after the first failure.
+            Defaults to 1, i.e. the call is tried at most twice.
+
+    Returns:
+        A decorator. The wrapped function returns the result of the first
+        successful attempt; an exception raised by the final attempt, or any
+        exception not matching ``exception``, propagates to the caller.
+    """
+    def actual_decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Guarded attempts: swallow the matching exception and try again.
+            for _ in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except exception:
+                    continue
+            # Final attempt is unguarded so its failure reaches the caller.
+            return func(*args, **kwargs)
+        return wrapper
+
+    return actual_decorator
diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,8 @@ transformers==3.0.2
 uvicorn
 aiofiles
 fastapi
-elasticsearch==7.7.1
-pyyaml==3.13 
-spacy
+elasticsearch<=7.13.4
+pyyaml>=3.13 
+spacy
+requests-aws4auth
+boto3
diff --git a/setup.py b/setup.py
@@ -36,6 +36,7 @@ def package_files(directory):
         'aiofiles',
         'uvicorn',
         'numpy',
+	'plac==0.9.6', 
         'tensorflow>=2.1.0',
         'torch',
         'torchvision',

diff --git a/tests/retriever/test_retriever.py b/tests/retriever/test_retriever.py
@@ -1,11 +1,11 @@
-from neuralqa.retriever import ElasticSearchRetriever
+from neuralqa.retriever import AWSElasticSearchRetriever
 from neuralqa.utils import ConfigParser
 
 
 def test_elasticserch_retriever():
     app_config = ConfigParser("config.yaml")
     rkwargs = app_config.config["retriever"]["options"][1]["connection"]
-    retriever = ElasticSearchRetriever(**rkwargs)
+    retriever = AWSElasticSearchRetriever(**rkwargs)
     results = retriever.run_query(
         "cases", "what is the punishment for arson crime")
     assert results != None