Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wap 170 #94

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Elastic Beanstalk Files
.elasticbeanstalk/*
.git
.gitignore
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -135,3 +135,9 @@ dmypy.json

# Pyre type checker
.pyre/

# Elastic Beanstalk Files
.elasticbeanstalk/*
!.elasticbeanstalk/*.cfg.yml
!.elasticbeanstalk/*.global.yml
.idea/
57 changes: 41 additions & 16 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,20 +1,45 @@
# NOTE(review): this hunk carries no +/- markers, so lines removed and lines
# added by the change appear interleaved (two FROM lines, two CMD lines, and an
# apt-get/elasticsearch RUN chain next to the conda-based install). Confirm
# against the real file before relying on any single line.
FROM ubuntu:20.04
# Base image tag reads as PyTorch 1.5.1 inference, GPU, Python 3.6, CUDA 10.1,
# Ubuntu 16.04, pulled from an ECR registry in us-east-2.
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-inference:1.5.1-gpu-py36-cu101-ubuntu16.04

# Python dependencies, mixed between conda and pip, one layer per RUN.
RUN conda install -c anaconda tensorflow
RUN python -m pip install transformers==3.5.1
RUN conda install -c conda-forge uvicorn aiofiles fastapi elasticsearch==7.13.1
RUN conda install -c conda-forge flask spacy plac==0.9.6
RUN python -m pip install numpy==1.19.2 scipy==1.4.1 Keras-Preprocessing==1.1.1
RUN conda install -c conda-forge boto3 requests pandas scikit-learn
RUN python -m pip install gremlinpython requests_aws4auth
RUN python -m pip install uvicorn[standard] websockets
RUN python -m pip install thinc[tensorflow,torch] --pre
RUN conda install -c conda-forge cudatoolkit
# tensorflow is installed a second time here, pinned to 2.3.0 via pip.
RUN python -m pip install tensorflow==2.3.0
#RUN apt-get install -y nvidia-headless-495 nvidia-modprobe
# NVIDIA driver userspace packages. --allow-unauthenticated tells apt to
# proceed even when packages cannot be authenticated.
# NOTE(review): update and install sit in separate layers — confirm intended.
RUN apt-get update -y --allow-unauthenticated
RUN apt-get install -y --allow-unauthenticated nvidia-headless-495 nvidia-modprobe


# Copy the project tree into /root/neuralqa and install it from source.
# Dockerfile is added twice (here and again further down).
ADD Dockerfile /root/neuralqa/
ADD LICENSE /root/neuralqa/
ADD README.md /root/neuralqa/
#ADD config.yaml /root/neuralqa/
ADD docker-compose.yml /root/neuralqa/
ADD docs/ /root/neuralqa/docs
ADD neuralqa/ /root/neuralqa/neuralqa
ADD notes.md /root/neuralqa/
ADD Dockerfile /root/neuralqa/
ADD requirements.txt /root/neuralqa/
ADD setup.cfg /root/neuralqa/
ADD setup.py /root/neuralqa/
ADD tests/ /root/neuralqa/tests
WORKDIR /root/neuralqa
RUN ls && python setup.py install

# The default config is copied next to the source tree and
# NEURALQA_CONFIG_PATH is pointed at that copy.
COPY neuralqa/config_default.yaml /root/config_default.yaml
ENV NEURALQA_CONFIG_PATH=/root/config_default.yaml
# CUDA library search path plus GPU visibility settings.
# NOTE(review): NVIDIA_* are presumably read by the NVIDIA container runtime,
# and CUDA_VISIBLE_DEVICES 0,1 presumably assumes a two-GPU host — confirm.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat:/usr/local/cuda/lib:/usr/local/cuda/lib64
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV CUDA_VISIBLE_DEVICES 0,1

COPY . .

RUN apt-get update && \
apt-get -y upgrade && \
apt-get -y install python3 && \
apt-get -y install python3-pip && \
pip3 install neuralqa && \
apt-get -y install wget && \
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb && \
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb.sha512 && \
shasum -a 512 -c elasticsearch-7.8.0-amd64.deb.sha512 && \
dpkg -i elasticsearch-7.8.0-amd64.deb && \
service elasticsearch start && \
sleep 30 && \

EXPOSE 80

# Two CMD lines are present; when a Dockerfile contains several CMD
# instructions only the last one takes effect.
CMD ["neuralqa", "--host", "0.0.0.0", "--port", "80"]
CMD ["neuralqa", "ui", "--host", "0.0.0.0", "--port", "80"]
37 changes: 37 additions & 0 deletions Dockerfile.old
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# CPU-only image that installs NeuralQA from the local source tree and serves
# the UI on port 80.
# NOTE(review): the base image is untagged, so it resolves to :latest; pin a
# specific tag or digest for reproducible builds.
FROM continuumio/miniconda3

# -y: conda must never wait for interactive confirmation during a build.
RUN conda install -y -c anaconda python=3.6
RUN conda install -y pip
RUN conda install -y pytorch==1.5.1 torchvision==0.6.1 cpuonly -c pytorch && \
    conda install -y -c anaconda tensorflow && \
    python -m pip install --no-cache-dir transformers==3.5.1 && \
    conda install -y -c conda-forge uvicorn aiofiles fastapi elasticsearch==7.13.1 pyyaml spacy && \
    python -m pip install --no-cache-dir numpy==1.18.5 scipy==1.4.1 Keras-Preprocessing==1.1.1
RUN conda install -y -c conda-forge boto3 pandas requests scikit-learn scipy flask && \
    python -m pip install --no-cache-dir gremlinpython requests_aws4auth

# Quoted so the shell cannot treat the [standard] extra as a glob pattern.
RUN python -m pip install --no-cache-dir "uvicorn[standard]" websockets
# RUN python -m pip install websockets

# COPY instead of ADD: these are plain local files and directories, so ADD's
# archive-extraction and URL behaviours are not wanted. Dockerfile was
# previously added twice; it is copied once here.
COPY Dockerfile LICENSE README.md docker-compose.yml notes.md requirements.txt setup.cfg setup.py /root/neuralqa/
#COPY config.yaml /root/neuralqa/
COPY docs/ /root/neuralqa/docs
COPY neuralqa/ /root/neuralqa/neuralqa
COPY tests/ /root/neuralqa/tests
WORKDIR /root/neuralqa
RUN ls && python setup.py install

# The default config is copied outside the source tree and
# NEURALQA_CONFIG_PATH is pointed at that copy.
COPY neuralqa/config_default.yaml /root/config_default.yaml
ENV NEURALQA_CONFIG_PATH=/root/config_default.yaml

EXPOSE 80

CMD ["neuralqa", "ui", "--host", "0.0.0.0", "--port", "80"]
6 changes: 4 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
version: "3"
services:
neuralqa_docker:
neuralqa:
build: .
ports:
- "80:80"
expose:
- 80
- 80
51 changes: 17 additions & 34 deletions neuralqa/config_default.yaml
Original file line number Diff line number Diff line change
@@ -61,34 +61,20 @@ retriever:
- name: None
value: "none"
type: "none"

# - name: Case Law
# value: cases
# type: elasticsearch
# connection:
# host: localhost
# port: 9200
# username: ""
# password: ""
# body_field: "casebody.data.opinions.text"
# - name: Medical
# value: medical
# host: localhost
# port: 9200
# username: None
# password: None
# type: elasticsearch
# fields:
# body_field: context
# - name: Supreme Court
# value: supremecourt
# host: localhost
# port: 9200
# username: None
# password: None
# type: elasticsearch
# fields:
# body_field: casebody
- name: Abstracts
value: pubmed_abstracts
type: elasticsearch
connection:
host: vpc-neptune-es-opxf6xkhk6ra7sfhybnkvxydtu.us-east-2.es.amazonaws.com
port: 443
body_field: "abstract"
- name: Orpheus
value: orpheus
type: elasticsearch
connection:
host: vpc-neptune-es-opxf6xkhk6ra7sfhybnkvxydtu.us-east-2.es.amazonaws.com
port: 443
body_field: "text"
readtopn: 0

relsnip:
@@ -106,13 +92,10 @@ server: # webserver host and port defaults

reader:
title: Reader
selected: twmkn9/distilbert-base-uncased-squad2
selected: ktrapeznikov/biobert_v1.1_pubmed_squad_v2
options:
- name: DistilBERT SQUAD2
value: twmkn9/distilbert-base-uncased-squad2
type: distilbert
- name: BERT SQUAD2
value: deepset/bert-base-cased-squad2
- name: BioBERT Pubmed SQUAD2
value: ktrapeznikov/biobert_v1.1_pubmed_squad_v2
type: bert
# - name: Medical BERT SQUAD2
# value: /Users/victordibia/Downloads/meddistilbert
1 change: 1 addition & 0 deletions neuralqa/retriever/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .retriever import *
from .elasticsearchretriever import *
from .awselasticsearchretriever import *
from .solrretriever import *
from .retrieverpool import *
110 changes: 110 additions & 0 deletions neuralqa/retriever/awselasticsearchretriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import boto3
from elasticsearch.exceptions import AuthorizationException
from requests_aws4auth import AWS4Auth
import copy
from neuralqa.retriever import Retriever, ElasticSearchRetriever
from neuralqa.utils import parse_field_content
from elasticsearch import Elasticsearch, ConnectionError, NotFoundError, RequestsHttpConnection
import logging

import traceback

from neuralqa.utils.decorators import retry_on_exception

logger = logging.getLogger(__name__)
region = 'us-east-2'
service = 'es'


class AWSElasticSearchRetriever(ElasticSearchRetriever):
    """Elasticsearch retriever whose HTTP requests are signed with AWS credentials.

    Credentials are read from the default boto3 session and wrapped in an
    ``AWS4Auth`` object built for the module-level ``region`` and ``service``.
    """

    def __init__(self, host, index_type="elasticsearch", port=443, **kwargs):
        """Configure the retriever and open the signed Elasticsearch client.

        Args:
            host: Elasticsearch endpoint host name.
            index_type: Value forwarded to ``Retriever.__init__``.
            port: Endpoint port (defaults to 443).
            **kwargs: Overrides for attributes that already exist on the
                instance at this point (``body_field``, ``search_fields``,
                ``return_fields``, ``remove_body_field``, ``host``, ``port``
                and anything ``Retriever.__init__`` set).

        Raises:
            ValueError: If ``kwargs`` contains a key that is not an existing
                attribute.
        """
        # Only the base Retriever initialiser is invoked; the client is built
        # by construct_es_instance() below instead of by the parent class.
        Retriever.__init__(self, index_type)

        self.body_field = ""
        self.search_fields = []
        self.return_fields = []
        self.remove_body_field = True
        self.host = host
        self.port = port

        allowed_keys = list(self.__dict__.keys())

        # Validate before touching credentials or the network so that a bad
        # configuration fails fast with the real cause.
        rejected_keys = set(kwargs.keys()) - set(allowed_keys)
        if rejected_keys:
            raise ValueError(
                "Invalid arguments in AWSElasticSearchRetriever constructor:{}".format(rejected_keys))

        self.__dict__.update((k, v) for k, v in kwargs.items() if k in allowed_keys)
        # assert self.body_field in self.return_fields
        # assert any(self.body_field in f for f in self.search_fields)
        self.construct_es_instance()

    def construct_es_instance(self):
        """(Re)build ``self.es`` from current boto3 credentials and ping it.

        Sets ``self.es`` to a new client and ``self.isAvailable`` to the
        result of ``ping()``.
        """
        # NOTE(review): get_credentials() can presumably return None when no
        # AWS credentials are configured, which would surface here as an
        # AttributeError — confirm and handle if that case matters.
        credentials = boto3.Session().get_credentials()
        awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service,
                           session_token=credentials.token)
        self.es = Elasticsearch(
            hosts=[{"host": self.host, "port": self.port}],
            http_auth=awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
        )
        self.isAvailable = self.es.ping()

    @retry_on_exception(exception=AuthorizationException)
    def run_query(self, index_name, search_query, max_documents=5, fragment_size=100, relsnip=True, num_fragments=5,
                  highlight_tags=True):
        """Run a ``multi_match`` query and return documents plus highlights.

        Args:
            index_name: Index to search.
            search_query: Query text matched against ``self.search_fields``.
            max_documents: Value sent as the query ``size``.
            fragment_size: Highlight fragment size.
            relsnip: When true, a ``highlight`` section is added to the query.
            num_fragments: Number of highlight fragments requested.
            highlight_tags: When false, empty pre/post tags are requested so
                highlights carry no markup.

        Returns:
            dict: Always contains ``status``. On success also ``took``,
            ``highlights``, ``docs`` and ``source``; on failure ``errormsg``.

        Raises:
            AuthorizationException: Re-raised after the client has been
                rebuilt, so the ``retry_on_exception`` decorator can call the
                method again.
        """
        tags = {"pre_tags": [""], "post_tags": [""]} if not highlight_tags else {}
        highlight_params = {
            "fragment_size": fragment_size,
            "fields": {
                self.body_field: tags
            },
            "number_of_fragments": num_fragments
        }

        search_query = {
            "_source": self.return_fields,
            "query": {
                "multi_match": {
                    "query": search_query,
                    "fields": self.search_fields
                }
            },
            "size": max_documents
        }

        status = True
        results = {}

        if relsnip:
            # search_query["_source"] = {"includes": [""]}
            search_query["highlight"] = highlight_params
        # else:
        #     search_query["_source"] = {"includes": [self.body_field]}

        try:
            query_result = self.es.search(index=index_name, body=search_query)
            hits = query_result["hits"]["hits"]

            # RelSnip: for each document, we concatenate all
            # fragments in each document and return as the document.
            highlights = [" ".join(hit["highlight"][self.body_field])
                          for hit in hits if "highlight" in hit]
            docs = [parse_field_content(self.body_field, hit["_source"])
                    for hit in hits if "_source" in hit]

            # Work on a copy so that stripping the body field does not touch
            # the raw response the docs were parsed from.
            source = copy.deepcopy(query_result)
            if self.remove_body_field:
                for hit in source["hits"]["hits"]:
                    if "_source" in hit:
                        # pop() rather than del: a hit whose _source lacks the
                        # body field must not fail the whole query.
                        hit["_source"].pop(self.body_field, None)
            took = query_result["took"]
            results = {"took": took, "highlights": highlights, "docs": docs, "source": source}
        except AuthorizationException:
            # Rebuild the client with freshly fetched credentials, then let
            # the decorator retry the call.
            self.construct_es_instance()
            raise
        except Exception as e:
            # Any other failure is reported in-band rather than raised.
            status = False
            results["errormsg"] = str(e)

        results["status"] = status
        return results
2 changes: 1 addition & 1 deletion neuralqa/retriever/elasticsearchretriever.py
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ def __init__(self, index_type="elasticsearch", host="localhost", port=9200, user
# [{'host': self.host, 'port': self.port,
# "username": self.username, "password": self.password}])
self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}],
http_auth=(self.username, self.password))
http_auth=(self.username, self.password), scheme='https')
self.isAvailable = self.es.ping()

rejected_keys = set(kwargs.keys()) - set(allowed_keys)
4 changes: 2 additions & 2 deletions neuralqa/retriever/retrieverpool.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

from neuralqa.retriever import ElasticSearchRetriever
from neuralqa.retriever import AWSElasticSearchRetriever
import logging

logger = logging.getLogger(__name__)
@@ -15,7 +15,7 @@ def __init__(self, retrievers):
"Duplicate retriever value : {} ".format(retriever["value"]))

if (retriever["type"] == "elasticsearch"):
self.retriever_pool[retriever["value"]] = ElasticSearchRetriever(
self.retriever_pool[retriever["value"]] = AWSElasticSearchRetriever(
**retriever["connection"])
if (retriever["type"] == "solr"):
logger.info("We do not yet support Solr retrievers")
3 changes: 1 addition & 2 deletions neuralqa/server/routehandlers.py
Original file line number Diff line number Diff line change
@@ -51,8 +51,7 @@ async def get_answers(params: Answer):

else:
# add query expansion terms to query if any
retriever_query = params.query + \
" ".join(params.expansionterms)
retriever_query = params.query + " ".join(params.expansionterms)
num_fragments = 5
query_results = self.retriever_pool.retriever.run_query(params.retriever, retriever_query,
max_documents=params.max_documents, fragment_size=params.fragment_size,
2 changes: 1 addition & 1 deletion neuralqa/server/routemodels.py
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ class Answer(BaseModel):
reader: str = None
relsnip: bool = True
expander: Optional[str] = None
expansionterms: Optional[list] = None
expansionterms: Optional[list] = []
retriever: Optional[str] = "manual"


2 changes: 1 addition & 1 deletion neuralqa/server/serve.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@

from neuralqa.reader import BERTReader, ReaderPool
from neuralqa.server.routehandlers import Handler
from neuralqa.retriever import ElasticSearchRetriever, RetrieverPool
from neuralqa.retriever import AWSElasticSearchRetriever, RetrieverPool
from neuralqa.utils import ConfigParser
from neuralqa.expander import ExpanderPool

14 changes: 14 additions & 0 deletions neuralqa/utils/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import functools


def retry_on_exception(exception, retries=1):
    """Build a decorator that re-invokes a function when it raises ``exception``.

    Args:
        exception: Exception class (or tuple of classes) that triggers a retry.
        retries: How many additional attempts to make after the first failure.
            Defaults to 1, i.e. the function is called at most twice.

    Returns:
        A decorator. The wrapped function returns the first successful result;
        if every attempt fails, the exception from the last attempt propagates.
        Exceptions that do not match ``exception`` propagate immediately.
    """
    def actual_decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for _ in range(retries):
                try:
                    return func(*args, **kwargs)
                except exception:
                    # Swallow and move on to the next attempt.
                    continue
            # Final attempt: whatever it raises is surfaced to the caller.
            return func(*args, **kwargs)
        return wrapper

    return actual_decorator
8 changes: 5 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -4,6 +4,8 @@ transformers==3.0.2
uvicorn
aiofiles
fastapi
elasticsearch==7.7.1
pyyaml==3.13
spacy
elasticsearch<=7.13.4
pyyaml>=3.13
spacy
requests-aws4auth
boto3
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -36,6 +36,7 @@ def package_files(directory):
'aiofiles',
'uvicorn',
'numpy',
'plac==0.9.6',
'tensorflow>=2.1.0',
'torch',
'torchvision',
4 changes: 2 additions & 2 deletions tests/retriever/test_retriever.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from neuralqa.retriever import ElasticSearchRetriever
from neuralqa.retriever import AWSElasticSearchRetriever
from neuralqa.utils import ConfigParser


def test_elasticserch_retriever():
app_config = ConfigParser("config.yaml")
rkwargs = app_config.config["retriever"]["options"][1]["connection"]
retriever = ElasticSearchRetriever(**rkwargs)
retriever = AWSElasticSearchRetriever(**rkwargs)
results = retriever.run_query(
"cases", "what is the punishment for arson crime")
assert results != None