Skip to content

Commit b2caee8

Browse files
committed
Add a fork of postgres-elasticsearch-fdw to the engine and make it available through sgr mount.
Basic usage: ``` sgr mount elasticsearch -c elasticsearch:9200 -o@- <<EOF { "table_spec": { "table_1": { "schema": { "id": "text", "@timestamp": "timestamp", "query": "text", "col_1": "text", "col_2": "boolean" }, "index": "index-pattern*", "rowid_column": "id", "query_column": "query" } } } EOF ```
1 parent 76d081a commit b2caee8

File tree

6 files changed

+114
-10
lines changed

6 files changed

+114
-10
lines changed

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
[submodule "engine/src/cstore_fdw"]
55
path = engine/src/cstore_fdw
66
url = https://github.com/splitgraph/cstore_fdw.git
7+
[submodule "engine/src/postgres-elasticsearch-fdw"]
8+
path = engine/src/postgres-elasticsearch-fdw
9+
url = https://github.com/splitgraph/postgres-elasticsearch-fdw.git

engine/Dockerfile

+8-1
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,19 @@ ENV POSTGRES_USER sgr
175175
COPY ./engine/etc /etc/
176176
COPY ./engine/init_scripts /docker-entrypoint-initdb.d/
177177

178-
ENV PYTHONPATH "${PYTHONPATH}:/splitgraph"
179178

180179
# Copy the actual Splitgraph code over at this point.
181180
COPY ./splitgraph /splitgraph/splitgraph
182181
COPY ./bin /splitgraph/bin
182+
183+
# "Install" elasticsearch_fdw
184+
RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
185+
mkdir /pg_es_fdw && \
186+
pip install elasticsearch>=7.7.0
187+
COPY ./engine/src/postgres-elasticsearch-fdw/pg_es_fdw /pg_es_fdw/
188+
183189
ENV PATH "${PATH}:/splitgraph/bin"
190+
ENV PYTHONPATH "${PYTHONPATH}:/splitgraph:/pg_es_fdw"
184191

185192
# https://github.com/postgis/docker-postgis/blob/master/12-3.0/Dockerfile
186193
ARG with_postgis

splitgraph/config/keys.py

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"mongo_fdw": "splitgraph.hooks.mount_handlers.mount_mongo",
6262
"mysql_fdw": "splitgraph.hooks.mount_handlers.mount_mysql",
6363
"socrata": "splitgraph.ingestion.socrata.mount.mount_socrata",
64+
"elasticsearch": "splitgraph.hooks.mount_handlers.mount_elasticsearch",
6465
},
6566
}
6667

splitgraph/core/output.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,17 @@ def parse_repo_tag_or_hash(value, default="latest"):
6666

6767
def conn_string_to_dict(connection: Optional[str]) -> Dict[str, Any]:
    """Parse a connection string of the form ``[username:password@]server:port``.

    The credentials prefix is optional; when it's absent (or when `connection`
    itself is falsy), the corresponding keys come back as None.

    :param connection: Connection string, e.g. "user:pw@host:5432" or "host:9200".
    :return: Dict with keys server, port (int), username, password.
    :raises ValueError: if the string doesn't look like server:port at minimum.
    """
    if not connection:
        return dict(server=None, port=None, username=None, password=None)

    match = re.match(r"((\S+):(\S+)@)?(.+):(\d+)", connection)
    if match is None:
        raise ValueError("Invalid connection string!")
    # group 1 is the whole "user:pw@" prefix; unpack the rest positionally.
    _, username, password, server, port = match.groups()
    # In the future, we could turn all of these options into actual Click options,
    # but then we'd also have to parse the docstring deeper to find out the types the function
    # requires, how to serialize them etc etc. Idea for a click-contrib addon perhaps?
    return dict(
        server=server,
        port=int(port),
        username=username,
        password=password,
    )

splitgraph/hooks/mount_handlers.py

+96-4
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def init_fdw(
4141
engine: "PostgresEngine",
4242
server_id: str,
4343
wrapper: str,
44-
server_options: Optional[Dict[str, Union[str, None]]] = None,
44+
server_options: Optional[Dict[str, Union[str, int, None]]] = None,
4545
user_options: Optional[Dict[str, str]] = None,
4646
overwrite: bool = True,
4747
) -> None:
@@ -67,7 +67,7 @@ def init_fdw(
6767
if server_options:
6868
server_keys, server_vals = zip(*server_options.items())
6969
create_server += _format_options(server_keys)
70-
engine.run_sql(create_server, server_vals)
70+
engine.run_sql(create_server, [str(v) for v in server_vals])
7171
else:
7272
engine.run_sql(create_server)
7373

@@ -77,7 +77,7 @@ def init_fdw(
7777
)
7878
user_keys, user_vals = zip(*user_options.items())
7979
create_mapping += _format_options(user_keys)
80-
engine.run_sql(create_mapping, user_vals)
80+
engine.run_sql(create_mapping, [str(v) for v in user_vals])
8181

8282

8383
def _format_options(option_names):
@@ -178,7 +178,7 @@ def _create_foreign_table(engine, local_schema, table_name, schema_spec, server_
178178
if server_options:
179179
server_keys, server_vals = zip(*server_options.items())
180180
query += _format_options(server_keys)
181-
engine.run_sql(query, server_vals)
181+
engine.run_sql(query, [str(v) for v in server_vals])
182182
else:
183183
engine.run_sql(query)
184184

@@ -296,6 +296,98 @@ def mount_mysql(
296296
)
297297

298298

299+
def mount_elasticsearch(
    mountpoint: str,
    server: str,
    port: int,
    username: str,
    password: str,
    table_spec: Dict[str, Dict[str, Any]],
):
    """
    Mount an ElasticSearch instance.

    Mount a set of tables proxying to a remote ElasticSearch index.

    This uses a fork of postgres-elasticsearch-fdw behind the scenes. You can add a column
    `query` to your table and set it as `query_column` to pass advanced ES queries and aggregations.
    For example:

    ```
    sgr mount elasticsearch -c elasticsearch:9200 -o@- <<EOF
    {
        "table_spec": {
            "table_1": {
                "schema": {
                    "id": "text",
                    "@timestamp": "timestamp",
                    "query": "text",
                    "col_1": "text",
                    "col_2": "boolean"
                },
                "index": "index-pattern*",
                "rowid_column": "id",
                "query_column": "query"
            }
        }
    }
    EOF
    ```
    \b

    :param mountpoint: Schema to mount the remote into.
    :param server: Database hostname.
    :param port: Database port.
    :param username: A read-only user that the database will be accessed as.
    :param password: Password for the read-only user.
    :param table_spec: A dictionary of form
        ```
        {"table_name":
            {"schema": {"col1": "type1"...},
             "index": <es index>,
             "type": <es doc_type, optional in ES7 and later>,
             "query_column": <column to pass ES query in>,
             "score_column": <column to return document score>,
             "scroll_size": <fetch size, default 1000>,
             "scroll_duration": <how long to hold the scroll context open for, default 10m>},
        ...}
        ```
    """
    # Imported locally: psycopg2/engine machinery isn't needed at module import time.
    from splitgraph.engine import get_engine
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    logging.info("Mounting ElasticSearch instance...")
    server_id = mountpoint + "_server"

    init_fdw(
        engine,
        server_id,
        "multicorn",
        {
            # Multicorn dispatches to the Python FDW class named here.
            "wrapper": "pg_es_fdw.ElasticsearchFDW",
            "host": server,
            "port": port,
            "username": username,
            "password": password,
        },
        None,
    )

    engine.run_sql(SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint)))

    for table_name, table_options in table_spec.items():
        logging.info("Mounting table %s", table_name)
        # Work on a copy so popping "schema" doesn't mutate the caller's table_spec.
        foreign_options = dict(table_options)
        schema = foreign_options.pop("schema")
        # All remaining per-table options (index, query_column, ...) are passed
        # through to the foreign table as FDW options.
        _create_foreign_table(
            engine,
            local_schema=mountpoint,
            table_name=table_name,
            schema_spec=schema,
            server_id=server_id,
            server_options=foreign_options,
        )
389+
390+
299391
def mount(
300392
mountpoint: str,
301393
mount_handler: str,

0 commit comments

Comments
 (0)