
Commit b4a7e77

Merge pull request #638 from splitgraph/add-big-query-data-source-cu-26udw0h

Big Query data source

2 parents: c372912 + 0a6f297

File tree: 8 files changed, +250 −0 lines

engine/Dockerfile (+4)

```diff
@@ -195,6 +195,10 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install "PyAthena>=2.4.1" && \
     pip install "pandas>=1.0.0"
 
+# Install Google's Big Query SQLAlchemy dialect lib
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    pip install "sqlalchemy-bigquery"
+
 ENV PATH "${PATH}:/splitgraph/bin"
 ENV PYTHONPATH "${PYTHONPATH}:/splitgraph:/pg_es_fdw"
```

engine/Dockerfile.debug (+4)

```diff
@@ -118,6 +118,10 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install "PyAthena>=2.4.1" && \
     pip install "pandas>=1.0.0"
 
+# Install Google's Big Query SQLAlchemy dialect lib
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    pip install "sqlalchemy-bigquery"
+
 ENV PATH "${PATH}:/splitgraph/bin"
 ENV PYTHONPATH "${PYTHONPATH}:/splitgraph:/pg_es_fdw"
```
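With the dialect installed, SQLAlchemy can resolve `bigquery://` URLs inside the engine image. A minimal smoke test with a placeholder project/dataset (`create_engine` is lazy, so no credentials or network access are needed):

```python
import sqlalchemy

# If sqlalchemy-bigquery is installed, SQLAlchemy can build an engine
# for a bigquery:// URL without connecting to anything yet.
engine = sqlalchemy.create_engine("bigquery://my-project/my_dataset")
print(engine.dialect.name)  # -> "bigquery"
```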

splitgraph/config/keys.py (+1)

```diff
@@ -67,6 +67,7 @@
         "snowflake": "splitgraph.ingestion.snowflake.SnowflakeDataSource",
         "dbt": "splitgraph.ingestion.dbt.data_source.DBTDataSource",
         "athena": "splitgraph.ingestion.athena.AmazonAthenaDataSource",
+        "bigquery": "splitgraph.ingestion.bigquery.BigQueryDataSource",
     },
 }
```
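The registry maps a mount handler name to a dotted class path that is imported on demand. A hedged sketch of the dynamic import such an entry implies (`resolve_data_source` is an illustrative helper, not Splitgraph's actual lookup API):

```python
import importlib

def resolve_data_source(dotted_path: str):
    # Split "pkg.module.ClassName" into module and attribute, then import.
    module_name, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_name), class_name)

bigquery_cls = resolve_data_source("splitgraph.ingestion.bigquery.BigQueryDataSource")
```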

splitgraph/ingestion/bigquery/BUILD (new file, +6)

```python
python_sources(
    skip_black=True,
    dependencies=[
        "src/py/splitgraph/splitgraph/resources/icons",
    ],
)
```
splitgraph/ingestion/bigquery/__init__.py (new file, +140; path implied by the `splitgraph.ingestion.bigquery.BigQueryDataSource` registry entry above)

```python
import base64
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Dict, Optional

from splitgraph.core.types import Credentials, Params, TableInfo
from splitgraph.hooks.data_source.fdw import ForeignDataWrapperDataSource
from splitgraph.ingestion.common import build_commandline_help

if TYPE_CHECKING:
    from splitgraph.engine.postgres.engine import PostgresEngine


class BigQueryDataSource(ForeignDataWrapperDataSource):
    credentials_schema: Dict[str, Any] = {
        "type": "object",
        "properties": {
            "credentials": {
                "type": "string",
                "title": "GCP credentials",
                "description": "GCP credentials in JSON format",
            },
        },
    }

    params_schema = {
        "type": "object",
        "properties": {
            "project": {
                "type": "string",
                "title": "GCP project name",
                "description": "Name of the GCP project to use",
            },
            "dataset_name": {
                "type": "string",
                "title": "Big Query dataset",
                "description": "Name of the dataset in Big Query",
            },
        },
        "required": ["project", "dataset_name"],
    }
```
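Both schemas are plain JSON Schema, so they can be exercised standalone. A quick sketch with the `jsonschema` package (illustrative; not necessarily how Splitgraph validates them internally):

```python
import jsonschema

# Raises jsonschema.ValidationError if "project" or "dataset_name" is missing.
jsonschema.validate(
    instance={"project": "my-project", "dataset_name": "my_dataset"},
    schema=BigQueryDataSource.params_schema,
)
```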
````python
    supports_mount = True
    supports_load = True
    supports_sync = False

    commandline_help = """Mount a GCP Big Query project/dataset.

This will mount a Big Query dataset:

\b
```
$ sgr mount bigquery bq -o@- <<EOF
{
    "credentials": "/path/to/my/creds.json",
    "project": "my-project-name",
    "dataset_name": "my_dataset"
}
EOF
```
"""

    commandline_kwargs_help: str = (
        build_commandline_help(credentials_schema) + "\n" + build_commandline_help(params_schema)
    )

    _icon_file = "bigquery.svg"

    def __init__(
        self,
        engine: "PostgresEngine",
        credentials: Credentials,
        params: Params,
        tables: Optional[TableInfo] = None,
    ):
        super().__init__(engine, credentials, params, tables)

    def get_fdw_name(self):
        return "multicorn"

    @classmethod
    def get_name(cls) -> str:
        return "Google BigQuery"

    @classmethod
    def get_description(cls) -> str:
        return "Query data in GCP BigQuery datasets"

    @classmethod
    def from_commandline(cls, engine, commandline_kwargs) -> "BigQueryDataSource":
        params = deepcopy(commandline_kwargs)
        credentials = Credentials({})

        if "credentials" in params:
            with open(params["credentials"], "r") as credentials_file:
                credentials_str = credentials_file.read()

            params.pop("credentials")
            credentials["credentials"] = credentials_str

        return cls(engine, credentials, params)

    def get_table_options(
        self, table_name: str, tables: Optional[TableInfo] = None
    ) -> Dict[str, str]:
        result = super().get_table_options(table_name, tables)
        result["tablename"] = result.get("tablename", table_name)
        return result

    def get_server_options(self):
        options: Dict[str, Optional[str]] = {
            "wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
            "db_url": self._build_db_url(),
        }

        # In SQLAlchemy, if "schema" isn't passed to the FDW options
        # (even when it's present in the DB URL), tables aren't
        # schema-qualified and server-side cursors don't work for scanning
        # (the whole table is loaded instead of being scrolled through).
        if "schema" in self.params:
            options["schema"] = self.params["schema"]

        return options

    def _build_db_url(self) -> str:
        """Construct the SQLAlchemy GCP Big Query db_url"""

        db_url = f"bigquery://{self.params['project']}/{self.params['dataset_name']}"

        if "credentials" in self.credentials:
            # base64-encode the credentials so they can be passed in the URL
            credentials_str = self.credentials["credentials"]
            credentials_base64 = base64.urlsafe_b64encode(credentials_str.encode()).decode()
            db_url += f"?credentials_base64={credentials_base64}"

        return db_url
````
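For a service-account key, the resulting URL has the form `bigquery://<project>/<dataset>?credentials_base64=<encoded key>`. A standalone sketch of the same encoding, with placeholder values:

```python
import base64

# Placeholder service-account JSON; a real key has more fields.
creds_json = '{"type": "service_account", "project_id": "my-project"}'
encoded = base64.urlsafe_b64encode(creds_json.encode()).decode()
db_url = f"bigquery://my-project/my_dataset?credentials_base64={encoded}"
```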
Finally, the dataset name doubles as the remote schema for `IMPORT FOREIGN SCHEMA`:

```python
    def get_remote_schema_name(self) -> str:
        if "dataset_name" not in self.params:
            raise ValueError("Cannot IMPORT FOREIGN SCHEMA without a dataset_name!")
        return str(self.params["dataset_name"])
```
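End to end, the data source is driven through Splitgraph's `mount` hook (as the tests below do). A hedged usage sketch, assuming a running engine and the argument handling shown in `commandline_help` above, with placeholder paths and names:

```python
from splitgraph.hooks.mount_handlers import mount

# Mount the dataset's tables as foreign tables under the "bq" schema;
# "credentials" is a path to a service-account JSON key file.
mount(
    "bq",
    "bigquery",
    {
        "credentials": "/path/to/my/creds.json",
        "project": "my-project-name",
        "dataset_name": "my_dataset",
    },
)
```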
A further file (+6) did not load in the diff view; its name and contents are not shown.

test/resources/ingestion/bigquery/dummy_credentials.json (new file, +12; path as referenced by the tests below)

```json
{
    "type": "service_account",
    "project_id": "project_id",
    "private_key_id": "private_key_id",
    "private_key": "private_key",
    "client_email": "client_email",
    "client_id": "client_id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "client_x509_cert_url"
}
```
The new test file (+77; file name not shown in the diff view):

```python
import base64
from unittest.mock import Mock

import pytest
from psycopg2 import DatabaseError

from splitgraph.core.types import Credentials, Params
from splitgraph.hooks.mount_handlers import mount
from splitgraph.ingestion.bigquery import BigQueryDataSource


def test_bigquery_data_source_options_creds_file(local_engine_empty):
    source = BigQueryDataSource.from_commandline(
        local_engine_empty,
        {
            "credentials": "test/resources/ingestion/bigquery/dummy_credentials.json",
            "project": "bigquery-public-data",
            "dataset_name": "hacker_news",
        },
    )

    with open("test/resources/ingestion/bigquery/dummy_credentials.json", "r") as credentials_file:
        credentials_str = credentials_file.read()
    credentials_base64 = base64.urlsafe_b64encode(credentials_str.encode()).decode()

    assert source.get_server_options() == {
        "db_url": f"bigquery://bigquery-public-data/hacker_news?credentials_base64={credentials_base64}",
        "wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
    }


def test_bigquery_data_source_options_creds_raw():
    source = BigQueryDataSource(
        Mock(),
        credentials=Credentials({"credentials": "test-raw-creds"}),
        params=Params(
            {
                "project": "bigquery-public-data",
                "dataset_name": "hacker_news",
            }
        ),
    )

    credentials_base64 = base64.urlsafe_b64encode("test-raw-creds".encode()).decode()

    assert source.get_server_options() == {
        "db_url": f"bigquery://bigquery-public-data/hacker_news?credentials_base64={credentials_base64}",
        "wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
    }


def test_bigquery_data_source_options_no_creds_file():
    source = BigQueryDataSource(
        Mock(),
        credentials=Credentials({}),
        params=Params(
            {
                "project": "bigquery-public-data",
                "dataset_name": "hacker_news",
            }
        ),
    )

    assert source.get_server_options() == {
        "db_url": "bigquery://bigquery-public-data/hacker_news",
        "wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
    }


@pytest.mark.mounting
def test_bigquery_mount_expected_error():
    with pytest.raises(DatabaseError, match="Could not automatically determine credentials"):
        mount(
            "bq",
            "bigquery",
            {"project": "bigquery-public-data", "dataset_name": "hacker_news"},
        )
```
