import base64
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Dict, Optional

from splitgraph.core.types import Credentials, Params, TableInfo
from splitgraph.hooks.data_source.fdw import ForeignDataWrapperDataSource
from splitgraph.ingestion.common import build_commandline_help

if TYPE_CHECKING:
    from splitgraph.engine.postgres.engine import PostgresEngine


class BigQueryDataSource(ForeignDataWrapperDataSource):
    credentials_schema: Dict[str, Any] = {
        "type": "object",
        "properties": {
            "credentials": {
                "type": "string",
                "title": "GCP credentials",
                "description": "GCP credentials in JSON format",
            },
        },
    }
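
    # NOTE: "credentials" holds the *contents* of a GCP service-account JSON
    # key file rather than a path to it; the command-line flow accepts a path
    # and reads the file into this field (see from_commandline below).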

    params_schema: Dict[str, Any] = {
        "type": "object",
        "properties": {
            "project": {
                "type": "string",
                "title": "GCP project name",
                "description": "Name of the GCP project to use",
            },
            "dataset_name": {
                "type": "string",
                "title": "BigQuery dataset",
                "description": "Name of the dataset in BigQuery",
            },
        },
        "required": ["project", "dataset_name"],
    }

    supports_mount = True
    supports_load = True
    supports_sync = False

    commandline_help = """Mount a GCP BigQuery project/dataset.

This will mount a BigQuery dataset:

\b
```
$ sgr mount bigquery bq -o@- <<EOF
{
    "credentials": "/path/to/my/creds.json",
    "project": "my-project-name",
    "dataset_name": "my_dataset"
}
EOF
```
    """

    commandline_kwargs_help: str = (
        build_commandline_help(credentials_schema) + "\n" + build_commandline_help(params_schema)
    )

    _icon_file = "bigquery.svg"

    def __init__(
        self,
        engine: "PostgresEngine",
        credentials: Credentials,
        params: Params,
        tables: Optional[TableInfo] = None,
    ):
        super().__init__(engine, credentials, params, tables)

    def get_fdw_name(self) -> str:
        return "multicorn"

    @classmethod
    def get_name(cls) -> str:
        return "Google BigQuery"

    @classmethod
    def get_description(cls) -> str:
        return "Query data in GCP BigQuery datasets"

    @classmethod
    def from_commandline(cls, engine, commandline_kwargs) -> "BigQueryDataSource":
        params = deepcopy(commandline_kwargs)
        credentials = Credentials({})

        # On the command line, "credentials" is a path to a key file: read its
        # contents and move them into the Credentials object.
        if "credentials" in params:
            with open(params["credentials"], "r") as credentials_file:
                credentials_str = credentials_file.read()

            params.pop("credentials")
            credentials["credentials"] = credentials_str

        return cls(engine, credentials, params)
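
    # A minimal sketch of the equivalent direct construction (hypothetical
    # engine and values):
    #
    #   source = BigQueryDataSource(
    #       engine,
    #       Credentials({"credentials": open("/path/to/my/creds.json").read()}),
    #       Params({"project": "my-project-name", "dataset_name": "my_dataset"}),
    #   )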

    def get_table_options(
        self, table_name: str, tables: Optional[TableInfo] = None
    ) -> Dict[str, str]:
        result = super().get_table_options(table_name, tables)
        # Default the remote table name ("tablename" FDW option) to the name
        # of the table being mounted.
        result["tablename"] = result.get("tablename", table_name)
        return result

    def get_server_options(self):
        options: Dict[str, Optional[str]] = {
            "wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
            "db_url": self._build_db_url(),
        }

        # If "schema" is not passed as an FDW option (even when it's already
        # part of the DB URL), SqlAlchemyFdw doesn't schema-qualify tables and
        # server-side cursors don't work for scanning (the whole table gets
        # loaded instead of being scrolled through).
        if "schema" in self.params:
            options["schema"] = self.params["schema"]

        return options
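
    # For illustration, with hypothetical params the options above resolve to
    # roughly:
    #   {"wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
    #    "db_url": "bigquery://my-project-name/my_dataset?credentials_base64=..."}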

    def _build_db_url(self) -> str:
        """Construct the SQLAlchemy BigQuery db_url, e.g. (with hypothetical
        values) bigquery://my-project-name/my_dataset?credentials_base64=..."""

        db_url = f"bigquery://{self.params['project']}/{self.params['dataset_name']}"

        if "credentials" in self.credentials:
            # Pass the credentials JSON to the FDW as a URL-safe base64 string.
            credentials_str = self.credentials["credentials"]
            credentials_base64 = base64.urlsafe_b64encode(credentials_str.encode()).decode()
            db_url += f"?credentials_base64={credentials_base64}"

        return db_url

    def get_remote_schema_name(self) -> str:
        if "dataset_name" not in self.params:
            raise ValueError("Cannot IMPORT FOREIGN SCHEMA without a dataset_name!")
        return str(self.params["dataset_name"])