Commit ef4c785

Merge pull request #602 from splitgraph/bugfix/cli-csv-ingestion
Various fixes for sgr cloud CLIs
2 parents dfdb892 + 85ae2c7 commit ef4c785

File tree: 13 files changed (+253, -20 lines)

splitgraph/cloud/project/dbt.py (+2 -1)

```diff
@@ -78,6 +78,7 @@ def generate_dbt_plugin_params(repositories: List[str]) -> Tuple[Dict[str, Any],
     # the Git pull URL at action runtime (using GITHUB_TOKEN).
     credentials = {"git_url": "$THIS_REPO_URL"}

-    params = {"sources": [_make_source(r) for r in repositories]}
+    # Same with the branch: we want to inject the current SHA we're running the action for.
+    params = {"sources": [_make_source(r) for r in repositories], "git_branch": "$THIS_SHA"}

     return params, credentials
```
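For reference, a minimal sketch of the values this function now produces for a single upstream repository. `_make_source` is simplified to a plain dict here (field names follow the generated `splitgraph.yml` snapshot further down); both placeholders are substituted by the generated GitHub Actions workflow at run time.

```python
# Simplified sketch of generate_dbt_plugin_params after this change; not the library code.
from typing import Any, Dict, List, Tuple


def sketch_dbt_plugin_params(repositories: List[str]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    def make_source(repo: str) -> Dict[str, str]:
        # Simplified stand-in for _make_source()
        namespace, repository = repo.split("/", 1)
        return {"namespace": namespace, "repository": repository, "hash_or_tag": "latest"}

    # $THIS_REPO_URL is rewritten to an authenticated clone URL at action runtime;
    # $THIS_SHA (new in this commit) is rewritten to the commit the action runs for.
    credentials = {"git_url": "$THIS_REPO_URL"}
    params = {"sources": [make_source(r) for r in repositories], "git_branch": "$THIS_SHA"}
    return params, credentials


print(sketch_dbt_plugin_params(["myns/airbyte-postgres"]))
```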

splitgraph/cloud/project/github_actions.py (+3 -2)

```diff
@@ -30,7 +30,8 @@ def generate_job(
         {
             "name": "Set up dbt Git URL",
             "run": 'echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && '
-            'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml',
+            'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml && '
+            'sed -i "s|\\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml',
             "shell": "bash",
             "env": {
                 "CREDENTIALS_YML": "${{secrets.SPLITGRAPH_CREDENTIALS_YML}}",
@@ -54,7 +55,7 @@ def generate_job(
     steps.append(
         {
             "name": "Run sgr cloud load to set up metadata and data source settings",
-            "run": "sgr cloud load --remote splitgraph "
+            "run": "sgr cloud load --remote splitgraph --initial-private "
             f"-f splitgraph.yml -f splitgraph.credentials.yml {repository}",
             "shell": "bash",
         }
```
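At action run time, the generated "Set up dbt Git URL" step now performs two substitutions instead of one. Below is a rough Python equivalent of what that bash/sed one-liner does, for illustration only; file names and environment variables are the ones referenced by the generated step, not a new API.

```python
import os
from pathlib import Path


def rewrite_placeholders() -> None:
    # Write splitgraph.credentials.yml from the secret and inject an authenticated Git URL.
    creds = os.environ["CREDENTIALS_YML"].replace(
        "$THIS_REPO_URL",
        "https://{actor}:{token}@github.com/{repo}".format(
            actor=os.environ["GITHUB_ACTOR"],
            token=os.environ["GITHUB_TOKEN"],
            repo=os.environ["GITHUB_REPOSITORY"],
        ),
    )
    Path("splitgraph.credentials.yml").write_text(creds)

    # New in this commit: pin the dbt source to the exact commit being built.
    project = Path("splitgraph.yml")
    project.write_text(project.read_text().replace("$THIS_SHA", os.environ["GITHUB_SHA"]))
```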

splitgraph/cloud/project/templates.py (+113 -1)

````diff
@@ -67,7 +67,7 @@
     # Here as a starting point. You can reference these models downstream in models that actually
     # materialize as tables.
     staging:
-      +materialized: cte
+      +materialized: ephemeral
 """

 SOURCES_YML_TEMPLATE = """# This file defines all data sources referenced by this model. The mapping
@@ -132,6 +132,118 @@
 "SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
 your deployment URL if you're on a private deployment).

+### Edit `splitgraph.yml`
+
+We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
+parameters JSONSchema. You should review it and add suitable plugin settings:
+
+- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
+  options of the data source (by default, it adds a sample table into the project file)
+- change and customize the `metadata` block
+- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
+  and offers a list of alternative subobjects, choose one entry from the list and delete
+  the list itself, leaving the object at the top level.
+
+Example:
+
+```yaml
+- namespace: my_namespace
+  repository: csv
+  # Catalog-specific metadata for the repository. Optional.
+  metadata:
+    readme:
+      text: Readme
+    description: Description of the repository
+    topics:
+    - sample_topic
+  # Data source settings for the repository. Optional.
+  external:
+    # Name of the credential that the plugin uses. This can also be a credential_id if the
+    # credential is already registered on Splitgraph.
+    credential: csv
+    plugin: csv
+    # Plugin-specific parameters matching the plugin's parameters schema
+    params:
+      connection: # Choose one of:
+      - connection_type: http # REQUIRED. Constant
+        url: '' # REQUIRED. HTTP URL to the CSV file
+      - connection_type: s3 # REQUIRED. Constant
+        s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
+        s3_bucket: '' # REQUIRED. Bucket the object is in
+        s3_region: '' # Region of the S3 bucket
+        s3_secure: false # Whether to use HTTPS for S3 access
+        s3_object: '' # Limit the import to a single object
+        s3_object_prefix: '' # Prefix for object in S3 bucket
+      autodetect_header: true # Detect whether the CSV file has a header automatically
+      autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
+      autodetect_encoding: true # Detect the CSV file's encoding automatically
+      autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
+      schema_inference_rows: 100000 # Number of rows to use for schema inference
+      encoding: utf-8 # Encoding of the CSV file
+      ignore_decode_errors: false # Ignore errors when decoding the file
+      header: true # First line of the CSV file is its header
+      delimiter: ',' # Character used to separate fields in the file
+      quotechar: '"' # Character used to quote fields
+    tables:
+      sample_table:
+        # Plugin-specific table parameters matching the plugin's schema
+        options:
+          url: '' # HTTP URL to the CSV file
+          s3_object: '' # S3 object of the CSV file
+          autodetect_header: true # Detect whether the CSV file has a header automatically
+          autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
+          autodetect_encoding: true # Detect the CSV file's encoding automatically
+          autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
+          schema_inference_rows: 100000 # Number of rows to use for schema inference
+          encoding: utf-8 # Encoding of the CSV file
+          ignore_decode_errors: false # Ignore errors when decoding the file
+          header: true # First line of the CSV file is its header
+          delimiter: ',' # Character used to separate fields in the file
+          quotechar: '"' # Character used to quote fields
+        # Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
+        schema: []
+    # Whether live querying is enabled for the plugin (creates a "live" tag in the
+    # repository proxying to the data source). The plugin must support live querying.
+    is_live: true
+    # Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
+    # to trigger ingestion.
+    schedule:
+```
+
+becomes:
+
+```yaml
+- namespace: my_namespace
+  repository: csv
+  metadata:
+    readme:
+      text: Readme
+    description: Description of the repository
+    topics:
+    - sample_topic
+  external:
+    # No credential required since we're querying a CSV file over HTTP
+    plugin: csv
+    # Plugin-specific parameters matching the plugin's parameters schema
+    params:
+      connection:
+        connection_type: http # REQUIRED. Constant
+        url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
+      autodetect_header: true # Detect whether the CSV file has a header automatically
+      autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
+      autodetect_encoding: true # Detect the CSV file's encoding automatically
+      autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
+      schema_inference_rows: 100000 # Number of rows to use for schema inference
+      encoding: utf-8 # Encoding of the CSV file
+      ignore_decode_errors: false # Ignore errors when decoding the file
+      header: true # First line of the CSV file is its header
+      delimiter: ',' # Character used to separate fields in the file
+      quotechar: '"' # Character used to quote fields
+    # Automatically infer table parameters
+    tables: {}
+    is_live: true
+```
+
 ### Set up GitHub Actions

 Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
````

splitgraph/commandline/cloud.py (+4 -4)

```diff
@@ -632,15 +632,15 @@ def load_c(

     repo_yaml = load_project(repositories_file)
     repositories = repo_yaml.repositories
+    if limit_repositories:
+        repositories = [
+            r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
+        ]

     gql_client = GQLAPIClient(remote)

     if not skip_external:
         rest_client = RESTAPIClient(remote)
-        if limit_repositories:
-            repositories = [
-                r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
-            ]

         filter_credential_names = [
             r.external.credential for r in repositories if r.external and r.external.credential
```
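The filter on the repositories passed on the command line (the `limit_repositories` argument) now runs before the `skip_external` branch, so metadata-only loads respect it too. A self-contained sketch of the filtering predicate, with `Repo` as a stand-in for the project-file model:

```python
from typing import List, NamedTuple, Sequence


class Repo(NamedTuple):
    # Stand-in for the repository entries parsed from splitgraph.yml
    namespace: str
    repository: str


def limit(repositories: Sequence[Repo], limit_repositories: Sequence[str]) -> List[Repo]:
    if not limit_repositories:
        return list(repositories)
    return [r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories]


repos = [Repo("myns", "postgres_fdw"), Repo("myns", "airbyte-postgres")]
print(limit(repos, ["myns/postgres_fdw"]))  # only myns/postgres_fdw remains
```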

splitgraph/commandline/ingestion.py (+3 -1)

```diff
@@ -142,7 +142,9 @@ def csv_import(
         sample = [[str(i) for i in range(len(sample))]] + sample

     type_overrides = dict(override_type or [])
-    sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
+    sg_schema = infer_sg_schema(
+        sample, override_types=type_overrides, primary_keys=primary_key, ignore_empty_strings=False
+    )
     logging.debug("Using Splitgraph schema: %r", sg_schema)

     # Reset the stream and pass it to COPY FROM STDIN
```
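`sgr csv import` now tells the schema inferrer not to skip blank cells, presumably so that a column containing empty strings falls back to a text type rather than a stricter type that the raw values would later violate. The converter behaviour on empty strings is just standard Python:

```python
# Why blank cells matter for inference: the numeric converters reject "".
for converter in (int, float):
    try:
        converter("")
    except ValueError as exc:
        print(f"{converter.__name__}(''): {exc}")
# int(''): invalid literal for int() with base 10: ''
# float(''): could not convert string to float: ''
```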

splitgraph/ingestion/inference.py (+6 -3)

```diff
@@ -48,12 +48,12 @@ def parse_json(json_s: str):
 ]


-def _infer_column_schema(column_sample: Sequence[str]) -> str:
+def _infer_column_schema(column_sample: Sequence[str], ignore_empty_strings=True) -> str:
     for candidate, converter in _CONVERTERS:
         try:
             seen_value = False
             for c in column_sample:
-                if c == "" or c is None:
+                if (c == "" and ignore_empty_strings) or c is None:
                     continue

                 seen_value = True
@@ -73,6 +73,7 @@ def infer_sg_schema(
     sample: Sequence[List[str]],
     override_types: Optional[Dict[str, str]] = None,
     primary_keys: Optional[List[str]] = None,
+    ignore_empty_strings: bool = True,
 ):
     override_types = override_types or {}
     primary_keys = primary_keys or []
@@ -92,7 +93,9 @@
     )

     for i, (c_name, c_sample) in enumerate(zip(header, columns)):
-        pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
+        pg_type = override_types.get(
+            c_name, _infer_column_schema(c_sample, ignore_empty_strings=ignore_empty_strings)
+        )

         result.append(
             TableColumn(
```
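A hedged usage sketch of the new flag (assumes the `splitgraph` package is installed; the types noted in the comments are expectations based on the inference logic above, not captured output). The first row of `sample` is the header:

```python
from splitgraph.ingestion.inference import infer_sg_schema

sample = [
    ["id", "price"],
    ["1", "1.99"],
    ["2", ""],      # blank cell in the "price" column
    ["3", "3.50"],
]

# Default: blank cells are skipped, so "price" should still infer as a numeric type.
lenient = infer_sg_schema(sample)

# What `sgr csv import` now does: the blank cell fails numeric conversion,
# so "price" should fall back to a text type.
strict = infer_sg_schema(sample, ignore_empty_strings=False)

print(lenient)
print(strict)
```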

test/splitgraph/cloud/project/snapshots/test_dbt/test_generate_dbt_project/splitgraph_template/dbt_project.yml (+1 -1)

```diff
@@ -30,4 +30,4 @@ models:
     # Here as a starting point. You can reference these models downstream in models that actually
     # materialize as tables.
     staging:
-      +materialized: cte
+      +materialized: ephemeral
```

test/splitgraph/cloud/project/snapshots/test_generation/test_generate_project_no_dbt/generate_project/.github/workflows/build.yml (+2 -2)

```diff
@@ -18,8 +18,8 @@ jobs:
         env:
           CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
       - name: Run sgr cloud load to set up metadata and data source settings
-        run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
-          myns/postgres_fdw
+        run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
+          -f splitgraph.credentials.yml myns/postgres_fdw
         shell: bash
   myns_airbyte_postgres:
     name: Build myns/airbyte-postgres
```

test/splitgraph/cloud/project/snapshots/test_generation/test_generate_project_with_dbt/generate_project_dbt/.github/workflows/build.yml (+3 -3)

```diff
@@ -18,8 +18,8 @@ jobs:
         env:
           CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
       - name: Run sgr cloud load to set up metadata and data source settings
-        run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
-          myns/postgres_fdw
+        run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
+          -f splitgraph.credentials.yml myns/postgres_fdw
         shell: bash
   myns_airbyte_postgres:
     name: Build myns/airbyte-postgres
@@ -58,7 +58,7 @@ jobs:
           splitgraph_api_secret: ${{ secrets.SPLITGRAPH_API_SECRET }}
       - name: Set up dbt Git URL
         run: echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && sed -i "s|\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g"
-          splitgraph.credentials.yml
+          splitgraph.credentials.yml && sed -i "s|\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml
         shell: bash
         env:
           CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
```

test/splitgraph/cloud/project/snapshots/test_generation/test_generate_project_with_dbt/generate_project_dbt/README.md (+112)

````diff
@@ -39,6 +39,118 @@ repository and create the following secrets:
 "SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
 your deployment URL if you're on a private deployment).

+### Edit `splitgraph.yml`
+
+We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
+parameters JSONSchema. You should review it and add suitable plugin settings:
+
+- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
+  options of the data source (by default, it adds a sample table into the project file)
+- change and customize the `metadata` block
+- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
+  and offers a list of alternative subobjects, choose one entry from the list and delete
+  the list itself, leaving the object at the top level.
+
+Example:
+
+```yaml
+- namespace: my_namespace
+  repository: csv
+  # Catalog-specific metadata for the repository. Optional.
+  metadata:
+    readme:
+      text: Readme
+    description: Description of the repository
+    topics:
+    - sample_topic
+  # Data source settings for the repository. Optional.
+  external:
+    # Name of the credential that the plugin uses. This can also be a credential_id if the
+    # credential is already registered on Splitgraph.
+    credential: csv
+    plugin: csv
+    # Plugin-specific parameters matching the plugin's parameters schema
+    params:
+      connection: # Choose one of:
+      - connection_type: http # REQUIRED. Constant
+        url: '' # REQUIRED. HTTP URL to the CSV file
+      - connection_type: s3 # REQUIRED. Constant
+        s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
+        s3_bucket: '' # REQUIRED. Bucket the object is in
+        s3_region: '' # Region of the S3 bucket
+        s3_secure: false # Whether to use HTTPS for S3 access
+        s3_object: '' # Limit the import to a single object
+        s3_object_prefix: '' # Prefix for object in S3 bucket
+      autodetect_header: true # Detect whether the CSV file has a header automatically
+      autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
+      autodetect_encoding: true # Detect the CSV file's encoding automatically
+      autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
+      schema_inference_rows: 100000 # Number of rows to use for schema inference
+      encoding: utf-8 # Encoding of the CSV file
+      ignore_decode_errors: false # Ignore errors when decoding the file
+      header: true # First line of the CSV file is its header
+      delimiter: ',' # Character used to separate fields in the file
+      quotechar: '"' # Character used to quote fields
+    tables:
+      sample_table:
+        # Plugin-specific table parameters matching the plugin's schema
+        options:
+          url: '' # HTTP URL to the CSV file
+          s3_object: '' # S3 object of the CSV file
+          autodetect_header: true # Detect whether the CSV file has a header automatically
+          autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
+          autodetect_encoding: true # Detect the CSV file's encoding automatically
+          autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
+          schema_inference_rows: 100000 # Number of rows to use for schema inference
+          encoding: utf-8 # Encoding of the CSV file
+          ignore_decode_errors: false # Ignore errors when decoding the file
+          header: true # First line of the CSV file is its header
+          delimiter: ',' # Character used to separate fields in the file
+          quotechar: '"' # Character used to quote fields
+        # Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
+        schema: []
+    # Whether live querying is enabled for the plugin (creates a "live" tag in the
+    # repository proxying to the data source). The plugin must support live querying.
+    is_live: true
+    # Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
+    # to trigger ingestion.
+    schedule:
+```
+
+becomes:
+
+```yaml
+- namespace: my_namespace
+  repository: csv
+  metadata:
+    readme:
+      text: Readme
+    description: Description of the repository
+    topics:
+    - sample_topic
+  external:
+    # No credential required since we're querying a CSV file over HTTP
+    plugin: csv
+    # Plugin-specific parameters matching the plugin's parameters schema
+    params:
+      connection:
+        connection_type: http # REQUIRED. Constant
+        url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
+      autodetect_header: true # Detect whether the CSV file has a header automatically
+      autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
+      autodetect_encoding: true # Detect the CSV file's encoding automatically
+      autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
+      schema_inference_rows: 100000 # Number of rows to use for schema inference
+      encoding: utf-8 # Encoding of the CSV file
+      ignore_decode_errors: false # Ignore errors when decoding the file
+      header: true # First line of the CSV file is its header
+      delimiter: ',' # Character used to separate fields in the file
+      quotechar: '"' # Character used to quote fields
+    # Automatically infer table parameters
+    tables: {}
+    is_live: true
+```
+
 ### Set up GitHub Actions

 Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
````

test/splitgraph/cloud/project/snapshots/test_generation/test_generate_project_with_dbt/generate_project_dbt/dbt_project.yml (+1 -1)

```diff
@@ -30,4 +30,4 @@ models:
     # Here as a starting point. You can reference these models downstream in models that actually
    # materialize as tables.
     staging:
-      +materialized: cte
+      +materialized: ephemeral
```

test/splitgraph/cloud/project/snapshots/test_generation/test_generate_project_with_dbt/generate_project_dbt/splitgraph.yml (+1)

```diff
@@ -96,6 +96,7 @@ repositories:
       - namespace: myns
         repository: airbyte-postgres
         hash_or_tag: latest
+      git_branch: $THIS_SHA
     is_live: false
     tables: {}
   metadata:
```
