diff --git a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
new file mode 100644
index 0000000..5072383
--- /dev/null
+++ b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
@@ -0,0 +1,47 @@
+# Generate Parquet files from Jaffle Shop CSV data
+
+## Prerequisites
+
+- `pipx`
+- `gcloud` CLI
+
+The `convert_jaffle_csv_to_parquet.py` script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake.
+
+### Dependencies
+
+Change into the `integration/jaffle-shop-data` directory and, with `uv` installed, install the Python dependencies:
+
+```bash
+cd integration/jaffle-shop-data/
+uv sync
+```
+
+## Generate Jaffle Shop Data (CSV)
+
+To generate the Jaffle Shop CSV data,
+run the following command (in the `jaffle-shop-data` directory):
+
+```bash
+pipx run jafgen 6
+```
+
+This will create the necessary CSV files (6 years of data) in the `jaffle-data` directory.
+
+## Convert CSV to Parquet
+
+To convert the generated CSV files to Parquet format, run the following script:
+
+```bash
+uv run python convert_jaffle_csv_to_parquet.py
+```
+
+This will read each CSV file from the `jaffle-data` directory and save the corresponding Parquet file in the `jaffle-data/parquet` directory.
+
+## Upload Parquet Files to GCP
+
+To upload the Parquet files to your GCP bucket, use the following commands:
+
+```bash
+gcloud config set project getml-infra
+gcloud storage cp jaffle-data/parquet/*.parquet gs://static.getml.com/datasets/jaffle_shop/
+```
diff --git a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
new file mode 100644
index 0000000..742ba8c
--- /dev/null
+++ b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
@@ -0,0 +1,58 @@
+"""Convert the Jaffle Shop CSV files to Parquet format.
+
+Paths are relative to the current working directory: CSVs are read from
+`jaffle-data/` and one Parquet file per CSV is written to
+`jaffle-data/parquet/`.
+"""
+
+from pathlib import Path
+
+import pandas as pd
+
+# Base names (no extension) of the CSV files to convert.
+NAMES: list[str] = [
+    "raw_customers",
+    "raw_items",
+    "raw_orders",
+    "raw_products",
+    "raw_stores",
+    "raw_supplies",
+    "raw_tweets",
+]
+
+JAFFLE_CSV_DATA_PATH = Path("jaffle-data")
+
+if not JAFFLE_CSV_DATA_PATH.exists():
+    raise FileNotFoundError(
+        f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist."
+        " Please run `pipx run jafgen 6` to generate CSVs. (6 years)"
+    )
+
+# Check every input up front so a missing CSV fails before any Parquet file
+# is written, instead of aborting halfway through the loop below.
+missing_csv_filepaths = [
+    str(JAFFLE_CSV_DATA_PATH / f"{name}.csv")
+    for name in NAMES
+    if not (JAFFLE_CSV_DATA_PATH / f"{name}.csv").is_file()
+]
+if missing_csv_filepaths:
+    raise FileNotFoundError(
+        f"Missing Jaffle CSV file(s): {', '.join(missing_csv_filepaths)}."
+        " Please run `pipx run jafgen 6` to generate CSVs. (6 years)"
+    )
+
+JAFFLE_PARQUET_DATA_PATH: Path = JAFFLE_CSV_DATA_PATH / "parquet"
+JAFFLE_PARQUET_DATA_PATH.mkdir(parents=True, exist_ok=True)
+
+
+for name in NAMES:
+    csv_filepath = JAFFLE_CSV_DATA_PATH / f"{name}.csv"
+    parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet"
+    print(f"Loading {csv_filepath}...")
+
+    df: pd.DataFrame = pd.read_csv(csv_filepath)
+
+    # 'index=False' prevents adding an extra index column
+    df.to_parquet(parquet_filepath, index=False)
+
+    print(f"Converted {name} to parquet format at {parquet_filepath}.")
diff --git a/integration/jaffle-shop-data/pyproject.toml b/integration/jaffle-shop-data/pyproject.toml
new file mode 100644
index 0000000..b11b3b0
--- /dev/null
+++ b/integration/jaffle-shop-data/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "jaffle-shop-parquet-generator"
+version = "0.1.0"
+description = "Convert Jaffle Shop CSV data to Parquet format"
+requires-python = ">=3.12"
+
+dependencies = [
+    "pandas>=2.0.0",
+    "pyarrow>=14.0.0",
+]
+
+[tool.uv]
+package = false