From dee36e82bc37db7eef95373f02783c799b3e7dd9 Mon Sep 17 00:00:00 2001 From: Alexandros Ladas Date: Fri, 5 Dec 2025 16:55:58 +0100 Subject: [PATCH 1/2] Add script to generate and convert Jaffle Shop CSV data to Parquet format and instructions to upload to GCS bucket --- .../GENERATE_JAFFLE_SHOP_PARQUET.md | 37 +++++ .../convert_jaffle_csv_to_parquet.py | 39 +++++ integration/pyproject.toml | 137 ++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md create mode 100644 integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py create mode 100644 integration/pyproject.toml diff --git a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md new file mode 100644 index 0000000..04b1b9e --- /dev/null +++ b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md @@ -0,0 +1,37 @@ +# Generate Parquet files from Jaffle Shop CSV data + +## Prerequisites + +- pipx +- gcloud CLI + +This script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake. + +## Generate Jaffle Shop Data (CSV) + +To generate the Jaffle Shop CSV data, run the following command: + +```bash +pipx run jafgen 6 +``` + +This will create the necessary CSV files in the `jaffle-data` directory. + +## Convert CSV to Parquet + +To convert the generated CSV files to Parquet format, run the following script: + +```bash +python convert_jaffle_csv_to_parquet.py +``` + +This will read each CSV file from the `jaffle-data` directory and save the corresponding Parquet files in the `jaffle-data/parquet` directory. + +## Upload Parquet Files to GCP + +To upload the Parquet files to your GCP bucket, use the following commands: + +```bash +gcloud config set project getml-infra +gcloud storage cp jaffle-data/parquet/*.parquet gs://static.getml.com/datasets/jaffle_shop/ +``` diff --git a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py new file mode 100644 index 0000000..e97ae58 --- /dev/null +++ b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py @@ -0,0 +1,39 @@ +from pathlib import Path + +import pandas as pd + +NAMES: list[str] = [ + "raw_customers", + "raw_items", + "raw_orders", + "raw_products", + "raw_stores", + "raw_supplies", + "raw_tweets", +] + +JAFFLE_CSV_DATA_PATH = Path("jaffle-data") + +if not JAFFLE_CSV_DATA_PATH.exists(): + raise FileNotFoundError( + f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist." + " Please run `jafgen` to generate CSVs." + ) + +JAFFLE_PARQUET_DATA_PATH = JAFFLE_CSV_DATA_PATH / "parquet" +Path.mkdir(JAFFLE_PARQUET_DATA_PATH, exist_ok=True) + + +for name in NAMES: + csv_filepath = JAFFLE_CSV_DATA_PATH / f"{name}.csv" + parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet" + print(f"Loading {csv_filepath}...") + + # 1. Read CSV into memory + df: pd.DataFrame = pd.read_csv(csv_filepath) + + # 2. Write DataFrame to Parquet + # 'index=False' prevents pandas from adding an extra index column + df.to_parquet(parquet_filepath, index=False) + + print(f"Converted {name} to parquet format at {parquet_filepath}.") diff --git a/integration/pyproject.toml b/integration/pyproject.toml new file mode 100644 index 0000000..0eee2da --- /dev/null +++ b/integration/pyproject.toml @@ -0,0 +1,137 @@ +[project] +name = "getml-featurestore-integrations" +version = "0.1.0" +description = "Integrations and Data Preparation for getML Feature Stores" +authors = [ + { name = "Code17 GmbH", email = "hello@code17.io" }, + { name = "getML", email = "hello@getml.com" }, +] +maintainers = [ + { name = "Code17 GmbH", email = "hello@code17.io" }, + { name = "getML", email = "hello@getml.com" }, +] +license = { text = "Proprietary" } +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: OS Independent", + "Private :: Do Not Upload", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", +] +readme = "README.md" +requires-python = ">=3.12" + +dependencies = [ + "fastparquet>=2024.11.0", + "httpx>=0.27.0", + "ipykernel>=7.1.0", + "pandas>=2.3.3", + "pyarrow>=18.0.0", + "pydantic>=2.12.5", + "pydantic-settings>=2.12.0", + "snowflake-connector-python>=3.17.3", + "snowflake-snowpark-python>=1.42.0", +] + +[dependency-groups] +dev = [ + "ruff~=0.12.2", + "basedpyright~=1.28.4", + "pytest~=8.0.0", + "pytest-cov>=6.2.1", + "pytest-dependency>=0.6.0", +] + +[tool.uv] +package = false + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +markers = [ + "integration: marks tests as integration tests (require Snowflake credentials)", +] + +[project.urls] +"Homepage" = "https://github.com/getml/getml-demo" +"Bug Tracker" = "https://github.com/getml/getml-demo/issues" +"getML" = "https://getml.com" +"Code17 GmbH" = "https://www.code17.io/" + +[tool.pyright] +venvPath = "." +venv = ".venv" +reportMissingTypeStubs = false +reportImplicitStringConcatenation = false + +[[tool.pyright.executionEnvironments]] +root = "tests" +extraPaths = ["."] +reportUnusedParameter = false + +[build-system] +requires = ["uv_build>=0.7.21,<0.8.0"] +build-backend = "uv_build" + +[tool.ruff] +line-length = 88 +target-version = "py312" + +[tool.ruff.format] +preview = false +quote-style = "double" +line-ending = "auto" +docstring-code-format = true + +[tool.ruff.lint] +select = ["ALL"] +ignore = [ + # Allow for string literals in exceptions + "EM", + # Allow missing copyright notice at top of files + "CPY001", + # Allow missing docstrings in public modules + "D100", + # Allow missing docstrings in public classes + "D101", + # Allow missing docstrings in public packages + "D104", + # Allow docstrings without blank line before class docstring + "D203", + # Allow multi-line docstring summary to start at second line + "D213", + # Allow first-party imports outside type-checking blocks + "TC001", + # Allow third-party imports outside type-checking blocks + "TC002", + # Allow standard library imports outside type-checking blocks + "TC003", + # Allow TODO comments + "FIX002", + # Allow TODO comments without author + "TD002", + # Allow TODO comments without link to issue + "TD003", + # Allow specifying long messages outside the exception class + "TRY003", + # Conflicts with formatter - trailing commas are handled by ruff format + "COM812", +] + +fixable = ["ALL"] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +# S101: Allow for use of the assert keyword +# PLR2004: Allow "magic value" used in comparison +"test_*.py" = ["S101", "PLR2004"] From c92e44a80b9640e7f8cd0ce9564e10500d5b500d Mon Sep 17 00:00:00 2001 From: Alexandros Ladas Date: Sun, 14 Dec 2025 22:21:31 +0100 Subject: [PATCH 2/2] Refactor Jaffle Shop data generation scripts and update project configuration --- .../GENERATE_JAFFLE_SHOP_PARQUET.md | 18 ++- .../convert_jaffle_csv_to_parquet.py | 10 +- integration/jaffle-shop-data/pyproject.toml | 13 ++ integration/pyproject.toml | 137 ------------------ 4 files changed, 31 insertions(+), 147 deletions(-) create mode 100644 integration/jaffle-shop-data/pyproject.toml delete mode 100644 integration/pyproject.toml diff --git a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md index 04b1b9e..5072383 100644 --- a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md +++ b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md @@ -2,14 +2,24 @@ ## Prerequisites -- pipx -- gcloud CLI +- `pipx` +- `gcloud` CLI This script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake. +### Dependencies + +Ensure you are in the `integration/jaffle-shop-data` directory and have `uv` set up: + +```bash +cd integration/jaffle-shop-data/ +uv sync +``` + ## Generate Jaffle Shop Data (CSV) -To generate the Jaffle Shop CSV data, run the following command: +To generate the Jaffle Shop CSV data, +run the following command (in `jaffle-shop-data` directory): ```bash pipx run jafgen 6 @@ -22,7 +32,7 @@ This will create the necessary CSV files in the `jaffle-data` directory. To convert the generated CSV files to Parquet format, run the following script: ```bash -python convert_jaffle_csv_to_parquet.py +uv run python convert_jaffle_csv_to_parquet.py ``` This will read each CSV file from the `jaffle-data` directory and save the corresponding Parquet files in the `jaffle-data/parquet` directory. diff --git a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py index e97ae58..742ba8c 100644 --- a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py +++ b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py @@ -17,11 +17,11 @@ if not JAFFLE_CSV_DATA_PATH.exists(): raise FileNotFoundError( f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist." - " Please run `jafgen` to generate CSVs." + " Please run `pipx run jafgen 6` to generate CSVs. (6 years)" ) -JAFFLE_PARQUET_DATA_PATH = JAFFLE_CSV_DATA_PATH / "parquet" -Path.mkdir(JAFFLE_PARQUET_DATA_PATH, exist_ok=True) +JAFFLE_PARQUET_DATA_PATH: Path = JAFFLE_CSV_DATA_PATH / "parquet" +JAFFLE_PARQUET_DATA_PATH.mkdir(parents=True, exist_ok=True) for name in NAMES: @@ -29,11 +29,9 @@ parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet" print(f"Loading {csv_filepath}...") - # 1. Read CSV into memory df: pd.DataFrame = pd.read_csv(csv_filepath) - # 2. Write DataFrame to Parquet - # 'index=False' prevents pandas from adding an extra index column + # 'index=False' prevents adding an extra index column df.to_parquet(parquet_filepath, index=False) print(f"Converted {name} to parquet format at {parquet_filepath}.") diff --git a/integration/jaffle-shop-data/pyproject.toml b/integration/jaffle-shop-data/pyproject.toml new file mode 100644 index 0000000..b11b3b0 --- /dev/null +++ b/integration/jaffle-shop-data/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "jaffle-shop-parquet-generator" +version = "0.1.0" +description = "Convert Jaffle Shop CSV data to Parquet format" +requires-python = ">=3.12" + +dependencies = [ + "pandas>=2.0.0", + "pyarrow>=14.0.0", +] + +[tool.uv] +package = false diff --git a/integration/pyproject.toml b/integration/pyproject.toml deleted file mode 100644 index 0eee2da..0000000 --- a/integration/pyproject.toml +++ /dev/null @@ -1,137 +0,0 @@ -[project] -name = "getml-featurestore-integrations" -version = "0.1.0" -description = "Integrations and Data Preparation for getML Feature Stores" -authors = [ - { name = "Code17 GmbH", email = "hello@code17.io" }, - { name = "getML", email = "hello@getml.com" }, -] -maintainers = [ - { name = "Code17 GmbH", email = "hello@code17.io" }, - { name = "getML", email = "hello@getml.com" }, -] -license = { text = "Proprietary" } -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Operating System :: OS Independent", - "Private :: Do Not Upload", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", - "Topic :: Software Development :: Libraries :: Python Modules", -] -readme = "README.md" -requires-python = ">=3.12" - -dependencies = [ - "fastparquet>=2024.11.0", - "httpx>=0.27.0", - "ipykernel>=7.1.0", - "pandas>=2.3.3", - "pyarrow>=18.0.0", - "pydantic>=2.12.5", - "pydantic-settings>=2.12.0", - "snowflake-connector-python>=3.17.3", - "snowflake-snowpark-python>=1.42.0", -] - -[dependency-groups] -dev = [ - "ruff~=0.12.2", - "basedpyright~=1.28.4", - "pytest~=8.0.0", - "pytest-cov>=6.2.1", - "pytest-dependency>=0.6.0", -] - -[tool.uv] -package = false - -[tool.pytest.ini_options] -testpaths = ["tests"] -python_files = ["test_*.py"] -python_classes = ["Test*"] -python_functions = ["test_*"] -markers = [ - "integration: marks tests as integration tests (require Snowflake credentials)", -] - -[project.urls] -"Homepage" = "https://github.com/getml/getml-demo" -"Bug Tracker" = "https://github.com/getml/getml-demo/issues" -"getML" = "https://getml.com" -"Code17 GmbH" = "https://www.code17.io/" - -[tool.pyright] -venvPath = "." -venv = ".venv" -reportMissingTypeStubs = false -reportImplicitStringConcatenation = false - -[[tool.pyright.executionEnvironments]] -root = "tests" -extraPaths = ["."] -reportUnusedParameter = false - -[build-system] -requires = ["uv_build>=0.7.21,<0.8.0"] -build-backend = "uv_build" - -[tool.ruff] -line-length = 88 -target-version = "py312" - -[tool.ruff.format] -preview = false -quote-style = "double" -line-ending = "auto" -docstring-code-format = true - -[tool.ruff.lint] -select = ["ALL"] -ignore = [ - # Allow for string literals in exceptions - "EM", - # Allow missing copyright notice at top of files - "CPY001", - # Allow missing docstrings in public modules - "D100", - # Allow missing docstrings in public classes - "D101", - # Allow missing docstrings in public packages - "D104", - # Allow docstrings without blank line before class docstring - "D203", - # Allow multi-line docstring summary to start at second line - "D213", - # Allow first-party imports outside type-checking blocks - "TC001", - # Allow third-party imports outside type-checking blocks - "TC002", - # Allow standard library imports outside type-checking blocks - "TC003", - # Allow TODO comments - "FIX002", - # Allow TODO comments without author - "TD002", - # Allow TODO comments without link to issue - "TD003", - # Allow specifying long messages outside the exception class - "TRY003", - # Conflicts with formatter - trailing commas are handled by ruff format - "COM812", -] - -fixable = ["ALL"] - -[tool.ruff.lint.pydocstyle] -convention = "google" - -[tool.ruff.lint.per-file-ignores] -# S101: Allow for use of the assert keyword -# PLR2004: Allow "magic value" used in comparison -"test_*.py" = ["S101", "PLR2004"]