From dee36e82bc37db7eef95373f02783c799b3e7dd9 Mon Sep 17 00:00:00 2001
From: Alexandros Ladas <alexandros@getml.com>
Date: Fri, 5 Dec 2025 16:55:58 +0100
Subject: [PATCH 1/2] Add script to generate and convert Jaffle Shop CSV data
 to Parquet format and instructions to upload to GCS bucket

---
 .../GENERATE_JAFFLE_SHOP_PARQUET.md           |  37 +++++
 .../convert_jaffle_csv_to_parquet.py          |  39 +++++
 integration/pyproject.toml                    | 137 ++++++++++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
 create mode 100644 integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
 create mode 100644 integration/pyproject.toml

diff --git a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
new file mode 100644
index 0000000..04b1b9e
--- /dev/null
+++ b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
@@ -0,0 +1,37 @@
+# Generate Parquet files from Jaffle Shop CSV data
+
+## Prerequisites
+
+- pipx
+- gcloud CLI
+
+This script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake.
+
+## Generate Jaffle Shop Data (CSV)
+
+To generate the Jaffle Shop CSV data, run the following command:
+
+```bash
+pipx run jafgen 6
+```
+
+This will create the necessary CSV files in the `jaffle-data` directory.
+
+## Convert CSV to Parquet
+
+To convert the generated CSV files to Parquet format, run the following script:
+
+```bash
+python convert_jaffle_csv_to_parquet.py
+```
+
+This will read each CSV file from the `jaffle-data` directory and save the corresponding Parquet files in the `jaffle-data/parquet` directory.
+
+## Upload Parquet Files to GCP
+
+To upload the Parquet files to the target Google Cloud Storage (GCS) bucket, use the following commands:
+
+```bash
+gcloud config set project getml-infra
+gcloud storage cp jaffle-data/parquet/*.parquet gs://static.getml.com/datasets/jaffle_shop/
+```
diff --git a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
new file mode 100644
index 0000000..e97ae58
--- /dev/null
+++ b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
@@ -0,0 +1,39 @@
+from pathlib import Path
+
+import pandas as pd
+
+NAMES: list[str] = [
+    "raw_customers",
+    "raw_items",
+    "raw_orders",
+    "raw_products",
+    "raw_stores",
+    "raw_supplies",
+    "raw_tweets",
+]
+
+JAFFLE_CSV_DATA_PATH = Path("jaffle-data")
+
+if not JAFFLE_CSV_DATA_PATH.exists():
+    raise FileNotFoundError(
+        f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist."
+        " Please run `jafgen` to generate CSVs."
+    )
+
+JAFFLE_PARQUET_DATA_PATH = JAFFLE_CSV_DATA_PATH / "parquet"
+Path.mkdir(JAFFLE_PARQUET_DATA_PATH, exist_ok=True)
+
+
+for name in NAMES:
+    csv_filepath = JAFFLE_CSV_DATA_PATH / f"{name}.csv"
+    parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet"
+    print(f"Loading {csv_filepath}...")
+
+    # 1. Read CSV into memory
+    df: pd.DataFrame = pd.read_csv(csv_filepath)
+
+    # 2. Write DataFrame to Parquet
+    # 'index=False' prevents pandas from adding an extra index column
+    df.to_parquet(parquet_filepath, index=False)
+
+    print(f"Converted {name} to parquet format at {parquet_filepath}.")
diff --git a/integration/pyproject.toml b/integration/pyproject.toml
new file mode 100644
index 0000000..0eee2da
--- /dev/null
+++ b/integration/pyproject.toml
@@ -0,0 +1,137 @@
+[project]
+name = "getml-featurestore-integrations"
+version = "0.1.0"
+description = "Integrations and Data Preparation for getML Feature Stores"
+authors = [
+    { name = "Code17 GmbH", email = "hello@code17.io" },
+    { name = "getML", email = "hello@getml.com" },
+]
+maintainers = [
+    { name = "Code17 GmbH", email = "hello@code17.io" },
+    { name = "getML", email = "hello@getml.com" },
+]
+license = { text = "Proprietary" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Operating System :: OS Independent",
+    "Private :: Do Not Upload",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+readme = "README.md"
+requires-python = ">=3.12"
+
+dependencies = [
+    "fastparquet>=2024.11.0",
+    "httpx>=0.27.0",
+    "ipykernel>=7.1.0",
+    "pandas>=2.3.3",
+    "pyarrow>=18.0.0",
+    "pydantic>=2.12.5",
+    "pydantic-settings>=2.12.0",
+    "snowflake-connector-python>=3.17.3",
+    "snowflake-snowpark-python>=1.42.0",
+]
+
+[dependency-groups]
+dev = [
+    "ruff~=0.12.2",
+    "basedpyright~=1.28.4",
+    "pytest~=8.0.0",
+    "pytest-cov>=6.2.1",
+    "pytest-dependency>=0.6.0",
+]
+
+[tool.uv]
+package = false
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+markers = [
+    "integration: marks tests as integration tests (require Snowflake credentials)",
+]
+
+[project.urls]
+"Homepage" = "https://github.com/getml/getml-demo"
+"Bug Tracker" = "https://github.com/getml/getml-demo/issues"
+"getML" = "https://getml.com"
+"Code17 GmbH" = "https://www.code17.io/"
+
+[tool.pyright]
+venvPath = "."
+venv = ".venv"
+reportMissingTypeStubs = false
+reportImplicitStringConcatenation = false
+
+[[tool.pyright.executionEnvironments]]
+root = "tests"
+extraPaths = ["."]
+reportUnusedParameter = false
+
+[build-system]
+requires = ["uv_build>=0.7.21,<0.8.0"]
+build-backend = "uv_build"
+
+[tool.ruff]
+line-length = 88
+target-version = "py312"
+
+[tool.ruff.format]
+preview = false
+quote-style = "double"
+line-ending = "auto"
+docstring-code-format = true
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+    # Allow for string literals in exceptions
+    "EM",
+    # Allow missing copyright notice at top of files
+    "CPY001",
+    # Allow missing docstrings in public modules
+    "D100",
+    # Allow missing docstrings in public classes
+    "D101",
+    # Allow missing docstrings in public packages
+    "D104",
+    # Allow docstrings without blank line before class docstring
+    "D203",
+    # Allow multi-line docstring summary to start at second line
+    "D213",
+    # Allow first-party imports outside type-checking blocks
+    "TC001",
+    # Allow third-party imports outside type-checking blocks
+    "TC002",
+    # Allow standard library imports outside type-checking blocks
+    "TC003",
+    # Allow TODO comments  
+    "FIX002",
+    # Allow TODO comments without author
+    "TD002",
+    # Allow TODO comments without link to issue
+    "TD003",
+    # Allow specifying long messages outside the exception class
+    "TRY003",
+    # Conflicts with formatter - trailing commas are handled by ruff format
+    "COM812",
+]
+
+fixable = ["ALL"]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+# S101: Allow for use of the assert keyword
+# PLR2004: Allow "magic value" used in comparison
+"test_*.py" = ["S101", "PLR2004"]

From c92e44a80b9640e7f8cd0ce9564e10500d5b500d Mon Sep 17 00:00:00 2001
From: Alexandros Ladas <alexandros@getml.com>
Date: Sun, 14 Dec 2025 22:21:31 +0100
Subject: [PATCH 2/2] Refactor Jaffle Shop data generation scripts and update
 project configuration

---
 .../GENERATE_JAFFLE_SHOP_PARQUET.md           |  18 ++-
 .../convert_jaffle_csv_to_parquet.py          |  10 +-
 integration/jaffle-shop-data/pyproject.toml   |  13 ++
 integration/pyproject.toml                    | 137 ------------------
 4 files changed, 31 insertions(+), 147 deletions(-)
 create mode 100644 integration/jaffle-shop-data/pyproject.toml
 delete mode 100644 integration/pyproject.toml

diff --git a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
index 04b1b9e..5072383 100644
--- a/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
+++ b/integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
@@ -2,14 +2,24 @@
 
 ## Prerequisites
 
-- pipx
-- gcloud CLI
+- `pipx`
+- `gcloud` CLI
 
 This script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake.
 
+### Dependencies
+
+Make sure `uv` is installed, then change into the `integration/jaffle-shop-data` directory and sync the dependencies:
+
+```bash
+cd integration/jaffle-shop-data/
+uv sync
+```
+
 ## Generate Jaffle Shop Data (CSV)
 
-To generate the Jaffle Shop CSV data, run the following command:
+To generate the Jaffle Shop CSV data,
+run the following command (in the `jaffle-shop-data` directory):
 
 ```bash
 pipx run jafgen 6
@@ -22,7 +32,7 @@ This will create the necessary CSV files in the `jaffle-data` directory.
 To convert the generated CSV files to Parquet format, run the following script:
 
 ```bash
-python convert_jaffle_csv_to_parquet.py
+uv run python convert_jaffle_csv_to_parquet.py
 ```
 
 This will read each CSV file from the `jaffle-data` directory and save the corresponding Parquet files in the `jaffle-data/parquet` directory.
diff --git a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
index e97ae58..742ba8c 100644
--- a/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
+++ b/integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
@@ -17,11 +17,11 @@
 if not JAFFLE_CSV_DATA_PATH.exists():
     raise FileNotFoundError(
         f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist."
-        " Please run `jafgen` to generate CSVs."
+        " Please run `pipx run jafgen 6` to generate CSVs. (6 years)"
     )
 
-JAFFLE_PARQUET_DATA_PATH = JAFFLE_CSV_DATA_PATH / "parquet"
-Path.mkdir(JAFFLE_PARQUET_DATA_PATH, exist_ok=True)
+JAFFLE_PARQUET_DATA_PATH: Path = JAFFLE_CSV_DATA_PATH / "parquet"
+JAFFLE_PARQUET_DATA_PATH.mkdir(parents=True, exist_ok=True)
 
 
 for name in NAMES:
@@ -29,11 +29,9 @@
     parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet"
     print(f"Loading {csv_filepath}...")
 
-    # 1. Read CSV into memory
     df: pd.DataFrame = pd.read_csv(csv_filepath)
 
-    # 2. Write DataFrame to Parquet
-    # 'index=False' prevents pandas from adding an extra index column
+    # 'index=False': do not write the DataFrame index as an extra column
     df.to_parquet(parquet_filepath, index=False)
 
     print(f"Converted {name} to parquet format at {parquet_filepath}.")
diff --git a/integration/jaffle-shop-data/pyproject.toml b/integration/jaffle-shop-data/pyproject.toml
new file mode 100644
index 0000000..b11b3b0
--- /dev/null
+++ b/integration/jaffle-shop-data/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "jaffle-shop-parquet-generator"
+version = "0.1.0"
+description = "Convert Jaffle Shop CSV data to Parquet format"
+requires-python = ">=3.12"
+
+dependencies = [
+    "pandas>=2.0.0",
+    "pyarrow>=14.0.0",
+]
+
+[tool.uv]
+package = false
diff --git a/integration/pyproject.toml b/integration/pyproject.toml
deleted file mode 100644
index 0eee2da..0000000
--- a/integration/pyproject.toml
+++ /dev/null
@@ -1,137 +0,0 @@
-[project]
-name = "getml-featurestore-integrations"
-version = "0.1.0"
-description = "Integrations and Data Preparation for getML Feature Stores"
-authors = [
-    { name = "Code17 GmbH", email = "hello@code17.io" },
-    { name = "getML", email = "hello@getml.com" },
-]
-maintainers = [
-    { name = "Code17 GmbH", email = "hello@code17.io" },
-    { name = "getML", email = "hello@getml.com" },
-]
-license = { text = "Proprietary" }
-classifiers = [
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.12",
-    "Programming Language :: Python :: 3.13",
-    "Operating System :: OS Independent",
-    "Private :: Do Not Upload",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "Topic :: Scientific/Engineering",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Software Development :: Libraries",
-    "Topic :: Software Development :: Libraries :: Python Modules",
-]
-readme = "README.md"
-requires-python = ">=3.12"
-
-dependencies = [
-    "fastparquet>=2024.11.0",
-    "httpx>=0.27.0",
-    "ipykernel>=7.1.0",
-    "pandas>=2.3.3",
-    "pyarrow>=18.0.0",
-    "pydantic>=2.12.5",
-    "pydantic-settings>=2.12.0",
-    "snowflake-connector-python>=3.17.3",
-    "snowflake-snowpark-python>=1.42.0",
-]
-
-[dependency-groups]
-dev = [
-    "ruff~=0.12.2",
-    "basedpyright~=1.28.4",
-    "pytest~=8.0.0",
-    "pytest-cov>=6.2.1",
-    "pytest-dependency>=0.6.0",
-]
-
-[tool.uv]
-package = false
-
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-python_files = ["test_*.py"]
-python_classes = ["Test*"]
-python_functions = ["test_*"]
-markers = [
-    "integration: marks tests as integration tests (require Snowflake credentials)",
-]
-
-[project.urls]
-"Homepage" = "https://github.com/getml/getml-demo"
-"Bug Tracker" = "https://github.com/getml/getml-demo/issues"
-"getML" = "https://getml.com"
-"Code17 GmbH" = "https://www.code17.io/"
-
-[tool.pyright]
-venvPath = "."
-venv = ".venv"
-reportMissingTypeStubs = false
-reportImplicitStringConcatenation = false
-
-[[tool.pyright.executionEnvironments]]
-root = "tests"
-extraPaths = ["."]
-reportUnusedParameter = false
-
-[build-system]
-requires = ["uv_build>=0.7.21,<0.8.0"]
-build-backend = "uv_build"
-
-[tool.ruff]
-line-length = 88
-target-version = "py312"
-
-[tool.ruff.format]
-preview = false
-quote-style = "double"
-line-ending = "auto"
-docstring-code-format = true
-
-[tool.ruff.lint]
-select = ["ALL"]
-ignore = [
-    # Allow for string literals in exceptions
-    "EM",
-    # Allow missing copyright notice at top of files
-    "CPY001",
-    # Allow missing docstrings in public modules
-    "D100",
-    # Allow missing docstrings in public classes
-    "D101",
-    # Allow missing docstrings in public packages
-    "D104",
-    # Allow docstrings without blank line before class docstring
-    "D203",
-    # Allow multi-line docstring summary to start at second line
-    "D213",
-    # Allow first-party imports outside type-checking blocks
-    "TC001",
-    # Allow third-party imports outside type-checking blocks
-    "TC002",
-    # Allow standard library imports outside type-checking blocks
-    "TC003",
-    # Allow TODO comments  
-    "FIX002",
-    # Allow TODO comments without author
-    "TD002",
-    # Allow TODO comments without link to issue
-    "TD003",
-    # Allow specifying long messages outside the exception class
-    "TRY003",
-    # Conflicts with formatter - trailing commas are handled by ruff format
-    "COM812",
-]
-
-fixable = ["ALL"]
-
-[tool.ruff.lint.pydocstyle]
-convention = "google"
-
-[tool.ruff.lint.per-file-ignores]
-# S101: Allow for use of the assert keyword
-# PLR2004: Allow "magic value" used in comparison
-"test_*.py" = ["S101", "PLR2004"]