From 2ea0638011720bd33b99625a467f2a69fed182a6 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 13 Feb 2025 06:15:29 -0500 Subject: [PATCH 1/2] chore(deps): set lower bound of duckdb to support `delta` extension --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f7a6b45f48e..41d3df3b81d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,7 +97,7 @@ druid = [ "rich>=12.4.4,<14", ] duckdb = [ - "duckdb>=0.10,<2", + "duckdb>=0.10.3,<2", "pyarrow>=10.0.1", "pyarrow-hotfix>=0.4,<1", "numpy>=1.23.2,<3", diff --git a/uv.lock b/uv.lock index 0395f4ec241c..b20690ac1010 100644 --- a/uv.lock +++ b/uv.lock @@ -2270,7 +2270,7 @@ requires-dist = [ { name = "datafusion", marker = "extra == 'datafusion'", specifier = ">=0.6,<45" }, { name = "db-dtypes", marker = "extra == 'bigquery'", specifier = ">=0.3,<2" }, { name = "deltalake", marker = "extra == 'deltalake'", specifier = ">=0.9.0,<1" }, - { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=0.10,<2" }, + { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=0.10.3,<2" }, { name = "fsspec", extras = ["s3"], marker = "extra == 'athena'" }, { name = "geoarrow-types", marker = "extra == 'geospatial'", specifier = ">=0.2,<1" }, { name = "geopandas", marker = "extra == 'geospatial'", specifier = ">=0.6,<2" }, From 7f5f17fe47425f9497e5d07f27c130e4a4aee5fa Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 13 Feb 2025 06:18:26 -0500 Subject: [PATCH 2/2] fix(duckdb): use `delta` for reading deltalake data --- ibis/backends/duckdb/__init__.py | 32 +++++++++++++++--------------- ibis/backends/tests/test_export.py | 6 ------ 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py index 63a21381fc15..dd55b40ac5fd 100644 --- a/ibis/backends/duckdb/__init__.py +++ b/ibis/backends/duckdb/__init__.py @@ -827,43 +827,43 @@ def _read_parquet_pyarrow_dataset( # explicitly. def read_delta( - self, path: str, /, *, table_name: str | None = None, **kwargs: Any + self, path: str | Path, /, *, table_name: str | None = None, **kwargs: Any ) -> ir.Table: """Register a Delta Lake table as a table in the current database. Parameters ---------- path - The data source. Must be a directory - containing a Delta Lake table. + The data source. Must be a directory containing a Delta Lake table. table_name An optional name to use for the created table. This defaults to a sequentially generated name. - **kwargs + kwargs Additional keyword arguments passed to deltalake.DeltaTable. Returns ------- ir.Table The just-registered table. - """ - path = util.normalize_filenames(path)[0] + (path,) = util.normalize_filenames(path) + + extensions = ["delta"] + if path.startswith(("http://", "https://", "s3://")): + extensions.append("httpfs") table_name = table_name or util.gen_name("read_delta") - try: - from deltalake import DeltaTable - except ImportError: - raise ImportError( - "The deltalake extra is required to use the " - "read_delta method. You can install it using pip:\n\n" - "pip install 'ibis-framework[deltalake]'\n" - ) + options = [ + sg.to_identifier(key).eq(sge.convert(val)) for key, val in kwargs.items() + ] - delta_table = DeltaTable(path, **kwargs) + self._load_extensions(extensions) - self.con.register(table_name, delta_table.to_pyarrow_dataset()) + self._create_temp_view( + table_name, + sg.select(STAR).from_(self.compiler.f.delta_scan(path, *options)), + ) return self.table(table_name) def list_tables( diff --git a/ibis/backends/tests/test_export.py b/ibis/backends/tests/test_export.py index a08b6947f83c..1be87dfdb975 100644 --- a/ibis/backends/tests/test_export.py +++ b/ibis/backends/tests/test_export.py @@ -12,7 +12,6 @@ from ibis import util from ibis.backends.tests.errors import ( DatabricksServerOperationError, - DuckDBInvalidInputException, DuckDBNotImplementedException, DuckDBParserException, ExaQueryError, @@ -476,11 +475,6 @@ def test_to_pyarrow_decimal(backend, dtype, pyarrow_dtype): condition=CI and IS_SPARK_REMOTE, reason="not supported until pyspark 4", ) -@pytest.mark.xfail_version( - duckdb=["pyarrow>=19"], - raises=DuckDBInvalidInputException, - reason="decoding delta file fails", -) @pytest.mark.xfail_version( datafusion=["pyarrow>=19", "datafusion>=44"], raises=Exception,