Commit 1c5ce7c

feat: Automatically check cache-validity of DataFrames tasks marked as lazy (#272)
Co-authored-by: windiana42 <[email protected]>
1 parent 2ed05e3 commit 1c5ce7c

File tree: 20 files changed (+309, −244 lines)

docs/source/changelog.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -1,5 +1,8 @@
 # Changelog
 
+## 0.12.6 (2025-XX-XX)
+- Feat: Automatically check cache-validity of polars and pandas DataFrame tasks marked as lazy
+
 ## 0.12.5 (2025-11-26)
 - Workaround snowflake sqlalchemy dialect to enable ExternalTableReference to other database
 - Flag in CreateTable and DropTable DDL statements allows not quoting schema (needed for multi-part schema)
```

docs/source/examples/realistic_pipeline.md

Lines changed: 11 additions & 6 deletions
```diff
@@ -323,14 +323,19 @@ are `sqlalchemy.Table`, `pandas.DataFrame`, `polars.DataFrame`, or `polars.LazyF
 
 ### Controlling automatic cache invalidation
 
-For input_type `sa.Table`, and `pdt.SqlAlchemy`, in general, it is best to set lazy=True. This means the task is always
+For input_type `sa.Table`, and `pdt.SqlAlchemy`, in general, it is best to set `lazy=True`. This means the task is always
 executed because producing a query is fast, but the query is only executed when it is actually needed. For
 `pl.LazyFrame`, `version=AUTO_VERSION` is a good choice, because then the task is executed once with empty input
-dataframes and only if resulting LazyFrame expressions change, the task is executed again with full input data. For
-`pd.DataFrame` and `pl.DataFrame`, we don't try to guess which changes of the code are actually meaningful. Thus the
-user needs to help manually bumpig a version number like `version="1.0.0"`. For development, `version=None` simply
-deactivates caching until the code is more stable. It is recommended to always develop with small pipeline instances
-anyways to achieve high iteration speed (see [multi_instance_pipeline.md](multi_instance_pipeline.md)).
+dataframes and only if resulting LazyFrame expressions change, the task is executed again with full input data.
+
+For `pd.DataFrame` and `pl.DataFrame`, we don't try to guess which changes of the code are actually meaningful. Thus,
+to avoid re-running the task, the user needs to help by manually bumping a version number like `version="1.0.0"`.
+For development, `version=None` simply deactivates caching until the code is more stable. It is recommended to always
+develop with small pipeline instances anyway to achieve high iteration speed (see [multi_instance_pipeline.md](multi_instance_pipeline.md)).
+Setting `lazy=True` for tasks returning `pd.DataFrame` or `pl.DataFrame` objects always executes the task, but hashes the result to
+determine the cache-validity of the task output and hence the cache invalidation of downstream tasks.
+This is a good choice for tasks returning small dataframes which are quick to compute and where bumping the version number adds unwanted
+complexity to the development process. It is allowed to produce both dataframe and SQL output in one `@materialize(lazy=True, ...)` task.
 
 ### Integration with pydiverse colspec (same as dataframely but with pydiverse transform based SQL support)
 
```

docs/source/quickstart.md

Lines changed: 4 additions & 1 deletion
```diff
@@ -187,9 +187,12 @@ In this case, the task must produce a SQLAlchemy expression for
 all tabular outputs without executing them. Pipedag can render the query and will only produce a table based on this
 query expression if the query changed or one of the inputs to the task changed.
 
+For tasks returning a Polars or Pandas DataFrame, the hash of the resulting DataFrame is used to determine whether to
+cache-invalidate downstream tasks.
+
 ### Manual cache invalidation with `version` parameter
 
-For non-SQL tasks, the `version` parameter of the {py:func}`@materialize <pydiverse.pipedag.materialize>` decorator must
+For non-lazy tasks, the `version` parameter of the {py:func}`@materialize <pydiverse.pipedag.materialize>` decorator must
 be used for manual cache invalidation. As long as the version stays the same, it is assumed that the code of the task
 did not materially change and will produce the same outputs given the same inputs. We refrained from automatically
 inspecting any python code changes since this would break at shared code changes where it is very hard to distinguish
```

pixi.lock

Lines changed: 130 additions & 131 deletions
Some generated files are not rendered by default.

pixi.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@ platforms = ["linux-64", "osx-64", "osx-arm64", "win-64"] # "linux-aarch64"
 #pip = "*"
 hatchling = "*"
 python = ">=3.11.14,<3.15.0a0"
-pydiverse-common = ">=0.4.1,<0.5"
+pydiverse-common = ">=0.4.3,<0.5"
 typing-extensions = ">=4.15.0,<5"
 networkx = ">=3.4,<4"
 attrs = ">=25.4.0,<26"
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ classifiers = [
 ]
 
 dependencies = [
-    "pydiverse-common >=0.4.1,<0.5",
+    "pydiverse-common >=0.4.3,<0.5",
     "typing-extensions >=4.15.0,<5",
     "networkx >=3.4,<4",
     "attrs >=25.4.0,<26",
```

src/pydiverse/pipedag/backend/table/cache/parquet.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -15,7 +15,7 @@
 from pydiverse.pipedag import ConfigContext, Stage, Table
 from pydiverse.pipedag.materialize.materializing_task import MaterializingTask
 from pydiverse.pipedag.materialize.store import BaseTableCache
-from pydiverse.pipedag.materialize.table_hook_base import CanMatResult, CanRetResult, TableHook
+from pydiverse.pipedag.materialize.table_hook_base import CanMatResult, CanRetResult, DataFrameTableHook, TableHook
 from pydiverse.pipedag.optional_dependency.transform import pdt, pdt_new, pdt_old
 from pydiverse.pipedag.util import normalize_name
 from pydiverse.pipedag.util.path import is_file_uri
@@ -98,7 +98,7 @@ def get_table_path(self, table: Table, file_extension: str) -> UPath:
 
 
 @ParquetTableCache.register_table(pd)
-class PandasTableHook(TableHook[ParquetTableCache]):
+class PandasTableHook(DataFrameTableHook, TableHook[ParquetTableCache]):
     pd_version = Version(pd.__version__)
 
     @classmethod
```

src/pydiverse/pipedag/backend/table/dict.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -9,7 +9,7 @@
 from pydiverse.pipedag.materialize.materializing_task import MaterializingTask
 from pydiverse.pipedag.materialize.metadata import LazyTableMetadata, TaskMetadata
 from pydiverse.pipedag.materialize.store import BaseTableStore
-from pydiverse.pipedag.materialize.table_hook_base import CanMatResult, CanRetResult, TableHook
+from pydiverse.pipedag.materialize.table_hook_base import CanMatResult, CanRetResult, DataFrameTableHook, TableHook
 from pydiverse.pipedag.optional_dependency.transform import C, pdt
 
 
@@ -126,7 +126,7 @@ def get_table_objects_in_stage(self, stage: Stage, include_views=True) -> list[s
 
 
 @DictTableStore.register_table(pd)
-class PandasTableHook(TableHook[DictTableStore]):
+class PandasTableHook(DataFrameTableHook, TableHook[DictTableStore]):
    @classmethod
    def can_materialize(cls, tbl: Table) -> CanMatResult:
        type_ = type(tbl.obj)
```
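The changes above mix a new `DataFrameTableHook` base class into the per-store pandas hooks, so that each hook gains the shared result-hashing capability without duplicating it. The registration-plus-mixin pattern can be sketched in plain Python (all names below, `Store`, `DataFrameHashMixin`, `PandasHook`, are invented for illustration and are not pipedag internals):

```python
import hashlib


class DataFrameHashMixin:
    """Shared behavior mixed into several otherwise unrelated hook classes."""

    @classmethod
    def hash_result(cls, data: bytes) -> str:
        return hashlib.sha256(data).hexdigest()


class Store:
    hooks: list[type] = []

    @classmethod
    def register_table(cls):
        # Decorator factory: registering a hook class with the store,
        # mirroring the @DictTableStore.register_table(pd) pattern above.
        def decorator(hook_cls: type) -> type:
            cls.hooks.append(hook_cls)
            return hook_cls

        return decorator


@Store.register_table()
class PandasHook(DataFrameHashMixin):
    pass


# The registered hook now carries the mixed-in hashing capability.
print(Store.hooks[0].hash_result(b"rows")[:8])
```

Putting the capability in a mixin keeps the per-store hooks free to differ in materialization logic while sharing one hashing implementation.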

src/pydiverse/pipedag/backend/table/sql/dialects/duckdb.py

Lines changed: 4 additions & 8 deletions
```diff
@@ -9,13 +9,9 @@
 import sqlalchemy as sa
 from packaging.version import Version
 
+import pydiverse.pipedag.backend.table.sql.hooks as sql_hooks
 from pydiverse.common import Dtype
 from pydiverse.pipedag import Table
-from pydiverse.pipedag.backend.table.sql.hooks import (
-    IbisTableHook,
-    PandasTableHook,
-    PolarsTableHook,
-)
 from pydiverse.pipedag.backend.table.sql.sql import SQLTableStore
 from pydiverse.pipedag.container import Schema
 from pydiverse.pipedag.materialize.details import resolve_materialization_details_label
@@ -65,7 +61,7 @@ def dialect_requests_empty_creation(self, table: Table, is_sql: bool) -> bool:
 
 
 @DuckDBTableStore.register_table(pd)
-class PandasTableHook(PandasTableHook):
+class PandasTableHook(sql_hooks.PandasTableHook):
     @classmethod
     def _execute_materialize(
         cls,
@@ -102,7 +98,7 @@ def _execute_materialize(
 
 
 @DuckDBTableStore.register_table(pl, duckdb)
-class PolarsTableHook(PolarsTableHook):
+class PolarsTableHook(sql_hooks.PolarsTableHook):
     @classmethod
     def dialect_supports_connectorx(cls):
         # ConnectorX (used by Polars read_database_uri) does not support DuckDB.
@@ -133,7 +129,7 @@ def download_table(
 
 
 @DuckDBTableStore.register_table(ibis.api.Table)
-class IbisTableHook(IbisTableHook):
+class IbisTableHook(sql_hooks.IbisTableHook):
     @classmethod
     def _conn(cls, store: DuckDBTableStore):
         return ibis.duckdb.from_connection(store.engine.raw_connection())
```
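The switch to `import ... as sql_hooks` in this diff lets each dialect subclass keep the same public class name as its base: `class PandasTableHook(PandasTableHook)` defines a class in terms of a name it is about to shadow, whereas `sql_hooks.PandasTableHook` stays unambiguous. A minimal sketch of the pattern, using invented stand-in classes rather than the real modules:

```python
class PandasTableHookBase:  # stands in for the hook in the sql hooks module
    dialect = "generic"


class _SqlHooksModule:  # stands in for the module imported as `sql_hooks`
    PandasTableHook = PandasTableHookBase


sql_hooks = _SqlHooksModule


# The subclass reuses the public name without clobbering the base reference.
class PandasTableHook(sql_hooks.PandasTableHook):
    dialect = "duckdb"


print(PandasTableHook.dialect)  # duckdb
print(issubclass(PandasTableHook, PandasTableHookBase))  # True
```

The module alias also keeps all base hooks reachable through one name, which is why the diff can drop the four-line `from ... import (...)` block.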

src/pydiverse/pipedag/backend/table/sql/dialects/duckdb_parquet.py

Lines changed: 1 addition & 26 deletions
```diff
@@ -27,12 +27,6 @@
 from pydiverse.pipedag.container import SortOrder, View
 from pydiverse.pipedag.context import RunContext
 from pydiverse.pipedag.materialize.store import BaseTableStore
-from pydiverse.pipedag.materialize.table_hook_base import (
-    AutoVersionSupport,
-    CanMatResult,
-    CanRetResult,
-    TableHook,
-)
 from pydiverse.pipedag.optional_dependency.sqlalchemy import Select, SqlText, TextClause
 from pydiverse.pipedag.util.path import is_file_uri
 
@@ -903,18 +897,7 @@ def drop_subquery_table(
 
 
 @ParquetTableStore.register_table(pd)
-class PandasTableHook(TableHook[ParquetTableStore]):
-    auto_version_support = AutoVersionSupport.TRACE
-
-    @classmethod
-    def can_materialize(cls, tbl: Table) -> CanMatResult:
-        type_ = type(tbl.obj)
-        return CanMatResult.new(issubclass(type_, pd.DataFrame))
-
-    @classmethod
-    def can_retrieve(cls, type_) -> CanRetResult:
-        return CanRetResult.new(issubclass(type_, pd.DataFrame))
-
+class PandasTableHook(sql_hooks.PandasTableHook):
     @classmethod
     def materialize(
         cls,
@@ -1060,14 +1043,6 @@ def get_pyarrow_path(path: UPath, store: ParquetTableStore) -> tuple[str, fsspec
         pyarrow_fs = None
         return pyarrow_path, pyarrow_fs
 
-    @classmethod
-    def auto_table(cls, obj: pd.DataFrame):
-        return sql_hooks.PandasTableHook.auto_table(obj)
-
-    @classmethod
-    def get_computation_tracer(cls):
-        return sql_hooks.PandasTableHook.ComputationTracer()
-
 
 @ParquetTableStore.register_table(pl, duckdb)
 class PolarsTableHook(sql_hooks.PolarsTableHook):
```

0 commit comments