diff --git a/.github/workflows/MinIO.yml b/.github/workflows/MinIO.yml
index 8340557341..97b739748b 100644
--- a/.github/workflows/MinIO.yml
+++ b/.github/workflows/MinIO.yml
@@ -21,6 +21,7 @@ jobs:
       VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
       PIP_BREAK_SYSTEM_PACKAGES: 1
       BUILD_EXTENSION_TEST_DEPS: full
+      S3_TEST_SERVER_AVAILABLE: 1
 
     steps:
     - name: Install required ubuntu packages
diff --git a/src/storage/ducklake_delete_filter.cpp b/src/storage/ducklake_delete_filter.cpp
index 41f43b8ae7..c60545bcbe 100644
--- a/src/storage/ducklake_delete_filter.cpp
+++ b/src/storage/ducklake_delete_filter.cpp
@@ -1,11 +1,53 @@
 #include "storage/ducklake_delete_filter.hpp"
 #include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
+#include "duckdb/common/multi_file/multi_file_list.hpp"
+#include "duckdb/common/multi_file/multi_file_reader.hpp"
+#include "duckdb/common/multi_file/multi_file_states.hpp"
 #include "duckdb/parser/tableref/table_function_ref.hpp"
 #include "duckdb/parallel/thread_context.hpp"
 #include "duckdb/main/database.hpp"
 
 namespace duckdb {
 
+//! FunctionInfo to pass delete file metadata to the MultiFileReader
+struct DeleteFileFunctionInfo : public TableFunctionInfo {
+	DuckLakeFileData file_data;
+};
+
+//! Custom MultiFileReader that creates a SimpleMultiFileList with extended info
+struct DeleteFileMultiFileReader : public MultiFileReader {
+	static unique_ptr<MultiFileReader> CreateInstance(const TableFunction &table_function) {
+		return make_uniq<DeleteFileMultiFileReader>(table_function);
+	}
+
+	explicit DeleteFileMultiFileReader(const TableFunction &table_function) {
+		auto &info = table_function.function_info->Cast<DeleteFileFunctionInfo>();
+		auto &delete_file = info.file_data;
+
+		OpenFileInfo file_info(delete_file.path);
+		auto extended_info = make_shared_ptr<ExtendedOpenFileInfo>();
+		extended_info->options["file_size"] = Value::UBIGINT(delete_file.file_size_bytes);
+		extended_info->options["etag"] = Value("");
+		extended_info->options["last_modified"] = Value::TIMESTAMP(timestamp_t(0));
+		if (!delete_file.encryption_key.empty()) {
+			extended_info->options["encryption_key"] = Value::BLOB_RAW(delete_file.encryption_key);
+		}
+		file_info.extended_info = std::move(extended_info);
+
+		vector<OpenFileInfo> files;
+		files.push_back(std::move(file_info));
+		file_list = make_shared_ptr<SimpleMultiFileList>(std::move(files));
+	}
+
+	shared_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<OpenFileInfo> &paths,
+	                                         const FileGlobInput &options) override {
+		return file_list;
+	}
+
+private:
+	shared_ptr<MultiFileList> file_list;
+};
+
 DuckLakeDeleteFilter::DuckLakeDeleteFilter() : delete_data(make_shared_ptr<DuckLakeDeleteData>()) {
 }
 
@@ -52,7 +94,14 @@ vector<idx_t> DuckLakeDeleteFilter::ScanDeleteFile(ClientContext &context, const
 	auto &instance = DatabaseInstance::GetDatabase(context);
 	ExtensionLoader loader(instance, "ducklake");
 	auto &parquet_scan_entry = loader.GetTableFunction("parquet_scan");
-	auto &parquet_scan = parquet_scan_entry.functions.functions[0];
+	auto parquet_scan = parquet_scan_entry.functions.functions[0];
+
+	// Set up function_info with delete file metadata and custom MultiFileReader
+	// This allows the bind to use our file list with extended info (file_size, etag, last_modified)
+	auto function_info = make_shared_ptr<DeleteFileFunctionInfo>();
+	function_info->file_data = delete_file;
+	parquet_scan.function_info = std::move(function_info);
+	parquet_scan.get_multi_file_reader = DeleteFileMultiFileReader::CreateInstance;
 
 	// Prepare the inputs for the bind
 	vector<unique_ptr<ParsedExpression>> children;
@@ -68,10 +117,8 @@ vector<idx_t> DuckLakeDeleteFilter::ScanDeleteFile(ClientContext &context, const
 	}
 
 	TableFunctionRef empty;
-	TableFunction dummy_table_function;
-	dummy_table_function.name = "DuckLakeDeleteScan";
-	TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr,
-	                                  dummy_table_function, empty);
+	TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr, parquet_scan,
+	                                  empty);
 
 	vector<LogicalType> return_types;
 	vector<string> return_names;
diff --git a/test/sql/delete/delete_metadata.test b/test/sql/delete/delete_metadata.test
new file mode 100644
index 0000000000..d52f83ad99
--- /dev/null
+++ b/test/sql/delete/delete_metadata.test
@@ -0,0 +1,54 @@
+# name: test/sql/delete/delete_metadata.test
+# description: Test ducklake deletes
+# group: [delete]
+
+require httpfs
+
+require-env S3_TEST_SERVER_AVAILABLE 1
+
+test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db
+
+require ducklake
+
+require parquet
+
+statement ok
+SET autoinstall_known_extensions=1;
+
+statement ok
+SET autoload_known_extensions=1;
+
+statement ok
+ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH 's3://mybucket')
+
+# Clean up any existing table from previous runs
+statement ok
+DROP TABLE IF EXISTS ducklake.test;
+
+statement ok
+CREATE TABLE ducklake.test AS SELECT i id FROM range(1000) t(i);
+
+statement ok
+INSERT INTO ducklake.test SELECT i id FROM range(15000, 16000) t(i)
+
+statement ok
+BEGIN
+
+query I
+DELETE FROM ducklake.test WHERE id%2=0
+----
+1000
+
+statement ok
+COMMIT
+
+query II
+EXPLAIN ANALYZE SELECT COUNT(*) FILTER(WHERE id%2=0) FROM ducklake.test
+----
+analyzed_plan	<REGEX>:.*#HEAD: 0.*
+
+# we can time travel to see the state of the table before deletes
+query II
+EXPLAIN ANALYZE SELECT COUNT(*) FILTER(WHERE id%2=0) FROM ducklake.test AT (VERSION => 2)
+----
+analyzed_plan	<REGEX>:.*#HEAD: 0.*