Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/MinIO.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
PIP_BREAK_SYSTEM_PACKAGES: 1
BUILD_EXTENSION_TEST_DEPS: full
S3_TEST_SERVER_AVAILABLE: 1

steps:
- name: Install required ubuntu packages
Expand Down
57 changes: 52 additions & 5 deletions src/storage/ducklake_delete_filter.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,53 @@
#include "storage/ducklake_delete_filter.hpp"
#include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
#include "duckdb/common/multi_file/multi_file_list.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/multi_file/multi_file_states.hpp"
#include "duckdb/parser/tableref/table_function_ref.hpp"
#include "duckdb/parallel/thread_context.hpp"
#include "duckdb/main/database.hpp"

namespace duckdb {

//! FunctionInfo to pass delete file metadata to the MultiFileReader
struct DeleteFileFunctionInfo : public TableFunctionInfo {
	//! Metadata of the delete file to scan: path, file_size_bytes and
	//! (optionally) an encryption_key, consumed by DeleteFileMultiFileReader
	DuckLakeFileData file_data;
};

//! Custom MultiFileReader that creates a SimpleMultiFileList with extended info
struct DeleteFileMultiFileReader : public MultiFileReader {
static unique_ptr<MultiFileReader> CreateInstance(const TableFunction &table_function) {
return make_uniq<DeleteFileMultiFileReader>(table_function);
}

explicit DeleteFileMultiFileReader(const TableFunction &table_function) {
auto &info = table_function.function_info->Cast<DeleteFileFunctionInfo>();
auto &delete_file = info.file_data;

OpenFileInfo file_info(delete_file.path);
auto extended_info = make_shared_ptr<ExtendedOpenFileInfo>();
extended_info->options["file_size"] = Value::UBIGINT(delete_file.file_size_bytes);
extended_info->options["etag"] = Value("");
extended_info->options["last_modified"] = Value::TIMESTAMP(timestamp_t(0));
if (!delete_file.encryption_key.empty()) {
extended_info->options["encryption_key"] = Value::BLOB_RAW(delete_file.encryption_key);
}
file_info.extended_info = std::move(extended_info);

vector<OpenFileInfo> files;
files.push_back(std::move(file_info));
file_list = make_shared_ptr<SimpleMultiFileList>(std::move(files));
}

shared_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
const FileGlobInput &options) override {
return file_list;
}

private:
shared_ptr<MultiFileList> file_list;
};

//! Default-construct the filter with a freshly allocated DuckLakeDeleteData
DuckLakeDeleteFilter::DuckLakeDeleteFilter() : delete_data(make_shared_ptr<DuckLakeDeleteData>()) {
}

Expand Down Expand Up @@ -52,7 +94,14 @@ vector<idx_t> DuckLakeDeleteFilter::ScanDeleteFile(ClientContext &context, const
auto &instance = DatabaseInstance::GetDatabase(context);
ExtensionLoader loader(instance, "ducklake");
auto &parquet_scan_entry = loader.GetTableFunction("parquet_scan");
auto &parquet_scan = parquet_scan_entry.functions.functions[0];
auto parquet_scan = parquet_scan_entry.functions.functions[0];

// Set up function_info with delete file metadata and custom MultiFileReader
// This allows the bind to use our file list with extended info (file_size, etag, last_modified)
auto function_info = make_shared_ptr<DeleteFileFunctionInfo>();
function_info->file_data = delete_file;
parquet_scan.function_info = std::move(function_info);
parquet_scan.get_multi_file_reader = DeleteFileMultiFileReader::CreateInstance;

// Prepare the inputs for the bind
vector<Value> children;
Expand All @@ -68,10 +117,8 @@ vector<idx_t> DuckLakeDeleteFilter::ScanDeleteFile(ClientContext &context, const
}

TableFunctionRef empty;
TableFunction dummy_table_function;
dummy_table_function.name = "DuckLakeDeleteScan";
TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr,
dummy_table_function, empty);
TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr, parquet_scan,
empty);
vector<LogicalType> return_types;
vector<string> return_names;

Expand Down
54 changes: 54 additions & 0 deletions test/sql/delete/delete_metadata.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# name: test/sql/delete/delete_metadata.test
# description: Test ducklake deletes
# group: [delete]

require httpfs

# Only run where a MinIO/S3 test server is reachable
# (S3_TEST_SERVER_AVAILABLE is exported by .github/workflows/MinIO.yml)
require-env S3_TEST_SERVER_AVAILABLE 1

test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db

require ducklake

require parquet

# Let DuckDB install/load any additionally required extensions on demand
statement ok
SET autoinstall_known_extensions=1;

statement ok
SET autoload_known_extensions=1;
Comment on lines +15 to +19
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can ensure the test only runs in the MinIo CI by having

require httpfs
require-env S3_TEST_SERVER_AVAILABLE 1

and S3_TEST_SERVER_AVAILABLE: 1 in the .github/workflows/MinIO.yml env.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done it @pdet 🫡


# Store table data on the S3 test bucket so file opens go through httpfs
statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH 's3://mybucket')

# Clean up any existing table from previous runs
statement ok
DROP TABLE IF EXISTS ducklake.test;

statement ok
CREATE TABLE ducklake.test AS SELECT i id FROM range(1000) t(i);

statement ok
INSERT INTO ducklake.test SELECT i id FROM range(15000, 16000) t(i)

statement ok
BEGIN

# Delete all even ids: 500 from each of the two inserted ranges = 1000 rows
query I
DELETE FROM ducklake.test WHERE id%2=0
----
1000

statement ok
COMMIT

# NOTE(review): '#HEAD: 0' presumably asserts zero HTTP HEAD requests in the
# analyzed plan, i.e. the delete-file metadata supplied by the catalog makes
# re-probing S3 unnecessary — confirm against httpfs EXPLAIN ANALYZE output
query II
EXPLAIN ANALYZE SELECT COUNT(*) FILTER(WHERE id%2=0) FROM ducklake.test
----
analyzed_plan <REGEX>:.*#HEAD: 0.*

# we can time travel to see the state of the table before deletes
query II
EXPLAIN ANALYZE SELECT COUNT(*) FILTER(WHERE id%2=0) FROM ducklake.test AT (VERSION => 2)
----
analyzed_plan <REGEX>:.*#HEAD: 0.*
Loading