feat[duckdb]: add s3 duckdb benchmarks #3286

Merged: 30 commits, merged May 19, 2025
60 changes: 51 additions & 9 deletions .github/workflows/bench-pr.yml
@@ -1,5 +1,8 @@
name: PR Benchmarks

env:
DUCKDB_VERSION: v1.2.2

on:
pull_request:
types: [ labeled, synchronize ]
@@ -22,8 +25,44 @@ jobs:
with:
labels: benchmark

bench:
build-duckdb:
needs: label_trigger
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- image=ubuntu24-full-x64
- spot=false
- tag=${{ matrix.id }}
steps:
- uses: runs-on/action@v1
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Install duckdb compile requirements
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
run: sudo apt-get update && sudo apt-get install ninja-build cmake build-essential make ccache clang -y

- name: Build duckdb binary
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
env:
CC: clang
CXX: clang++
GEN: ninja
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
LTO: thin
run: |
git clone https://github.com/duckdb/duckdb
cd duckdb
git checkout "$DUCKDB_VERSION"
make release
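
A rough local equivalent of this build step, for anyone reproducing the cached binary outside CI (a sketch assuming clang, ninja, cmake, and ccache are installed, mirroring the env vars the workflow sets above):

    export CC=clang CXX=clang++ GEN=ninja NATIVE_ARCH=1 DUCKDB_PLATFORM=linux_amd64 LTO=thin
    git clone https://github.com/duckdb/duckdb
    cd duckdb
    git checkout v1.2.2   # matches DUCKDB_VERSION
    make release          # produces build/release/duckdb, the path the cache step stores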

bench:
needs: build-duckdb
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
@@ -46,15 +85,18 @@ jobs:
submodules: "recursive"
# rustup is pre-installed on the ubuntu24-full-x64 image.

# The compression benchmarks rely on DuckDB being installed to convert CSV to Parquet
- name: Install DuckDB
uses: opt-nc/setup-duckdb-[email protected]
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
version: v1.2.2
# The cache should always hit; fail on a miss so we can figure out why it was empty.
fail-on-cache-miss: true
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

# TODO(joe): remove ninja once duckdb is not built by the benchmark
- name: Install gzip
run: sudo apt-get update && sudo apt-get install -y gzip ninja-build
- name: Add duckdb to path
run: |
echo "${{ github.workspace }}/duckdb/build/release/" >> $GITHUB_PATH

- name: Build binary
shell: bash
@@ -73,7 +115,7 @@
- name: Run ${{ matrix.benchmark.name }} benchmark
shell: bash
run: |
target/release_debug/${{ matrix.benchmark.id }} -d gh-json | tee ${{ matrix.benchmark.id }}.json
target/release_debug/${{ matrix.benchmark.id }} -d gh-json --skip-rebuild | tee ${{ matrix.benchmark.id }}.json

- name: Setup AWS CLI
uses: aws-actions/configure-aws-credentials@v4
52 changes: 47 additions & 5 deletions .github/workflows/bench.yml
@@ -31,7 +31,43 @@ jobs:
bash scripts/commit-json.sh > new-commit.json
bash scripts/cat-s3.sh vortex-benchmark-results-database commits.json new-commit.json

build-duckdb:
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- image=ubuntu24-full-x64
- spot=false
- tag=${{ matrix.id }}
steps:
- uses: runs-on/action@v1
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Install duckdb compile requirements
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
run: sudo apt-get update && sudo apt-get install ninja-build cmake build-essential make ccache clang -y

- name: Build duckdb binary
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
env:
CC: clang
CXX: clang++
GEN: ninja
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
LTO: thin
run: |
git clone https://github.com/duckdb/duckdb
cd duckdb
git checkout "$DUCKDB_VERSION"
make release

bench:
needs: build-duckdb
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
@@ -50,19 +86,25 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: "recursive"
# rustup is pre-installed on the ubuntu24-full-x64 image.

# The compression benchmarks rely on DuckDB being installed to convert CSV to Parquet
- name: Install DuckDB
uses: opt-nc/setup-duckdb-[email protected]
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
version: v1.2.1
fail-on-cache-miss: true
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Add duckdb to path
run: |
echo "${{ github.workspace }}/duckdb/build/release/" >> $GITHUB_PATH

# The cat-s3 script uses gzip to append to compressed benchmark results
# TODO(joe): remove ninja once duckdb is not built by the benchmark
- name: Install gzip
run: sudo apt-get update && sudo apt-get install -y gzip ninja-build

# rustup is pre-installed on the ubuntu24-full-x64 image.
- name: Run ${{ matrix.benchmark.name }} benchmark
shell: bash
env:
75 changes: 65 additions & 10 deletions .github/workflows/sql-benchmarks.yml
@@ -1,5 +1,8 @@
name: "SQL-related benchmarks"

env:
DUCKDB_VERSION: v1.2.2

on:
workflow_call:
inputs:
@@ -8,7 +11,43 @@ on:
type: string

jobs:
build-duckdb:
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- image=ubuntu24-full-x64
- spot=false
- tag=${{ matrix.id }}
steps:
- uses: runs-on/action@v1
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Install duckdb compile requirements
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
run: sudo apt-get update && sudo apt-get install ninja-build cmake build-essential make ccache clang -y

- name: Build duckdb binary
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
env:
CC: clang
CXX: clang++
GEN: ninja
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
LTO: thin
run: |
git clone https://github.com/duckdb/duckdb
cd duckdb
git checkout "$DUCKDB_VERSION"
make release

bench:
needs: build-duckdb
# S3 is shared state here, and we want to make sure only one of each job runs at a time
concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}-${{matrix.id}}
@@ -33,8 +72,8 @@
binary_name: tpch
name: TPC-H on S3
local_dir: bench-vortex/data/tpch/1
remote_storage: s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch-sf1/
targets: "datafusion:parquet,datafusion:vortex"
remote_storage: s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1/
targets: "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex"
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
@@ -55,6 +94,21 @@
with:
submodules: "recursive"

- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
fail-on-cache-miss: true
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Add duckdb to path
run: |
echo "${{ github.workspace }}/duckdb/build/release/" >> $GITHUB_PATH

- name: Verify duckdb available
run: duckdb --version

- name: Setup AWS CLI
uses: aws-actions/configure-aws-credentials@v4
with:
@@ -68,27 +122,26 @@
run: |
cargo build --bin ${{ matrix.binary_name }} --package bench-vortex --profile release_debug

- name: Install Ninja
run: sudo apt-get update && sudo apt-get install -y ninja-build

- name: Install gzip
run: sudo apt-get update && sudo apt-get install -y gzip
- name: Install ninja & gzip
run: sudo apt-get update && sudo apt-get install -y ninja-build gzip

- name: DuckDB extension build
env:
# Build DuckDB and the Vortex extension with `-march=native`.
# The `NATIVE_ARCH` environment variable is picked up by `duckdb/Makefile`.
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
CFLAGS: -ftls-model=global-dynamic
run: GEN=ninja make release
working-directory: ${{ github.workspace }}/duckdb-vortex

- name: Generate data
shell: bash
run: |
# Generate data, running each query once to make sure they don't panic.
target/release_debug/${{ matrix.binary_name }} --targets datafusion:parquet -i1 -d gh-json
target/release_debug/${{ matrix.binary_name }} --targets datafusion:vortex -i1 -d gh-json
target/release_debug/${{ matrix.binary_name }} --targets duckdb:vortex -i1 -d gh-json
RUST_BACKTRACE=1 target/release_debug/${{ matrix.binary_name }} --targets datafusion:parquet -i1 -d gh-json --skip-rebuild
RUST_BACKTRACE=1 target/release_debug/${{ matrix.binary_name }} --targets datafusion:vortex -i1 -d gh-json --skip-rebuild
RUST_BACKTRACE=1 target/release_debug/${{ matrix.binary_name }} --targets duckdb:vortex -i1 -d gh-json --skip-rebuild
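
Per the comment above, -i1 runs each query a single time, and --skip-rebuild (added in this PR) tells the binary not to rebuild the DuckDB Vortex extension, which the "DuckDB extension build" step has already produced. A hypothetical local equivalent for the tpch matrix entry:

    RUST_BACKTRACE=1 target/release_debug/tpch --targets duckdb:vortex -i1 -d gh-json --skip-rebuild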

- name: Upload data
if: matrix.remote_storage != null
@@ -120,6 +173,7 @@ jobs:
-d gh-json \
--targets ${{ matrix.targets }} \
--export-spans \
--skip-rebuild \
| tee results.json

- name: Run ${{ matrix.name }} benchmark (remote)
@@ -137,6 +191,7 @@
--use-remote-data-dir ${{ matrix.remote_storage }} \
--targets ${{ matrix.targets }} \
--export-spans \
--skip-rebuild \
-d gh-json \
| tee results.json

5 changes: 4 additions & 1 deletion bench-vortex/src/bin/clickbench.rs
@@ -71,6 +71,9 @@ struct Args {
hide_progress_bar: bool,
#[arg(long, default_value_t = false)]
show_metrics: bool,
// Don't try to rebuild duckdb
#[arg(long)]
skip_rebuild: bool,
Review comment (Contributor): How about skip_duckdb_build?

Review comment (Contributor): Then we can also get rid of the comment.
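
A minimal sketch of the reviewer's suggestion, assuming the clap derive API already used in this file; with the rename, a /// doc comment doubles as the generated --help text, so the separate // comment becomes redundant. Names here are illustrative, not the merged code:

    use clap::Parser;

    #[derive(Parser)]
    struct Args {
        /// Skip building the DuckDB Vortex extension before benchmarking.
        #[arg(long)]
        skip_duckdb_build: bool,
    }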

}

struct DataFusionCtx {
@@ -211,7 +214,7 @@ fn main() -> anyhow::Result<()> {
.then(|| {
let path = ddb::get_executable_path(&args.duckdb_path);
// If the path is to the duckdb-vortex extension, try to rebuild
if args.duckdb_path.is_none() {
if args.duckdb_path.is_none() && !args.skip_rebuild {
ddb::build_vortex_duckdb();
}
path
3 changes: 1 addition & 2 deletions bench-vortex/src/bin/tpcds.rs
@@ -177,13 +177,12 @@ async fn bench_main(
match engine {
// TODO(joe): support datafusion
Engine::DuckDB => {
let duckdb_path = duckdb_resolved_path;
let temp_dir = tempdir()?;
let duckdb_file = temp_dir
.path()
.join(format!("duckdb-file-{}.db", format.name()));

let executor = DuckDBExecutor::new(duckdb_path.to_owned(), duckdb_file);
let executor = DuckDBExecutor::new(duckdb_resolved_path.to_owned(), duckdb_file);
register_tables(&executor, &url, format, BenchmarkDataset::TpcDS)?;

for (query_idx, sql_query) in tpch_queries.clone() {