feat[duckdb]: add s3 duckdb benchmarks #3286

Merged: 30 commits, merged May 19, 2025
60 changes: 51 additions & 9 deletions .github/workflows/bench-pr.yml
@@ -1,5 +1,8 @@
name: PR Benchmarks

env:
DUCKDB_VERSION: v1.2.2

on:
pull_request:
types: [ labeled, synchronize ]
@@ -22,8 +25,44 @@ jobs:
with:
labels: benchmark

bench:
build-duckdb:
needs: label_trigger
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- image=ubuntu24-full-x64
- spot=false
- tag=${{ matrix.id }}
steps:
- uses: runs-on/action@v1
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Install duckdb compile requirements
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
run: sudo apt-get update && sudo apt-get install ninja-build cmake build-essential make ccache clang -y

- name: Build duckdb binary
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
env:
CC: clang
CXX: clang++
GEN: ninja
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
LTO: thin
run: |
git clone https://github.com/duckdb/duckdb
cd duckdb
git checkout "$DUCKDB_VERSION"
make release
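
A rough local equivalent of this build step, for anyone reproducing the cached binary outside CI (a sketch assuming clang, ninja, cmake, and ccache are installed, mirroring the env vars the workflow sets above):

    export CC=clang CXX=clang++ GEN=ninja NATIVE_ARCH=1 DUCKDB_PLATFORM=linux_amd64 LTO=thin
    git clone https://github.com/duckdb/duckdb
    cd duckdb
    git checkout v1.2.2   # matches DUCKDB_VERSION
    make release          # produces build/release/duckdb, the path the cache step stores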

bench:
needs: build-duckdb
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
@@ -46,15 +85,18 @@ jobs:
submodules: "recursive"
# rustup is pre-installed on the ubuntu24-full-x64 image.

# The compression benchmarks rely on DuckDB being installed to convert CSV to Parquet
- name: Install DuckDB
uses: opt-nc/setup-duckdb-[email protected]
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
version: v1.2.2
# The cache should always hit; fail on a miss so we can figure out why it was empty.
fail-on-cache-miss: true
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

# TODO(joe): remove ninja once duckdb is not built by the benchmark
- name: Install gzip
run: sudo apt-get update && sudo apt-get install -y gzip ninja-build
- name: Add duckdb to path
run: |
echo "${{ github.workspace }}/duckdb/build/release/" >> $GITHUB_PATH

- name: Build binary
shell: bash
@@ -73,7 +115,7 @@
- name: Run ${{ matrix.benchmark.name }} benchmark
shell: bash
run: |
target/release_debug/${{ matrix.benchmark.id }} -d gh-json | tee ${{ matrix.benchmark.id }}.json
target/release_debug/${{ matrix.benchmark.id }} -d gh-json --skip-rebuild | tee ${{ matrix.benchmark.id }}.json

- name: Setup AWS CLI
uses: aws-actions/configure-aws-credentials@v4
52 changes: 47 additions & 5 deletions .github/workflows/bench.yml
@@ -31,7 +31,43 @@ jobs:
bash scripts/commit-json.sh > new-commit.json
bash scripts/cat-s3.sh vortex-benchmark-results-database commits.json new-commit.json

build-duckdb:
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- image=ubuntu24-full-x64
- spot=false
- tag=${{ matrix.id }}
steps:
- uses: runs-on/action@v1
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Install duckdb compile requirements
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
run: sudo apt-get update && sudo apt-get install ninja-build cmake build-essential make ccache clang -y

- name: Build duckdb binary
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
env:
CC: clang
CXX: clang++
GEN: ninja
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
LTO: thin
run: |
git clone https://github.com/duckdb/duckdb
cd duckdb
git checkout "$DUCKDB_VERSION"
make release

bench:
needs: build-duckdb
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
@@ -50,19 +86,25 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: "recursive"
# rustup is pre-installed on the ubuntu24-full-x64 image.

# The compression benchmarks rely on DuckDB being installed to convert CSV to Parquet
- name: Install DuckDB
uses: opt-nc/setup-duckdb-[email protected]
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
version: v1.2.1
fail-on-cache-miss: true
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Add duckdb to path
run: |
echo "${{ github.workspace }}/duckdb/build/release/" >> $GITHUB_PATH

# The cat-s3 script uses gzip to append to compressed benchmark results
# TODO(joe): remove ninja once duckdb is not built by the benchmark
- name: Install gzip
run: sudo apt-get update && sudo apt-get install -y gzip ninja-build

# rustup is pre-installed on the ubuntu24-full-x64 image.
- name: Run ${{ matrix.benchmark.name }} benchmark
shell: bash
env:
75 changes: 65 additions & 10 deletions .github/workflows/sql-benchmarks.yml
@@ -1,5 +1,8 @@
name: "SQL-related benchmarks"

env:
DUCKDB_VERSION: v1.2.2

on:
workflow_call:
inputs:
@@ -8,7 +11,43 @@ on:
type: string

jobs:
build-duckdb:
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
- image=ubuntu24-full-x64
- spot=false
- tag=${{ matrix.id }}
steps:
- uses: runs-on/action@v1
- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Install duckdb compile requirements
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
run: sudo apt-get update && sudo apt-get install ninja-build cmake build-essential make ccache clang -y

- name: Build duckdb binary
if: steps.cache-duckdb-binary.outputs.cache-hit != 'true'
env:
CC: clang
CXX: clang++
GEN: ninja
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
LTO: thin
run: |
git clone https://github.com/duckdb/duckdb
cd duckdb
git checkout "$DUCKDB_VERSION"
make release

bench:
needs: build-duckdb
# S3 is shared state here, and we want to make sure only one of each job runs at a time
concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}-${{matrix.id}}
@@ -33,8 +72,8 @@
binary_name: tpch
name: TPC-H on S3
local_dir: bench-vortex/data/tpch/1
remote_storage: s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch-sf1/
targets: "datafusion:parquet,datafusion:vortex"
remote_storage: s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1/
targets: "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex"
runs-on:
- runs-on=${{ github.run_id }}
- family=c6id.8xlarge
@@ -55,6 +94,21 @@
with:
submodules: "recursive"

- name: Cache duckdb compiled binary
uses: runs-on/cache@v4
id: cache-duckdb-binary
with:
fail-on-cache-miss: true
key: "${{ runner.os }}-duckdb-linux_amd64-${{ env.DUCKDB_VERSION }}"
path: ${{ github.workspace }}/duckdb/build/release/duckdb

- name: Add duckdb to path
run: |
echo "${{ github.workspace }}/duckdb/build/release/" >> $GITHUB_PATH

- name: Verify duckdb available
run: duckdb --version

- name: Setup AWS CLI
uses: aws-actions/configure-aws-credentials@v4
with:
@@ -68,27 +122,26 @@
run: |
cargo build --bin ${{ matrix.binary_name }} --package bench-vortex --profile release_debug

- name: Install Ninja
run: sudo apt-get update && sudo apt-get install -y ninja-build

- name: Install gzip
run: sudo apt-get update && sudo apt-get install -y gzip
- name: Install ninja & gzip
run: sudo apt-get update && sudo apt-get install -y ninja-build gzip

- name: DuckDB extension build
env:
# Build DuckDB and the Vortex extension with `-march=native`.
# The `NATIVE_ARCH` environment variable is picked up by `duckdb/Makefile`.
NATIVE_ARCH: 1
DUCKDB_PLATFORM: linux_amd64
CFLAGS: -ftls-model=global-dynamic
run: GEN=ninja make release
working-directory: ${{ github.workspace }}/duckdb-vortex

- name: Generate data
shell: bash
run: |
# Generate data, running each query once to make sure they don't panic.
target/release_debug/${{ matrix.binary_name }} --targets datafusion:parquet -i1 -d gh-json
target/release_debug/${{ matrix.binary_name }} --targets datafusion:vortex -i1 -d gh-json
target/release_debug/${{ matrix.binary_name }} --targets duckdb:vortex -i1 -d gh-json
RUST_BACKTRACE=1 target/release_debug/${{ matrix.binary_name }} --targets datafusion:parquet -i1 -d gh-json --skip-rebuild
RUST_BACKTRACE=1 target/release_debug/${{ matrix.binary_name }} --targets datafusion:vortex -i1 -d gh-json --skip-rebuild
RUST_BACKTRACE=1 target/release_debug/${{ matrix.binary_name }} --targets duckdb:vortex -i1 -d gh-json --skip-rebuild
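
Per the comment above, -i1 runs each query a single time, and --skip-rebuild (added in this PR) tells the binary not to rebuild the DuckDB Vortex extension, which the "DuckDB extension build" step has already produced. A hypothetical local equivalent for the tpch matrix entry:

    RUST_BACKTRACE=1 target/release_debug/tpch --targets duckdb:vortex -i1 -d gh-json --skip-rebuild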

- name: Upload data
if: matrix.remote_storage != null
@@ -120,6 +173,7 @@ jobs:
-d gh-json \
--targets ${{ matrix.targets }} \
--export-spans \
--skip-rebuild \
| tee results.json

- name: Run ${{ matrix.name }} benchmark (remote)
@@ -137,6 +191,7 @@
--use-remote-data-dir ${{ matrix.remote_storage }} \
--targets ${{ matrix.targets }} \
--export-spans \
--skip-rebuild \
-d gh-json \
| tee results.json

5 changes: 4 additions & 1 deletion bench-vortex/src/bin/clickbench.rs
@@ -71,6 +71,9 @@ struct Args {
hide_progress_bar: bool,
#[arg(long, default_value_t = false)]
show_metrics: bool,
// Don't try to rebuild duckdb
#[arg(long)]
skip_rebuild: bool,
Review comment (Contributor): How about skip_duckdb_build?

Review comment (Contributor): Then we can also get rid of the comment.
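
A minimal sketch of the reviewer's suggestion, assuming the clap derive API already used in this file; with the rename, a /// doc comment doubles as the generated --help text, so the separate // comment becomes redundant. Names here are illustrative, not the merged code:

    use clap::Parser;

    #[derive(Parser)]
    struct Args {
        /// Skip building the DuckDB Vortex extension before benchmarking.
        #[arg(long)]
        skip_duckdb_build: bool,
    }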

}

struct DataFusionCtx {
@@ -211,7 +214,7 @@ fn main() -> anyhow::Result<()> {
.then(|| {
let path = ddb::get_executable_path(&args.duckdb_path);
// If the path is to the duckdb-vortex extension, try to rebuild
if args.duckdb_path.is_none() {
if args.duckdb_path.is_none() && !args.skip_rebuild {
ddb::build_vortex_duckdb();
}
path
3 changes: 1 addition & 2 deletions bench-vortex/src/bin/tpcds.rs
@@ -177,13 +177,12 @@ async fn bench_main(
match engine {
// TODO(joe): support datafusion
Engine::DuckDB => {
let duckdb_path = duckdb_resolved_path;
let temp_dir = tempdir()?;
let duckdb_file = temp_dir
.path()
.join(format!("duckdb-file-{}.db", format.name()));

let executor = DuckDBExecutor::new(duckdb_path.to_owned(), duckdb_file);
let executor = DuckDBExecutor::new(duckdb_resolved_path.to_owned(), duckdb_file);
register_tables(&executor, &url, format, BenchmarkDataset::TpcDS)?;

for (query_idx, sql_query) in tpch_queries.clone() {