apache · 2010YOUY01 · Feb 21, 2025 · Feb 16, 2025 · Feb 19, 2025 · Feb 19, 2025
diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml
@@ -39,43 +39,54 @@ jobs:
   linux-build-lib:
     name: linux build test
     runs-on: ubuntu-latest
-    container:
-      image: amd64/rust
     steps:
       - uses: actions/checkout@v4
-      - name: Setup Rust toolchain
-        uses: ./.github/actions/setup-builder
         with:
-          rust-version: stable
+          submodules: true
+          fetch-depth: 1
+      - name: Install Rust
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+          source $HOME/.cargo/env
+          rustup default stable
+      - name: Install Protobuf Compiler
+        run: sudo apt-get install -y protobuf-compiler
       - name: Prepare cargo build
         run: |
           cargo check --profile ci --all-targets
           cargo clean
 
-#  # Run extended tests (with feature 'extended_tests')
-#  # Disabling as it is running out of disk space
-#  # see https://github.com/apache/datafusion/issues/14576
-#  linux-test-extended:
-#    name: cargo test 'extended_tests' (amd64)
-#    needs: linux-build-lib
-#    runs-on: ubuntu-latest
-#    container:
-#      image: amd64/rust
-#    steps:
-#      - uses: actions/checkout@v4
-#        with:
-#          submodules: true
-#          fetch-depth: 1
-#      - name: Setup Rust toolchain
-#        uses: ./.github/actions/setup-builder
-#        with:
-#          rust-version: stable
-#      - name: Run tests (excluding doctests)
-#        run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests
-#      - name: Verify Working Directory Clean
-#        run: git diff --exit-code
-#      - name: Cleanup
-#        run: cargo clean
+  # Run extended tests (with feature 'extended_tests')
+  linux-test-extended:
+    name: cargo test 'extended_tests' (amd64)
+    needs: linux-build-lib
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/[email protected]
+      - name: Install Rust
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+          source $HOME/.cargo/env
+          rustup default stable
+      - name: Install Protobuf Compiler
+        run: sudo apt-get install -y protobuf-compiler
+      # For debugging, test binaries can be large.
+      - name: Show available disk space
+        run: |
+          df -h
+      - name: Run tests (excluding doctests)
+        env:
+          RUST_BACKTRACE: 1
+        run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests
+      - name: Verify Working Directory Clean
+        run: git diff --exit-code
+      - name: Cleanup
+        run: cargo clean
 
   # Check answers are correct when hash values collide
   hash-collisions:
@@ -95,7 +106,7 @@ jobs:
       - name: Run tests
         run: |
           cd datafusion
-          cargo test  --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro,extended_tests
+          cargo test  --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro
           cargo clean
 
   sqllogictest-sqlite:

diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
@@ -21,12 +21,14 @@
 //! This file is organized as:
 //! - Test runners that spawn individual test processes
 //! - Test cases that contain the actual validation logic
-use std::{process::Command, str};
-
 use log::info;
+use std::sync::Once;
+use std::{process::Command, str};
 
 use crate::memory_limit::memory_limit_validation::utils;
 
+static INIT: Once = Once::new();
+
 // ===========================================================================
 // Test runners:
 // Runners are splitted into multiple tests to run in parallel
@@ -67,10 +69,35 @@ fn sort_with_mem_limit_2_cols_2_runner() {
     spawn_test_process("sort_with_mem_limit_2_cols_2");
 }
 
+/// `spawn_test_process` might trigger multiple recompilation, and the test binary
+/// size might grow indifinitely. This initilizer ensures recompilation is only done
+/// once and the target size is bounded.
+///
+/// TODO: This is a hack, can be cleaned up if we have a better way to let multiple
+/// test cases run in different processes (instead of different threads by default)
+fn init_once() {
+    INIT.call_once(|| {
+        let _ = Command::new("cargo")
+            .arg("test")
+            .arg("--no-run")
+            .arg("--package")
+            .arg("datafusion")
+            .arg("--test")
+            .arg("core_integration")
+            .arg("--features")
+            .arg("extended_tests")
+            .env("DATAFUSION_TEST_MEM_LIMIT_VALIDATION", "1")
+            .output()
+            .expect("Failed to execute test command");
+    });
+}
+
 /// Helper function that executes a test in a separate process with the required environment
 /// variable set. Memory limit validation tasks need to measure memory resident set
 /// size (RSS), so they must run in a separate process.
 fn spawn_test_process(test: &str) {
+    init_once();
+
     let test_path = format!(
         "memory_limit::memory_limit_validation::sort_mem_validation::{}",
         test