diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 0c05492f4..d5097ab0b 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 8b8e00b16..25102bd6d 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json index 3237e9648..dab3b22e9 100644 --- a/.devcontainer/cuda13.0-conda/devcontainer.json +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json index 4987bc5b2..dd09f0dba 100644 --- a/.devcontainer/cuda13.0-pip/devcontainer.json +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - 
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.github/actions/install-devcontainers-cli/action.yml b/.github/actions/install-devcontainers-cli/action.yml index e157e8164..2a14ffaf6 100644 --- a/.github/actions/install-devcontainers-cli/action.yml +++ b/.github/actions/install-devcontainers-cli/action.yml @@ -14,4 +14,4 @@ runs: run: | sudo apt update; sudo apt install -y --no-install-recommends build-essential; - npm install -g @devcontainers/cli@v0.76.0; + npm install -g @devcontainers/cli@v0.80.2; diff --git a/.github/workflows/build-all-rapids-repos.yml b/.github/workflows/build-all-rapids-repos.yml index 11d76b1c7..6a5d4a6c0 100644 --- a/.github/workflows/build-all-rapids-repos.yml +++ b/.github/workflows/build-all-rapids-repos.yml @@ -37,7 +37,7 @@ jobs: with: arch: '["amd64", "arm64"]' cuda: '["12.9", "13.0"]' - node_type: cpu8 + node_type: cpu16 rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN timeout-minutes: 720 # 1. 
Prohibit sccache from shutting down automatically diff --git a/features/src/rapids-build-utils/devcontainer-feature.json b/features/src/rapids-build-utils/devcontainer-feature.json index 393239585..0e88ace80 100644 --- a/features/src/rapids-build-utils/devcontainer-feature.json +++ b/features/src/rapids-build-utils/devcontainer-feature.json @@ -1,7 +1,7 @@ { "name": "NVIDIA RAPIDS devcontainer build utilities", "id": "rapids-build-utils", - "version": "25.12.0", + "version": "25.12.1", "description": "A feature to install the RAPIDS devcontainer build utilities", "containerEnv": { "BASH_ENV": "/etc/bash.bash_env" diff --git a/features/src/rapids-build-utils/opt/rapids-build-utils/bin/get-num-archs-jobs-and-load.sh b/features/src/rapids-build-utils/opt/rapids-build-utils/bin/get-num-archs-jobs-and-load.sh index c1035c758..aaa009ec4 100755 --- a/features/src/rapids-build-utils/opt/rapids-build-utils/bin/get-num-archs-jobs-and-load.sh +++ b/features/src/rapids-build-utils/opt/rapids-build-utils/bin/get-num-archs-jobs-and-load.sh @@ -77,7 +77,22 @@ get_num_archs_jobs_and_load() { fi if test "$parallel" -eq 0; then - parallel="$(ulimit -n)" + # Memory (in KiB) used by each `sccache ...` invocation from ninja + # * 1.5MiB for the shell launched by ninja + # * 6.5MiB for each sccache client process + local mem_per_sccache_client="$((1024 * 8))" + # It's usually around 400-600MiB, but be conservative + # and assume the sccache daemon will use 1GiB of RAM + local mem_for_sccache_daemon="$((1 * 1024 * 1024))" + # Preprocessor invocations take ~300MiB or so + local mem_for_preprocessor="$((n_cpus * 300 * 1024))" + # Available memory (in KiB) from the MemAvailable field of /proc/meminfo; for more details see free(1). + local mem_avail="$(awk '/^MemAvailable:/ { print $2; exit }' /proc/meminfo)" + # Total job count is available memory after accounting for `nproc` preprocessor calls + # divided by the amount of memory required to invoke the sccache thin client process. 
+ parallel="$(((mem_avail - mem_for_preprocessor - mem_for_sccache_daemon) / mem_per_sccache_client))" + # The estimate is zero or negative on low-memory hosts (or when MemAvailable is missing), so always allow at least one job. + test "$parallel" -ge 1 || parallel=1 fi local n_load="$((parallel > n_cpus ? n_cpus : parallel))"; diff --git a/features/src/utils/devcontainer-feature.json b/features/src/utils/devcontainer-feature.json index d2c82ba74..2dca8d0e9 100644 --- a/features/src/utils/devcontainer-feature.json +++ b/features/src/utils/devcontainer-feature.json @@ -1,7 +1,7 @@ { "name": "devcontainer-utils", "id": "utils", - "version": "25.12.1", + "version": "25.12.2", "description": "A feature to install RAPIDS devcontainer utility scripts", "containerEnv": { "BASH_ENV": "/etc/bash.bash_env" diff --git a/features/src/utils/opt/devcontainer/bin/sccache/dist/status.sh b/features/src/utils/opt/devcontainer/bin/sccache/dist/status.sh index 244d8b5e8..3e797b9aa 100755 --- a/features/src/utils/opt/devcontainer/bin/sccache/dist/status.sh +++ b/features/src/utils/opt/devcontainer/bin/sccache/dist/status.sh @@ -42,9 +42,11 @@ _sccache_dist_status() { c="${c:-${col_width:-${COLUMNS:-1000000000}}}"; if [[ "$f" != json ]] && ! test -n "${no_procs:+x}"; then + echo "open fds: $(ls "/proc/$(cat /tmp/sccache.*.pid 2>/dev/null | head -n1)/fd" 2>/dev/null | wc -l)" echo "sccache procs: $(pgrep sccache | wc -l)" fi + if [[ "$f" != json ]] && ! test -n "${no_temps:+x}"; then echo -n "preprocessed tempfiles: " echo -n "$(ls -All /tmp/.sccache_temp/.tmp* 2>/dev/null | wc -l) " @@ -56,6 +58,15 @@ _sccache_dist_status() { echo fi + if ! test -n "${no_stats:+x}"; then + if test "$f" = json; then + sccache --show-stats --stats-format json + else + sccache --show-stats + fi + echo + fi + # Print current dist status to verify we're connected sccache 2>/dev/null --dist-status \ | { @@ -84,14 +95,14 @@ _sccache_dist_status() { loading: .jobs.loading, pending: .jobs.pending, running: .jobs.running, - max: ((.max_job_age // 0) | tostring | . + "s"), + oldest: ((.max_job_age // 0) | tostring | . + "s"), accepted: .jobs.accepted, finished: .jobs.finished, seen: ((.u_time // 0) | tostring | . 
+ "s"), }; .SchedulerStatus as [\$x, \$y] | [ - (\$y + { id: \$x, type: "scheduler", u_time: (\$y.servers // {} | map(.u_time) | min | . // "-" | tostring), max_job_age: (\$y.servers // {} | map(.u_time + .max_job_age) | max | . // "-" | tostring) }), + (\$y + { id: \$x, type: "scheduler", u_time: (\$y.servers // {} | map(.u_time) | min | . // "-" | tostring), max_job_age: (\$y.servers // {} | map(.max_job_age) | max | . // "-" | tostring) }), (\$y.servers // [] | sort_by(.id)[]) ] | map(info_to_row) as \$rows @@ -115,15 +126,6 @@ EOF cat - fi } - - if ! test -n "${no_stats:+x}"; then - echo - if test "$f" = json; then - sccache --show-stats --stats-format json - else - sccache --show-stats - fi - fi fi } diff --git a/features/src/utils/opt/devcontainer/bin/sccache/start.sh b/features/src/utils/opt/devcontainer/bin/sccache/start.sh index c54148e45..3ea982ede 100755 --- a/features/src/utils/opt/devcontainer/bin/sccache/start.sh +++ b/features/src/utils/opt/devcontainer/bin/sccache/start.sh @@ -41,6 +41,8 @@ _start_sccache() { if test -n "${f:-${foreground:+x}}"; then # Unset this so sccache outputs to stderr unset SCCACHE_ERROR_LOG; + # Raise the soft open file limit to the hard limit so users can do `make -j$(ulimit -n)` + ulimit -n "$(ulimit -Hn)"; # Start the sccache server in the foreground RUST_LOG_STYLE="always" \ SCCACHE_NO_DAEMON=1 \ @@ -62,7 +64,9 @@ _start_sccache() { sccache --start-server 1>&2 2>/dev/null \ | tee "$logfile"; # Write the pid to the pidfile - pgrep sccache | sort -nr | head -n1 | tee "${pidfile}" >/dev/null; + pgrep sccache | sort -n | head -n1 | tee "${pidfile}" >/dev/null; + # Raise the running server's soft open file limit to the hard limit so users can do `make -j$(ulimit -n)` + prlimit --nofile="$(ulimit -Hn):$(ulimit -Hn)" --pid "$(cat "${pidfile}")"; echo "=== sccache logfile: $logfile ===" >&2; echo "=== sccache pidfile: $pidfile ===" >&2; fi