From 2d3e0362bac428f7409f8e9033ea707ea1cbc3ae Mon Sep 17 00:00:00 2001 From: Pete Bachant Date: Mon, 6 Oct 2025 10:47:44 -0700 Subject: [PATCH] Rename CUDA kernels in benchmarks --- .buildkite/Manifest-v1.11.toml | 2 +- .buildkite/pipeline.yml | 28 ++++------------------ perf/benchmark_step.jl | 43 +++++++++++++++++++++++++++------- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/.buildkite/Manifest-v1.11.toml b/.buildkite/Manifest-v1.11.toml index ff5c48ee1b..45d24d916f 100644 --- a/.buildkite/Manifest-v1.11.toml +++ b/.buildkite/Manifest-v1.11.toml @@ -408,7 +408,7 @@ weakdeps = ["CUDA", "MPI"] deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"] git-tree-sha1 = "344711aa776e0bbd007ad127e5ba9f2113a1c88b" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.14.41" +version = "0.14.42" weakdeps = ["CUDA", "Krylov"] [deps.ClimaCore.extensions] diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 16c2ed66da..137edc9c07 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -20,7 +20,7 @@ steps: - label: "init :computer:" key: "init_cpu_env" concurrency: 1 - concurrency_group: 'depot/climaatmos-ci' + concurrency_group: "depot/climaatmos-ci" command: - "echo $$JULIA_DEPOT_PATH" @@ -41,13 +41,11 @@ steps: - group: "Reproducibility infrastructure" steps: - - label: ":computer: Test reproducibility infrastructure" command: "julia --color=yes --project=.buildkite test/unit_reproducibility_infra.jl" - group: "Radiation" steps: - - label: ":computer: single column radiative equilibrium gray" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -98,7 +96,6 @@ steps: - group: "Gravity wave" steps: - - label: ":computer: non-orographic gravity wave parameterization unit test 3d" command: "julia --color=yes --project=.buildkite test/parameterized_tendencies/gravity_wave/non_orographic_gravity_wave/nogw_test_3d.jl" artifact_paths: "nonorographic_gravity_wave_test_3d/*" @@ -132,7 +129,6 @@ steps: - group: "Column Examples" steps: - - label: ":computer: single column hydrostatic balance float64" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -149,7 +145,6 @@ steps: - group: "Box Examples" steps: - - label: ":computer: Box hydrostatic balance" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -188,7 +183,6 @@ steps: - group: "Plane Examples" steps: - - label: ":computer: Density current experiment" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -287,7 +281,6 @@ steps: - group: "Conservation check" steps: - - label: ":computer: baroclinic wave check conservation" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_conservation.yml @@ -324,7 +317,6 @@ steps: - group: "Sphere Examples (Dycore)" steps: - - label: ":computer: hydrostatic balance float64" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -394,7 +386,6 @@ steps: - group: "Sphere Examples (Aquaplanet)" steps: - - label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -462,7 +453,6 @@ steps: - group: "Sphere Examples (Topography)" steps: - - label: ":computer: baroclinic wave topography (dcmip)" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -489,7 +479,6 @@ steps: - group: "Restarting" steps: - - label: ":computer: test restart" command: > julia --color=yes --project=.buildkite test/restart.jl @@ -554,7 +543,6 @@ steps: - group: "MPI Examples" steps: - - label: ":computer: Prep restart for MPI" key: "mpi_baro_wave_make_restart" command: > @@ -624,7 +612,6 @@ steps: - group: "EDOnlyEDMFX" steps: - - label: ":man_in_business_suit_levitating: EDOnly EDMFX aquaplanet" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -647,7 +634,6 @@ steps: - group: "Diagnostic EDMFX" steps: - - label: ":genie: Diagnostic EDMFX test in a box" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -746,14 +732,12 @@ steps: - group: "Prognostic EDMFX" steps: - - label: ":genie: Prognostic EDMFX advection test in a column" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/prognostic_edmfx_adv_test_column.yml --job_id prognostic_edmfx_adv_test_column artifact_paths: "prognostic_edmfx_adv_test_column/output_active/*" - agents: slurm_mem: 20GB @@ -948,7 +932,6 @@ steps: - group: "Autodiff" steps: - - label: "baroclinic wave moist check conservation float64 sparse autodiff" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_ft64_sparse_autodiff.yml @@ -1044,7 +1027,6 @@ steps: - group: "GPU" steps: - - label: "GPU: Gravity waves" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -1113,7 +1095,6 @@ steps: - "baroclinic_wave" - "baroclinic_wave_gpu" - - label: "GPU: baroclinic wave - 2 gpus" key: "baroclinic_wave_2gpu" command: @@ -1233,7 +1214,6 @@ steps: - group: "Benchmarks" steps: - - label: ":computer: Benchmark: CPU baroclinic wave moist" command: > julia --color=yes --project=.buildkite perf/benchmark_step.jl @@ -1251,6 +1231,7 @@ steps: artifact_paths: "bm_baroclinic_wave_moist_gpu/output_active/*" env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 16G slurm_gpus: 1 @@ -1272,6 +1253,7 @@ steps: --job_id bm_default_gpu env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 24GB slurm_gpus: 1 @@ -1284,6 +1266,7 @@ steps: --job_id bm_diag_edmf_gpu env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 24GB slurm_gpus: 1 @@ -1296,13 +1279,13 @@ steps: --job_id bm_prog_edmf_gpu env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 24GB slurm_gpus: 1 - group: "Flame graphs" steps: - - label: ":fire: Flame graph: gpu job" command: > julia --color=yes --project=.buildkite perf/flame.jl @@ -1420,7 +1403,6 @@ steps: - group: "Checkbounds/Inference/Invalidations" steps: - # TODO: we should somehow decouple this unit test from the perf env / scripts # Checkbounds - label: ":computer: checkbounds" diff --git a/perf/benchmark_step.jl b/perf/benchmark_step.jl index b5d717e9e5..3d5424573d 100644 --- a/perf/benchmark_step.jl +++ b/perf/benchmark_step.jl @@ -17,7 +17,7 @@ ClimaComms.@import_required_backends import Random Random.seed!(1234) import ClimaAtmos as CA -import ClimaComms +import CUDA include("common.jl") (; config_file, job_id) = CA.commandline_kwargs() @@ -26,17 +26,44 @@ config = CA.AtmosConfig(config_file; job_id) simulation = CA.get_simulation(config) (; integrator) = simulation; Y₀ = deepcopy(integrator.u); +# Run one step to compile @info "Compiling benchmark_step!..." -CA.benchmark_step!(integrator, Y₀); # compile first +CA.benchmark_step!(integrator, Y₀); @info "Running benchmark_step!..." -n_steps = 10 comms_ctx = ClimaComms.context(integrator.u.c) device = ClimaComms.device(comms_ctx) -local e -s = CA.@timed_str begin - e = ClimaComms.elapsed(device) do - CA.benchmark_step!(integrator, Y₀, n_steps) # run + +# If we're running on CUDA, use CUDA's profiler +if device isa ClimaComms.CUDADevice + e = 0.0 + n_steps = 5 + use_external_profiler = CUDA.Profile.detect_cupti() + if use_external_profiler + @info "Using external CUDA profiler" + CUDA.@profile external = true begin + e = CUDA.@elapsed begin + CA.benchmark_step!(integrator, Y₀, n_steps) + end + end + else + @info "Using internal CUDA profiler" + res = CUDA.@profile external = false begin + e = CUDA.@elapsed begin + CA.benchmark_step!(integrator, Y₀, n_steps) + end + end + show(IOContext(stdout, :limit => false), res) + end + @info "Ran step! with CUDA $n_steps times in $e s, ($(CA.prettytime(e/n_steps*1e9)) per step)" +else + # Profile with Julia's built-in profiler + n_steps = 10 + local e + s = CA.@timed_str begin + e = ClimaComms.elapsed(device) do + CA.benchmark_step!(integrator, Y₀, n_steps) # run + end end + @info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)" end -@info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)"