Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/Manifest-v1.11.toml
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ weakdeps = ["CUDA", "MPI"]
deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"]
git-tree-sha1 = "344711aa776e0bbd007ad127e5ba9f2113a1c88b"
uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
version = "0.14.41"
version = "0.14.42"
weakdeps = ["CUDA", "Krylov"]

[deps.ClimaCore.extensions]
Expand Down
28 changes: 5 additions & 23 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ steps:
- label: "init :computer:"
key: "init_cpu_env"
concurrency: 1
concurrency_group: 'depot/climaatmos-ci'
concurrency_group: "depot/climaatmos-ci"
command:
- "echo $$JULIA_DEPOT_PATH"

Expand All @@ -41,13 +41,11 @@ steps:

- group: "Reproducibility infrastructure"
steps:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes were made by a YAML auto-formatter in VS Code. Is there a style guide I might be breaking here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure... this is something I have been wondering as well. I considered following this example, which is used in Buildkite's docs.

- label: ":computer: Test reproducibility infrastructure"
command: "julia --color=yes --project=.buildkite test/unit_reproducibility_infra.jl"

- group: "Radiation"
steps:

- label: ":computer: single column radiative equilibrium gray"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -98,7 +96,6 @@ steps:

- group: "Gravity wave"
steps:

- label: ":computer: non-orographic gravity wave parameterization unit test 3d"
command: "julia --color=yes --project=.buildkite test/parameterized_tendencies/gravity_wave/non_orographic_gravity_wave/nogw_test_3d.jl"
artifact_paths: "nonorographic_gravity_wave_test_3d/*"
Expand Down Expand Up @@ -132,7 +129,6 @@ steps:

- group: "Column Examples"
steps:

- label: ":computer: single column hydrostatic balance float64"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -149,7 +145,6 @@ steps:

- group: "Box Examples"
steps:

- label: ":computer: Box hydrostatic balance"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -188,7 +183,6 @@ steps:

- group: "Plane Examples"
steps:

- label: ":computer: Density current experiment"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -287,7 +281,6 @@ steps:

- group: "Conservation check"
steps:

- label: ":computer: baroclinic wave check conservation"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_conservation.yml
Expand Down Expand Up @@ -324,7 +317,6 @@ steps:

- group: "Sphere Examples (Dycore)"
steps:

- label: ":computer: hydrostatic balance float64"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -394,7 +386,6 @@ steps:

- group: "Sphere Examples (Aquaplanet)"
steps:

- label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -462,7 +453,6 @@ steps:

- group: "Sphere Examples (Topography)"
steps:

- label: ":computer: baroclinic wave topography (dcmip)"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -489,7 +479,6 @@ steps:

- group: "Restarting"
steps:

- label: ":computer: test restart"
command: >
julia --color=yes --project=.buildkite test/restart.jl
Expand Down Expand Up @@ -554,7 +543,6 @@ steps:

- group: "MPI Examples"
steps:

- label: ":computer: Prep restart for MPI"
key: "mpi_baro_wave_make_restart"
command: >
Expand Down Expand Up @@ -624,7 +612,6 @@ steps:

- group: "EDOnlyEDMFX"
steps:

- label: ":man_in_business_suit_levitating: EDOnly EDMFX aquaplanet"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -647,7 +634,6 @@ steps:

- group: "Diagnostic EDMFX"
steps:

- label: ":genie: Diagnostic EDMFX test in a box"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -746,14 +732,12 @@ steps:

- group: "Prognostic EDMFX"
steps:

- label: ":genie: Prognostic EDMFX advection test in a column"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file $CONFIG_PATH/prognostic_edmfx_adv_test_column.yml
--job_id prognostic_edmfx_adv_test_column
artifact_paths: "prognostic_edmfx_adv_test_column/output_active/*"

agents:
slurm_mem: 20GB

Expand Down Expand Up @@ -948,7 +932,6 @@ steps:

- group: "Autodiff"
steps:

- label: "baroclinic wave moist check conservation float64 sparse autodiff"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_ft64_sparse_autodiff.yml
Expand Down Expand Up @@ -1044,7 +1027,6 @@ steps:

- group: "GPU"
steps:

- label: "GPU: Gravity waves"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -1113,7 +1095,6 @@ steps:
- "baroclinic_wave"
- "baroclinic_wave_gpu"


- label: "GPU: baroclinic wave - 2 gpus"
key: "baroclinic_wave_2gpu"
command:
Expand Down Expand Up @@ -1233,7 +1214,6 @@ steps:

- group: "Benchmarks"
steps:

- label: ":computer: Benchmark: CPU baroclinic wave moist"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this label is incorrect

command: >
julia --color=yes --project=.buildkite perf/benchmark_step.jl
Expand All @@ -1251,6 +1231,7 @@ steps:
artifact_paths: "bm_baroclinic_wave_moist_gpu/output_active/*"
env:
CLIMACOMMS_DEVICE: "CUDA"
CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true"
agents:
slurm_mem: 16G
slurm_gpus: 1
Expand All @@ -1272,6 +1253,7 @@ steps:
--job_id bm_default_gpu
env:
CLIMACOMMS_DEVICE: "CUDA"
CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true"
agents:
slurm_mem: 24GB
slurm_gpus: 1
Expand All @@ -1284,6 +1266,7 @@ steps:
--job_id bm_diag_edmf_gpu
env:
CLIMACOMMS_DEVICE: "CUDA"
CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true"
agents:
slurm_mem: 24GB
slurm_gpus: 1
Expand All @@ -1296,13 +1279,13 @@ steps:
--job_id bm_prog_edmf_gpu
env:
CLIMACOMMS_DEVICE: "CUDA"
CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true"
agents:
slurm_mem: 24GB
slurm_gpus: 1

- group: "Flame graphs"
steps:

- label: ":fire: Flame graph: gpu job"
command: >
julia --color=yes --project=.buildkite perf/flame.jl
Expand Down Expand Up @@ -1420,7 +1403,6 @@ steps:

- group: "Checkbounds/Inference/Invalidations"
steps:

# TODO: we should somehow decouple this unit test from the perf env / scripts
# Checkbounds
- label: ":computer: checkbounds"
Expand Down
43 changes: 35 additions & 8 deletions perf/benchmark_step.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ClimaComms.@import_required_backends
import Random
Random.seed!(1234)
import ClimaAtmos as CA
import ClimaComms
import CUDA

include("common.jl")
(; config_file, job_id) = CA.commandline_kwargs()
Expand All @@ -26,17 +26,44 @@ config = CA.AtmosConfig(config_file; job_id)
simulation = CA.get_simulation(config)
(; integrator) = simulation;
Y₀ = deepcopy(integrator.u);
# Run one step to compile
@info "Compiling benchmark_step!..."
CA.benchmark_step!(integrator, Y₀); # compile first
CA.benchmark_step!(integrator, Y₀);

@info "Running benchmark_step!..."
n_steps = 10
comms_ctx = ClimaComms.context(integrator.u.c)
device = ClimaComms.device(comms_ctx)
local e
s = CA.@timed_str begin
e = ClimaComms.elapsed(device) do
CA.benchmark_step!(integrator, Y₀, n_steps) # run

# If we're running on CUDA, use CUDA's profiler
if device isa ClimaComms.CUDADevice
e = 0.0
n_steps = 5
use_external_profiler = CUDA.Profile.detect_cupti()
if use_external_profiler
@info "Using external CUDA profiler"
CUDA.@profile external = true begin
e = CUDA.@elapsed begin
CA.benchmark_step!(integrator, Y₀, n_steps)
end
end
else
@info "Using internal CUDA profiler"
res = CUDA.@profile external = false begin
e = CUDA.@elapsed begin
CA.benchmark_step!(integrator, Y₀, n_steps)
end
end
show(IOContext(stdout, :limit => false), res)
end
@info "Ran step! with CUDA $n_steps times in $e s, ($(CA.prettytime(e/n_steps*1e9)) per step)"
else
# Not running on CUDA: time the steps with ClimaComms.elapsed (no profiler is invoked here)
n_steps = 10
local e
s = CA.@timed_str begin
e = ClimaComms.elapsed(device) do
CA.benchmark_step!(integrator, Y₀, n_steps) # run
end
end
@info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)"
end
@info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)"
Loading