Skip to content

Commit b4a4542

Browse files
committed
Test fixes for Multi-Instance GPU (MIG) devices.
1 parent 0e9ad9d commit b4a4542

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

test/nvml.jl

+5-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ macro maybe_unsupported(ex)
55
try
66
$(esc(ex))
77
catch err
8-
(isa(err, NVMLError) && err.code == NVML.ERROR_NOT_SUPPORTED) || rethrow()
8+
# XXX: we get a permission error when trying to access an entire device
9+
# when we've actually been assigned a compute instance (MIG);
10+
# but how do we know if the current device is an instance?
11+
(isa(err, NVML.NVMLError) && err.code in [NVML.ERROR_NOT_SUPPORTED, NVML.NVML_ERROR_NO_PERMISSION]) || rethrow()
912
end
1013
end
1114
end
@@ -25,7 +28,7 @@ end
2528

2629
@test NVML.uuid(nvml_dev) == uuid(cuda_dev)
2730
NVML.brand(nvml_dev)
28-
@test NVML.name(nvml_dev) == name(cuda_dev)
31+
@test occursin(NVML.name(nvml_dev), name(cuda_dev)) # entire device vs MIG
2932
@maybe_unsupported NVML.serial(nvml_dev)
3033

3134
@maybe_unsupported NVML.power_usage(nvml_dev)

test/setup.jl

+4-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,10 @@ function runtests(f, name, time_source=:cuda, snoop=nothing)
8787
missing
8888
end
8989
catch err
90-
(isa(err, NVML.NVMLError) && err.code == NVML.ERROR_NOT_SUPPORTED) || rethrow()
90+
# XXX: we get a permission error when trying to access an entire device
91+
# when we've actually been assigned a compute instance (MIG);
92+
# but how do we know if the current device is an instance?
93+
(isa(err, NVML.NVMLError) && err.code in [NVML.ERROR_NOT_SUPPORTED, NVML.NVML_ERROR_NO_PERMISSION]) || rethrow()
9194
missing
9295
end
9396
else

0 commit comments

Comments
 (0)