Skip to content

Commit 0696123

Browse files
graeme-a-stewartJohannes Elmsheusergraeme-of-cern
authored
Nvidia monitor fixes (#239)
* Update output parsing for nvidiamon * Update test files for nvidia-smi parsing Update to the new nvidia-smi pmon output fields Add a pycuda GPU burner script for tests * Parse ccpm field as string This can be a "-" instead of 0 * Update precook script and precooked outputs Ensure precooked values are fixed to what we want * Fix hash-bang and mode on GPU burner * Python reformatting With latest versions of black and flake8 There is one import in gpu-burner.py that is needed (pycuda.autoinit) as it has side effects, so this is marked as exempt for flake8 --------- Co-authored-by: Johannes Elmsheuser <[email protected]> Co-authored-by: Graeme Stewart <[email protected]>
1 parent 05939d1 commit 0696123

File tree

8 files changed

+65
-17
lines changed

8 files changed

+65
-17
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,4 +254,4 @@ to CMake using `Gperftools_ROOT_DIR`.
254254

255255
# Copyright
256256

257-
Copyright (c) 2018-2023 CERN.
257+
Copyright (c) 2018-2024 CERN.

package/scripts/gpu-burner.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#! /usr/bin/env python3
2+
#
3+
# This is a slightly adapted "hello, world" script from
4+
# pycuda, that can be used for stressing a CUDA GPU for
5+
# tests
6+
#
7+
# pycuda is required!
8+
#
9+
10+
import pycuda.autoinit # noqa: F401
11+
import pycuda.driver as drv
12+
import numpy
13+
from time import time
14+
15+
from pycuda.compiler import SourceModule
16+
17+
mod = SourceModule(
18+
"""
19+
__global__ void multiply_them(float *dest, float *a, float *b, float *c)
20+
{
21+
const int i = threadIdx.x;
22+
dest[i] = a[i] * b[i] + c[i];
23+
}
24+
"""
25+
)
26+
27+
multiply_them = mod.get_function("multiply_them")
28+
29+
a = numpy.random.randn(1024).astype(numpy.float32)
30+
b = numpy.random.randn(1024).astype(numpy.float32)
31+
c = numpy.random.randn(1024).astype(numpy.float32)
32+
33+
dest = numpy.zeros_like(a)
34+
35+
start = time()
36+
while time() - start < 20:
37+
multiply_them(
38+
drv.Out(dest), drv.In(a), drv.In(b), drv.In(c), block=(1024, 1, 1), grid=(1, 1)
39+
)
40+
41+
# Residual check: the kernel stores a[i] * b[i] + c[i] in dest, so subtracting
# the same expression computed on the host should print values close to zero.
# The parentheses matter: without them the expression is (dest - a * b) + c,
# which is roughly 2 * c rather than a residual.
print(dest - (a * b + c))

package/scripts/precook_test.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,19 +95,24 @@ def make_net(proc_net, fixed_value, rand=False):
9595

9696
def make_nvidia(proc_nvidia, fixed_value, rand=False):
9797
# idx
98+
print(proc_nvidia, fixed_value, rand)
9899
smi_fname = os.path.join(proc_nvidia, "smi")
100+
pct_lim = 100
99101
memory_lim = 10000
100102
with open(smi_fname, "w") as f:
101103
params = [
102104
0, # idx
103105
pid, # pid
104106
"G", # type
105-
random.randint(0, memory_lim) if rand else fixed_value, # sm
106-
random.randint(0, memory_lim) if rand else fixed_value, # mem
107-
# enc, dec are not monitored metrics
108-
0, # enc
109-
0, # dec
107+
random.randint(0, pct_lim) if rand else fixed_value, # sm
108+
random.randint(0, pct_lim) if rand else fixed_value, # mem
109+
# The following are not monitored metrics
110+
"-", # enc
111+
"-", # dec
112+
"-", # jpg
113+
"-", # ofa
110114
random.randint(0, memory_lim) if rand else fixed_value, # fb
115+
0, # ccpm
111116
"python3", # command
112117
]
113118
for param in params:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0 1729 G 50 50 0 0 50 python3
1+
0 1729 G 50 50 - - - - 50 0 python3
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0 1729 G 100 100 0 0 100 python3
1+
0 1729 G 100 100 - - - - 100 0 python3
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0 1729 G 20 20 0 0 20 python3
1+
0 1729 G 20 20 - - - - 20 0 python3

package/scripts/prmon_compress_output.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,11 @@ def main():
122122

123123
parser.add_argument(
124124
"--precision",
125-
type=lambda x: float(x)
126-
if 0 < float(x) < 1
127-
else parser.exit(-1, "Precision must be strictly between 0 and 1"),
125+
type=lambda x: (
126+
float(x)
127+
if 0 < float(x) < 1
128+
else parser.exit(-1, "Precision must be strictly between 0 and 1")
129+
),
128130
default=0.05,
129131
help="precision value for interpolation threshold",
130132
)

package/src/nvidiamon.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,20 +76,20 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
7676
// Loop over output
7777
unsigned int gpu_idx{}, sm{}, mem{}, fb_mem{};
7878
pid_t pid{};
79-
std::string enc{}, dec{}, cg_type{}, cmd_name{};
79+
std::string enc{}, dec{}, jpg{}, ofa{}, cg_type{}, ccpm{}, cmd_name{};
8080
std::unordered_map<unsigned int, bool>
8181
activegpus{}; // Avoid double counting active GPUs
8282
for (const auto& s : cmd_result.second) {
8383
if (s[0] == '#') continue;
8484
std::istringstream instr(s);
85-
instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> fb_mem >>
86-
cmd_name;
85+
instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> jpg >> ofa >> fb_mem >>
86+
ccpm >> cmd_name;
8787
auto read_ok = !(instr.fail() || instr.bad()); // eof() is ok
8888
if (read_ok) {
8989
if (log_level <= spdlog::level::debug) {
9090
std::stringstream strm;
9191
strm << "Good read: " << gpu_idx << " " << pid << " " << cg_type << " "
92-
<< sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
92+
<< sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
9393
<< " " << cmd_name << std::endl;
9494
debug(strm.str());
9595
}
@@ -115,7 +115,7 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
115115
std::stringstream strm;
116116
strm << "Bad read of line: " << s << std::endl;
117117
strm << "Parsed to: " << gpu_idx << " " << pid << " " << cg_type << " "
118-
<< sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
118+
<< sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
119119
<< " " << cmd_name << std::endl;
120120

121121
strm << "StringStream status: good()=" << instr.good();

0 commit comments

Comments
 (0)