Nvidia monitor fixes (#239)

graeme-a-stewart · Johannes Elmsheuser · graeme-of-cern · web-flow · commit 0696123125da · 2024-05-03T15:12:14.000+02:00
* Update output parsing for nvidiamon

* Update test files for nvidia-smi parsing

Update to the new nvidia-smi pmon output fields
Add a pycuda GPU burner script for tests

* Parse ccpm field as string

This can be a "-" instead of 0

* Update precook script and precooked outputs

Ensure precooked values are fixed to what we want

* Fix hash-bang and mode on GPU burner

* Python reformatting

With the latest versions of black and flake8

There is one import in gpu-burner.py that is needed (pycuda.autoinit)
as it has side effects, so it is marked as exempt for flake8

---------

Co-authored-by: Johannes Elmsheuser <johannes.elmsheuser@cern.ch>
Co-authored-by: Graeme Stewart <graemes.cern@gmail.com>
diff --git a/README.md b/README.md
@@ -254,4 +254,4 @@ to CMake using `Gperftools_ROOT_DIR`.
 
 # Copyright
 
-Copyright (c) 2018-2023 CERN.
+Copyright (c) 2018-2024 CERN.
diff --git a/package/scripts/gpu-burner.py b/package/scripts/gpu-burner.py
@@ -0,0 +1,41 @@
+#! /usr/bin/env python3
+#
+# This is a slightly adapted "hello, world" script from
+# pycuda, that can be used for stressing a CUDA GPU for
+# tests
+#
+# pycuda is required!
+#
+
+import pycuda.autoinit        # noqa: F401
+import pycuda.driver as drv
+import numpy
+from time import time
+
+from pycuda.compiler import SourceModule
+
+mod = SourceModule(
+    """
+__global__ void multiply_them(float *dest, float *a, float *b, float *c)
+{
+  const int i = threadIdx.x;
+  dest[i] = a[i] * b[i] + c[i];
+}
+"""
+)
+
+multiply_them = mod.get_function("multiply_them")
+
+a = numpy.random.randn(1024).astype(numpy.float32)
+b = numpy.random.randn(1024).astype(numpy.float32)
+c = numpy.random.randn(1024).astype(numpy.float32)
+
+dest = numpy.zeros_like(a)
+
+start = time()
+while time() - start < 20:
+    multiply_them(
+        drv.Out(dest), drv.In(a), drv.In(b), drv.In(c), block=(1024, 1, 1), grid=(1, 1)
+    )
+
+print(dest - a * b + c)
diff --git a/package/scripts/precook_test.py b/package/scripts/precook_test.py
@@ -95,19 +95,24 @@ def make_net(proc_net, fixed_value, rand=False):
 
 def make_nvidia(proc_nvidia, fixed_value, rand=False):
     # idx
+    print(proc_nvidia, fixed_value, rand)
     smi_fname = os.path.join(proc_nvidia, "smi")
+    pct_lim = 100
     memory_lim = 10000
     with open(smi_fname, "w") as f:
         params = [
             0,  # idx
             pid,  # pid
             "G",  # type
-            random.randint(0, memory_lim) if rand else fixed_value,  # sm
-            random.randint(0, memory_lim) if rand else fixed_value,  # mem
-            # enc, dec are not monitored metrics
-            0,  # enc
-            0,  # dec
+            random.randint(0, pct_lim) if rand else fixed_value,  # sm
+            random.randint(0, pct_lim) if rand else fixed_value,  # mem
+            # The following are not monitored metrics
+            "-",  # enc
+            "-",  # dec
+            "-",  # jpg
+            "-",  # ofa
             random.randint(0, memory_lim) if rand else fixed_value,  # fb
+            0,  # ccpm
             "python3",  # command
         ]
         for param in params:
diff --git a/package/scripts/precooked_tests/drop/1/nvidia/smi b/package/scripts/precooked_tests/drop/1/nvidia/smi
@@ -1 +1 @@
-0 1729 G 50 50 0 0 50 python3 
+0 1729 G 50 50 - - - - 50 0 python3 
diff --git a/package/scripts/precooked_tests/drop/2/nvidia/smi b/package/scripts/precooked_tests/drop/2/nvidia/smi
@@ -1 +1 @@
-0 1729 G 100 100 0 0 100 python3 
+0 1729 G 100 100 - - - - 100 0 python3 
diff --git a/package/scripts/precooked_tests/drop/3/nvidia/smi b/package/scripts/precooked_tests/drop/3/nvidia/smi
@@ -1 +1 @@
-0 1729 G 20 20 0 0 20 python3 
+0 1729 G 20 20 - - - - 20 0 python3 
diff --git a/package/scripts/prmon_compress_output.py b/package/scripts/prmon_compress_output.py
@@ -122,9 +122,11 @@ def main():
 
     parser.add_argument(
         "--precision",
-        type=lambda x: float(x)
-        if 0 < float(x) < 1
-        else parser.exit(-1, "Precision must be strictly between 0 and 1"),
+        type=lambda x: (
+            float(x)
+            if 0 < float(x) < 1
+            else parser.exit(-1, "Precision must be strictly between 0 and 1")
+        ),
         default=0.05,
         help="precision value for interpolation threshold",
     )
diff --git a/package/src/nvidiamon.cpp b/package/src/nvidiamon.cpp
@@ -76,20 +76,20 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
   // Loop over output
   unsigned int gpu_idx{}, sm{}, mem{}, fb_mem{};
   pid_t pid{};
-  std::string enc{}, dec{}, cg_type{}, cmd_name{};
+  std::string enc{}, dec{}, jpg{}, ofa{}, cg_type{}, ccpm{}, cmd_name{};
   std::unordered_map<unsigned int, bool>
       activegpus{};  // Avoid double counting active GPUs
   for (const auto& s : cmd_result.second) {
     if (s[0] == '#') continue;
     std::istringstream instr(s);
-    instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> fb_mem >>
-        cmd_name;
+    instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> jpg >> ofa >> fb_mem >>
+          ccpm  >> cmd_name;
     auto read_ok = !(instr.fail() || instr.bad());  // eof() is ok
     if (read_ok) {
       if (log_level <= spdlog::level::debug) {
         std::stringstream strm;
         strm << "Good read: " << gpu_idx << " " << pid << " " << cg_type << " "
-             << sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
+             << sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
              << " " << cmd_name << std::endl;
         debug(strm.str());
       }
@@ -115,7 +115,7 @@ void nvidiamon::update_stats(const std::vector<pid_t>& pids,
       std::stringstream strm;
       strm << "Bad read of line: " << s << std::endl;
       strm << "Parsed to: " << gpu_idx << " " << pid << " " << cg_type << " "
-           << sm << " " << mem << " " << enc << " " << dec << " " << fb_mem
+           << sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm
            << " " << cmd_name << std::endl;
 
       strm << "StringStream status: good()=" << instr.good();

Original file line number	Diff line number	Diff line change
@@ -254,4 +254,4 @@ to CMake using `Gperftools_ROOT_DIR`.
`254`	`254`
`255`	`255`	`# Copyright`
`256`	`256`
`257`		`-Copyright (c) 2018-2023 CERN.`
	`257`	`+Copyright (c) 2018-2024 CERN.`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-0 1729 G 50 50 0 0 50 python3`
	`1`	`+0 1729 G 50 50 - - - - 50 0 python3`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-0 1729 G 100 100 0 0 100 python3`
	`1`	`+0 1729 G 100 100 - - - - 100 0 python3`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-0 1729 G 20 20 0 0 20 python3`
	`1`	`+0 1729 G 20 20 - - - - 20 0 python3`