
feat(profiling): add support for pytorch profiling #9154

Merged Dec 13, 2024 (114 commits; diff shows changes from 75 commits)

Commits
363064e
Port some of the original PR to new libdatadog
sanchda May 3, 2024
570f5fa
Add some of the pytorch stuff
sanchda May 3, 2024
48e389b
Remove unused tag
sanchda May 3, 2024
6f6eabb
Merge branch 'main' into peterg17/pytorch_profiling_integration2
peterg17 May 21, 2024
4770f40
[PROF-9710] Add instrumentation for torch.profiler.
peterg17 May 23, 2024
78c5b6b
[PROF-9710] Format profiling module with black.
peterg17 May 23, 2024
137e66f
fixup! Add some of the pytorch stuff
peterg17 May 23, 2024
193d8e0
fixup! [PROF-9710] Add instrumentation for torch.profiler.
peterg17 May 23, 2024
17f18ec
fixup! [PROF-9710] Add instrumentation for torch.profiler.
peterg17 May 23, 2024
a7d52f9
fixup! [PROF-9710] Add instrumentation for torch.profiler.
peterg17 May 24, 2024
7c598b0
fixup! [PROF-9710] Add instrumentation for torch.profiler.
peterg17 May 26, 2024
56fa87f
[PROF-9710] Add pytorch integration release note.
peterg17 May 28, 2024
2dfabe3
Merge branch 'main' into peterg17/pytorch_profiling_integration2
sanchda May 30, 2024
ac7c7c2
Merge branch 'main' into peterg17/pytorch_profiling_integration2
sanchda Jun 4, 2024
f90dffb
Some fixups
sanchda Jun 4, 2024
2c579f5
Add better type management
sanchda Jun 4, 2024
a671241
Merge branch 'main' into peterg17/pytorch_profiling_integration2
sanchda Jun 5, 2024
6b3bfe6
Merge branch 'main' into peterg17/pytorch_profiling_integration2
peterg17 Jul 9, 2024
c9940a2
Fix leftover merge conflict, format.
peterg17 Jul 9, 2024
c0d8ef9
fixup! Fix leftover merge conflict, format.
peterg17 Jul 9, 2024
46f46f1
fixup! Fix leftover merge conflict, format.
peterg17 Jul 9, 2024
d331e26
fixup! Fix leftover merge conflict, format.
peterg17 Jul 9, 2024
824d109
Apply ruff format fix for _ddup.pyi.
peterg17 Jul 9, 2024
1aaf847
fixup! Fix leftover merge conflict, format.
peterg17 Jul 9, 2024
514fec6
fixup! Fix leftover merge conflict, format.
peterg17 Jul 9, 2024
f00352c
Fix type annotation warning.
peterg17 Jul 9, 2024
380b5d7
Fix default pytorch config value (needs to be False).
peterg17 Jul 10, 2024
3e906ae
Add documentation for PyTorch profiling integration.
peterg17 Jul 10, 2024
4047701
fixup! Add documentation for PyTorch profiling integration.
peterg17 Jul 10, 2024
c38b7d2
Revert profiling changes unrelated to PyTorch.
peterg17 Jul 10, 2024
09b36c3
Add pytorch profiler terms to docs spelling word list.
peterg17 Jul 10, 2024
55784df
Fix pytorch documentation spelling warnings.
peterg17 Jul 10, 2024
4cb74e7
Improve PyTorch documentation wording.
peterg17 Jul 10, 2024
d0637e1
Remove unused parameters from pytorch wrapper.
peterg17 Jul 10, 2024
df721ab
fixup! Remove unused parameters from pytorch wrapper.
peterg17 Jul 10, 2024
00d3270
fixup! Improve PyTorch documentation wording.
peterg17 Jul 10, 2024
6d7c1f2
Improve doc formatting.
peterg17 Jul 10, 2024
5e4eb0d
fixup! Improve doc formatting.
peterg17 Jul 10, 2024
1f76aa3
fixup! Improve doc formatting.
peterg17 Jul 10, 2024
96a746a
[PROF-9710] Cleanup from PR feedback.
peterg17 Jul 11, 2024
28ed224
fixup! [PROF-9710] Cleanup from PR feedback.
peterg17 Jul 11, 2024
17e5ef5
Merge branch 'main' into peterg17/pytorch_profiling_integration2
sanchda Jul 12, 2024
a24383e
Update docs/advanced_usage.rst
danielsn Dec 9, 2024
5adcc86
Merge branch 'main' into peterg17/pytorch_profiling_integration2
danielsn Dec 9, 2024
58d4133
Merge branch 'main' into peterg17/pytorch_profiling_integration2
danielsn Dec 9, 2024
2857bd4
format and import wrapt
danielsn Dec 9, 2024
8eeeeb9
fix ProfilingConfig
danielsn Dec 9, 2024
a753fdf
[PROF-9710] Add pytorch gpu tests Github Workflow.
peterg17 Dec 10, 2024
e80bbab
[PROF-9710] Fix pytorch CI.
peterg17 Dec 10, 2024
a2201ea
[PROF-9710] Try fixing pytorch CI again.
peterg17 Dec 10, 2024
39f7fbe
get rid of attrs, and use the correct name for get/set_original
danielsn Dec 10, 2024
2a52598
make ruff happy
danielsn Dec 10, 2024
b6f1d26
ci(profiling): add torch dependency to pytorch CI test.
peterg17 Dec 10, 2024
a0754b3
ci(profiling): add torchvision dependency for pytorch CI test.
peterg17 Dec 10, 2024
9c82135
PR comments
danielsn Dec 10, 2024
712a74d
fix wrong variable checked for null
danielsn Dec 10, 2024
5f0f3e4
nicer refactor of sample collection
danielsn Dec 10, 2024
3cd874f
nicer comment formatting
danielsn Dec 10, 2024
c2c1e41
ci(profiling): add pytorch cpu test in CI for integration.
peterg17 Dec 10, 2024
f23dd74
Log interesting events
danielsn Dec 10, 2024
092c23c
log the event if it had no data
danielsn Dec 10, 2024
8e58455
ci(profiling): see pytorch test output in stdout.
peterg17 Dec 10, 2024
71ef8b0
ci(profiling): print out pytorch test stdout.
peterg17 Dec 10, 2024
754d1fe
super init class
danielsn Dec 10, 2024
eabbf1d
simplify super:
danielsn Dec 10, 2024
61709c2
pass recorder not tracer
danielsn Dec 10, 2024
69c5962
ci(profiling): import ddtrace auto into pytorch test scripts.
peterg17 Dec 10, 2024
58642c0
ci(profiling): ignore ruff import errors for ddtrace auto lines.
peterg17 Dec 10, 2024
d7b6417
remove duplicate arg to pytorch constructor.
peterg17 Dec 10, 2024
d6c1307
make debug logging less verbose.
peterg17 Dec 10, 2024
4e8c419
better log events
danielsn Dec 10, 2024
7ed98c0
track cputime
danielsn Dec 10, 2024
62bcd71
use new pytorch function trace_start_ns()
peterg17 Dec 10, 2024
819936d
use device memory usage
danielsn Dec 10, 2024
7e048ed
Merge branch 'main' into peterg17/pytorch_profiling_integration2
danielsn Dec 10, 2024
fa90dab
Update .github/workflows/pytorch_gpu_tests.yml
danielsn Dec 11, 2024
6cb0f81
Update .github/workflows/pytorch_gpu_tests.yml
danielsn Dec 11, 2024
a6bcff5
Update .github/workflows/pytorch_gpu_tests.yml
danielsn Dec 11, 2024
374b0cd
Merge branch 'main' into peterg17/pytorch_profiling_integration2
danielsn Dec 11, 2024
18069f0
ci(profiling): use Github Actions GPU runner for pytorch workflow.
peterg17 Dec 11, 2024
3c3f4ec
ci(profiling): handle pytorch profiling start timestamp.
peterg17 Dec 11, 2024
b00adeb
ci(profiling): adjust pytorch test program to use ddtrace-run.
peterg17 Dec 11, 2024
5a457c4
ci(profiling): add env vars to trigger pytorch integration test.
peterg17 Dec 11, 2024
30d9755
debug pytorch CI.
peterg17 Dec 11, 2024
471c6f1
more CI debugging for pytorch test.
peterg17 Dec 11, 2024
5dd28de
fix(profiling): handle gpu memory across different pytorch versions.
peterg17 Dec 11, 2024
6a00184
ci(profiling): enable ddtrace debug logging in pytorch test.
peterg17 Dec 11, 2024
992f4d7
ci(profiling): enable file output for pprof in pytorch test
peterg17 Dec 11, 2024
da54e20
Merge branch 'main' into peterg17/pytorch_profiling_integration2
danielsn Dec 11, 2024
ffc8f12
cleanup the collection code, and make it randomly sample if too many …
Dec 11, 2024
90f0731
clean up time calculation code for pytorch.
peterg17 Dec 11, 2024
9e4ebd9
use time_elapsed variable.
peterg17 Dec 11, 2024
1cbda1d
fix(profiling): insert static file name to avoid sample being dropped.
peterg17 Dec 12, 2024
f421a79
Make timeline and flamegraphs work
Dec 12, 2024
1f2cc36
fix(profiling): comment out security linting violation, not applicable.
peterg17 Dec 13, 2024
ed6acd0
debug pytorch pprof output.
peterg17 Dec 13, 2024
c312099
ci(profiling): debug pytorch pprof output job.
peterg17 Dec 13, 2024
d5f94d6
ci(profiling): debug pprof output.
peterg17 Dec 13, 2024
124bda1
ci(profiling): debug pprof output filename.
peterg17 Dec 13, 2024
2ce6f8d
handle pprof output in test for different cases.
peterg17 Dec 13, 2024
39d22ae
more pytorch CI debugging.
peterg17 Dec 13, 2024
cc3e89d
use better pprof parsing function for test.
peterg17 Dec 13, 2024
0bf0597
add lz4 dependency.
peterg17 Dec 13, 2024
41fdd83
ci(profiling): fix pytorch test pprof file prefix.
peterg17 Dec 13, 2024
3299218
ci(profiling): refactor pytorch profiling tests into their own file.
peterg17 Dec 13, 2024
a77e167
ci(profiling): fix gpu time sample test.
peterg17 Dec 13, 2024
9e36686
better names for the pseudoframes and CUDA lane
Dec 13, 2024
4df82d8
ci(profiling): fix pytorch gpu test.
peterg17 Dec 13, 2024
c69660a
ci(profiling): remove thread id/name from pytorch test.
peterg17 Dec 13, 2024
25339fe
add push_absolute_ns
Dec 13, 2024
3d23dfc
reorder frames
Dec 13, 2024
1af4ce2
ci(profiling): move pytorch test to fix profiling CI, rewords docs.
peterg17 Dec 13, 2024
a4492b7
Merge branch 'main' into peterg17/pytorch_profiling_integration2
danielsn Dec 13, 2024
9393216
Trigger Build
Dec 13, 2024
40 changes: 40 additions & 0 deletions .github/workflows/pytorch_gpu_tests.yml
@@ -0,0 +1,40 @@
name: Pytorch Unit Tests (with GPU)

on:
pull_request:
branches:
- 'main'
paths:
- 'ddtrace/profiling/collector/pytorch.py'
workflow_dispatch:

jobs:
unit-tests:
strategy:
matrix:
os: [ubuntu-latest]
arch: [x86_64]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
# Include all history and tags
with:
fetch-depth: 0

- uses: actions/setup-python@v5
name: Install Python
with:
python-version: '3.12'

- uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Install latest stable toolchain and rustfmt
run: rustup update stable && rustup default stable && rustup component add rustfmt clippy

- name: Install hatch
run: pip install hatch

- name: Install PyTorch
run: pip install torch

- name: Run tests
run: hatch run profiling_pytorch:test
@@ -44,6 +44,9 @@ extern "C"
void ddup_push_release(Datadog::Sample* sample, int64_t release_time, int64_t count);
void ddup_push_alloc(Datadog::Sample* sample, int64_t size, int64_t count);
void ddup_push_heap(Datadog::Sample* sample, int64_t size);
void ddup_push_gpu_gputime(Datadog::Sample* sample, int64_t time, int64_t count);
void ddup_push_gpu_memory(Datadog::Sample* sample, int64_t mem, int64_t count);
void ddup_push_gpu_flops(Datadog::Sample* sample, int64_t flops, int64_t count);
void ddup_push_lock_name(Datadog::Sample* sample, std::string_view lock_name);
void ddup_push_threadinfo(Datadog::Sample* sample,
int64_t thread_id,
@@ -56,6 +59,7 @@ extern "C"
void ddup_push_trace_type(Datadog::Sample* sample, std::string_view trace_type);
void ddup_push_exceptioninfo(Datadog::Sample* sample, std::string_view exception_type, int64_t count);
void ddup_push_class_name(Datadog::Sample* sample, std::string_view class_name);
void ddup_push_gpu_device_name(Datadog::Sample*, std::string_view device_name);
void ddup_push_frame(Datadog::Sample* sample,
std::string_view _name,
std::string_view _filename,
@@ -45,7 +45,8 @@ namespace Datadog {
X(local_root_span_id, "local root span id") \
X(trace_type, "trace type") \
X(class_name, "class name") \
X(lock_name, "lock name")
X(lock_name, "lock name") \
X(gpu_device_name, "gpu device name")

#define X_ENUM(a, b) a,
#define X_STR(a, b) b,
@@ -100,6 +100,9 @@ class Sample
bool push_release(int64_t lock_time, int64_t count);
bool push_alloc(int64_t size, int64_t count);
bool push_heap(int64_t size);
bool push_gpu_gputime(int64_t time, int64_t count);
bool push_gpu_memory(int64_t size, int64_t count);
bool push_gpu_flops(int64_t flops, int64_t count);

// Adds metadata to sample
bool push_lock_name(std::string_view lock_name);
@@ -117,6 +120,9 @@
bool is_timeline_enabled() const;
static void set_timeline(bool enabled);

// Pytorch GPU metadata
bool push_gpu_device_name(std::string_view device_name);

// Assumes frames are pushed in leaf-order
void push_frame(std::string_view name, // for ddog_prof_Function
std::string_view filename, // for ddog_prof_Function
@@ -11,7 +11,10 @@ enum SampleType : unsigned int
LockRelease = 1 << 4,
Allocation = 1 << 5,
Heap = 1 << 6,
All = CPU | Wall | Exception | LockAcquire | LockRelease | Allocation | Heap
GPUTime = 1 << 7,
GPUMemory = 1 << 8,
GPUFlops = 1 << 9,
All = CPU | Wall | Exception | LockAcquire | LockRelease | Allocation | Heap | GPUTime | GPUMemory | GPUFlops
};

// Every Sample object has a corresponding `values` vector, since libdatadog expects contiguous values per sample.
@@ -30,6 +33,12 @@ struct ValueIndex
unsigned short alloc_space;
unsigned short alloc_count;
unsigned short heap_space;
unsigned short gpu_time;
unsigned short gpu_count;
unsigned short gpu_alloc_space;
unsigned short gpu_alloc_count;
unsigned short gpu_flops;
unsigned short gpu_flops_samples; // Should be "count," but flops is already a count
};

} // namespace Datadog
@@ -193,6 +193,24 @@ ddup_push_heap(Datadog::Sample* sample, int64_t size) // cppcheck-suppress unuse
sample->push_heap(size);
}

void
ddup_push_gpu_gputime(Datadog::Sample* sample, int64_t time, int64_t count) // cppcheck-suppress unusedFunction
{
sample->push_gpu_gputime(time, count);
}

void
ddup_push_gpu_memory(Datadog::Sample* sample, int64_t size, int64_t count) // cppcheck-suppress unusedFunction
{
sample->push_gpu_memory(size, count);
}

void
ddup_push_gpu_flops(Datadog::Sample* sample, int64_t flops, int64_t count) // cppcheck-suppress unusedFunction
{
sample->push_gpu_flops(flops, count);
}

void
ddup_push_lock_name(Datadog::Sample* sample, std::string_view lock_name) // cppcheck-suppress unusedFunction
{
@@ -252,6 +270,12 @@ ddup_push_class_name(Datadog::Sample* sample, std::string_view class_name) // cp
sample->push_class_name(class_name);
}

void
ddup_push_gpu_device_name(Datadog::Sample* sample, std::string_view gpu_device_name) // cppcheck-suppress unusedFunction
{
sample->push_gpu_device_name(gpu_device_name);
}

void
ddup_push_frame(Datadog::Sample* sample, // cppcheck-suppress unusedFunction
std::string_view _name,
17 changes: 17 additions & 0 deletions ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp
@@ -89,6 +89,23 @@ Datadog::Profile::setup_samplers()
if (0U != (type_mask & SampleType::Heap)) {
val_idx.heap_space = get_value_idx("heap-space", "bytes");
}
if (0U != (type_mask & SampleType::GPUTime)) {
val_idx.gpu_time = get_value_idx("gpu-time", "nanoseconds");
val_idx.gpu_count = get_value_idx("gpu-samples", "count");
}
if (0U != (type_mask & SampleType::GPUMemory)) {
// In the backend the unit is called 'gpu-space', but maybe for consistency
// it should be gpu-alloc-space
// gpu-alloc-samples may be unused, but it's passed along for scaling purposes
val_idx.gpu_alloc_space = get_value_idx("gpu-space", "bytes");
val_idx.gpu_alloc_count = get_value_idx("gpu-alloc-samples", "count");
}
if (0U != (type_mask & SampleType::GPUFlops)) {
// Technically "FLOPS" is a unit, but we call it a 'count' because no
// other profiler uses it as a unit.
val_idx.gpu_flops = get_value_idx("gpu-flops", "count");
val_idx.gpu_flops_samples = get_value_idx("gpu-flops-samples", "count");
}

// Whatever the first sampler happens to be is the default "period" for the profile
// The value of 1 is a pointless default.
46 changes: 46 additions & 0 deletions ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
@@ -262,6 +262,42 @@ Datadog::Sample::push_heap(int64_t size)
return false;
}

bool
Datadog::Sample::push_gpu_gputime(int64_t time, int64_t count)
{
if (0U != (type_mask & SampleType::GPUTime)) {
values[profile_state.val().gpu_time] += time * count;
values[profile_state.val().gpu_count] += count;
return true;
}
std::cout << "bad push gpu" << std::endl;
return false;
}

bool
Datadog::Sample::push_gpu_memory(int64_t size, int64_t count)
{
if (0U != (type_mask & SampleType::GPUMemory)) {
values[profile_state.val().gpu_alloc_space] += size * count;
values[profile_state.val().gpu_alloc_count] += count;
return true;
}
std::cout << "bad push gpu memory" << std::endl;
return false;
}

bool
Datadog::Sample::push_gpu_flops(int64_t size, int64_t count)
{
if (0U != (type_mask & SampleType::GPUFlops)) {
values[profile_state.val().gpu_flops] += size * count;
values[profile_state.val().gpu_flops_samples] += count;
return true;
}
std::cout << "bad push gpu flops" << std::endl;
return false;
}

bool
Datadog::Sample::push_lock_name(std::string_view lock_name)
{
@@ -351,6 +387,16 @@ Datadog::Sample::push_class_name(std::string_view class_name)
return true;
}

bool
Datadog::Sample::push_gpu_device_name(std::string_view device_name)
{
if (!push_label(ExportLabelKey::gpu_device_name, device_name)) {
std::cout << "bad push" << std::endl;
return false;
}
return true;
}

bool
Datadog::Sample::push_monotonic_ns(int64_t _monotonic_ns)
{
24 changes: 14 additions & 10 deletions ddtrace/internal/datadog/profiling/ddup/_ddup.pyi
@@ -19,19 +19,23 @@ def start() -> None: ...
def upload() -> None: ...

class SampleHandle:
def push_cputime(self, value: int, count: int) -> None: ...
def push_walltime(self, value: int, count: int) -> None: ...
def flush_sample(self) -> None: ...
def push_acquire(self, value: int, count: int) -> None: ...
def push_release(self, value: int, count: int) -> None: ...
def push_alloc(self, value: int, count: int) -> None: ...
def push_class_name(self, class_name: StringType) -> None: ...
def push_cputime(self, value: int, count: int) -> None: ...
def push_exceptioninfo(self, exc_type: Union[None, bytes, str, type], count: int) -> None: ...
def push_frame(self, name: StringType, filename: StringType, address: int, line: int) -> None: ...
def push_gpu_device_name(self, device_name: StringType) -> None: ...
def push_gpu_flops(self, value: int, count: int) -> None: ...
def push_gpu_gputime(self, value: int, count: int) -> None: ...
def push_gpu_memory(self, value: int, count: int) -> None: ...
def push_heap(self, value: int) -> None: ...
def push_lock_name(self, lock_name: StringType) -> None: ...
def push_frame(self, name: StringType, filename: StringType, address: int, line: int) -> None: ...
def push_threadinfo(self, thread_id: int, thread_native_id: int, thread_name: StringType) -> None: ...
def push_monotonic_ns(self, monotonic_ns: int) -> None: ...
def push_release(self, value: int, count: int) -> None: ...
def push_span(self, span: Optional[Span]) -> None: ...
def push_task_id(self, task_id: Optional[int]) -> None: ...
def push_task_name(self, task_name: StringType) -> None: ...
def push_exceptioninfo(self, exc_type: Union[None, bytes, str, type], count: int) -> None: ...
def push_class_name(self, class_name: StringType) -> None: ...
def push_span(self, span: Optional[Span]) -> None: ...
def push_monotonic_ns(self, monotonic_ns: int) -> None: ...
def flush_sample(self) -> None: ...
def push_threadinfo(self, thread_id: int, thread_native_id: int, thread_name: StringType) -> None: ...
def push_walltime(self, value: int, count: int) -> None: ...
32 changes: 32 additions & 0 deletions ddtrace/internal/datadog/profiling/ddup/_ddup.pyx
@@ -67,6 +67,9 @@ cdef extern from "ddup_interface.hpp":
void ddup_push_release(Sample *sample, int64_t release_time, int64_t count)
void ddup_push_alloc(Sample *sample, int64_t size, int64_t count)
void ddup_push_heap(Sample *sample, int64_t size)
void ddup_push_gpu_gputime(Sample *sample, int64_t gputime, int64_t count)
void ddup_push_gpu_memory(Sample *sample, int64_t size, int64_t count)
void ddup_push_gpu_flops(Sample *sample, int64_t flops, int64_t count)
void ddup_push_lock_name(Sample *sample, string_view lock_name)
void ddup_push_threadinfo(Sample *sample, int64_t thread_id, int64_t thread_native_id, string_view thread_name)
void ddup_push_task_id(Sample *sample, int64_t task_id)
@@ -76,6 +79,7 @@ cdef extern from "ddup_interface.hpp":
void ddup_push_trace_type(Sample *sample, string_view trace_type)
void ddup_push_exceptioninfo(Sample *sample, string_view exception_type, int64_t count)
void ddup_push_class_name(Sample *sample, string_view class_name)
void ddup_push_gpu_device_name(Sample *sample, string_view device_name)
void ddup_push_frame(Sample *sample, string_view _name, string_view _filename, uint64_t address, int64_t line)
void ddup_push_monotonic_ns(Sample *sample, int64_t monotonic_ns)
void ddup_flush_sample(Sample *sample)
@@ -301,6 +305,18 @@ cdef call_ddup_push_class_name(Sample* sample, class_name: StringType):
if utf8_data != NULL:
ddup_push_class_name(sample, string_view(utf8_data, utf8_size))

cdef call_ddup_push_gpu_device_name(Sample* sample, device_name: StringType):
if not device_name:
return
if isinstance(device_name, bytes):
ddup_push_gpu_device_name(sample, string_view(<const char*>device_name, len(device_name)))
return
cdef const char* utf8_data
cdef Py_ssize_t utf8_size
utf8_data = PyUnicode_AsUTF8AndSize(device_name, &utf8_size)
if utf8_data != NULL:
ddup_push_gpu_device_name(sample, string_view(utf8_data, utf8_size))

cdef call_ddup_push_trace_type(Sample* sample, trace_type: StringType):
if not trace_type:
return
@@ -447,6 +463,18 @@ cdef class SampleHandle:
if self.ptr is not NULL:
ddup_push_heap(self.ptr, clamp_to_int64_unsigned(value))

def push_gpu_gputime(self, value: int, count: int) -> None:
if self.ptr is not NULL:
ddup_push_gpu_gputime(self.ptr, clamp_to_int64_unsigned(value), clamp_to_int64_unsigned(count))

def push_gpu_memory(self, value: int, count: int) -> None:
if self.ptr is not NULL:
ddup_push_gpu_memory(self.ptr, clamp_to_int64_unsigned(value), clamp_to_int64_unsigned(count))

def push_gpu_flops(self, value: int, count: int) -> None:
if self.ptr is not NULL:
ddup_push_gpu_flops(self.ptr, clamp_to_int64_unsigned(value), clamp_to_int64_unsigned(count))

def push_lock_name(self, lock_name: StringType) -> None:
if self.ptr is not NULL:
call_ddup_push_lock_name(self.ptr, lock_name)
@@ -493,6 +521,10 @@ cdef class SampleHandle:
if self.ptr is not NULL:
call_ddup_push_class_name(self.ptr, class_name)

def push_gpu_device_name(self, device_name: StringType) -> None:
if self.ptr is not NULL:
call_ddup_push_gpu_device_name(self.ptr, device_name)

def push_span(self, span: Optional[Span]) -> None:
if self.ptr is NULL:
return