Skip to content

Commit a645904

Browse files
benbarsdell authored and nluehr committed
Change nvtx_archive to download from GitHub repo
- The NVTX headers are now on GitHub so we no longer need to download and unpack the .deb package.
1 parent 9e62e6e commit a645904

File tree

18 files changed

+414
-349
lines changed

18 files changed

+414
-349
lines changed

tensorflow/compiler/xla/service/gpu/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ cc_library(
586586
"//tensorflow/core:lib",
587587
"//tensorflow/core:lib_internal",
588588
"//tensorflow/core:stream_executor_no_cuda",
589+
"//tensorflow/core/profiler:nvtx_utils",
589590
"//tensorflow/core/profiler/lib:traceme",
590591
#"//tensorflow/core/profiler/lib:scoped_annotation",
591592
"//tensorflow/stream_executor",
@@ -605,7 +606,6 @@ cc_library(
605606
"@com_google_absl//absl/strings:str_format",
606607
"@com_google_absl//absl/types:optional",
607608
"@com_google_absl//absl/types:span",
608-
"@nvtx_archive//:nvtx",
609609
] + if_cuda_is_configured([
610610
"//tensorflow/stream_executor/cuda:cuda_stream",
611611
"//tensorflow/core/platform/default/build_config:cublas_plugin",

tensorflow/compiler/xla/service/gpu/convolution_thunk.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ limitations under the License.
2424
#include "tensorflow/compiler/xla/types.h"
2525
#include "tensorflow/compiler/xla/util.h"
2626
#include "tensorflow/core/platform/logging.h"
27-
#include "tensorflow/core/platform/nvtx.h"
2827
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
28+
#include "tensorflow/core/profiler/nvtx_utils.h"
2929

3030
namespace xla {
3131
namespace gpu {
@@ -58,12 +58,14 @@ Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) {
5858

5959
auto op_profiler =
6060
params.profiler->MakeScopedInstructionProfiler(hlo_instruction());
61-
auto nvtx_range = tensorflow::nvtx::MaybeNvtxRangeStart(
62-
hlo_instruction()->NvtxNodeOpString(),
63-
hlo_instruction()->NvtxNodeNameString());
61+
tensorflow::nvtx::ScopedRangeIfEnabled<tensorflow::nvtx::CoreDomain>
62+
nvtx_range(cudnn_call_->metadata().op_type(), [&]() {
63+
return tensorflow::nvtx::GetThunkExecutionRangeMessage(
64+
cudnn_call_->GetModule()->name(),
65+
cudnn_call_->metadata().op_name());
66+
});
6467
TF_RETURN_IF_ERROR(RunGpuConv(cudnn_call_, absl::MakeSpan(operand_se_buffers),
6568
result_buffer, scratch, params.stream));
66-
tensorflow::nvtx::MaybeNvtxRangeEnd(nvtx_range);
6769

6870
// Write the output tuple.
6971
const int kNumOutputs = 2;

tensorflow/compiler/xla/service/gpu/gemm_thunk.cc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ limitations under the License.
2626
#include "tensorflow/compiler/xla/util.h"
2727
#include "tensorflow/core/platform/logging.h"
2828
#include "tensorflow/core/platform/mutex.h"
29-
#include "tensorflow/core/platform/nvtx.h"
3029
#include "tensorflow/core/platform/types.h"
30+
#include "tensorflow/core/profiler/nvtx_utils.h"
3131
#include "tensorflow/stream_executor/blas.h"
3232
#include "tensorflow/stream_executor/device_memory.h"
3333

@@ -265,8 +265,11 @@ Status RunGemm(const HloInstruction *gemm,
265265
complex128 alpha = {backend_config.alpha_real(), backend_config.alpha_imag()};
266266
double beta = backend_config.beta();
267267

268-
auto nvtx_range = tensorflow::nvtx::MaybeNvtxRangeStart(
269-
gemm->NvtxNodeOpString(), gemm->NvtxNodeNameString());
268+
tensorflow::nvtx::ScopedRangeIfEnabled<tensorflow::nvtx::CoreDomain>
269+
nvtx_range(gemm->metadata().op_type(), [&]() {
270+
return tensorflow::nvtx::GetThunkExecutionRangeMessage(
271+
gemm->GetModule()->name(), gemm->metadata().op_name());
272+
});
270273

271274
bool launch_ok = [&]() {
272275
switch (output_shape.element_type()) {
@@ -303,8 +306,6 @@ Status RunGemm(const HloInstruction *gemm,
303306
}
304307
}();
305308

306-
tensorflow::nvtx::MaybeNvtxRangeEnd(nvtx_range);
307-
308309
if (!launch_ok) {
309310
return InternalError("Unable to launch cuBLAS gemm on stream %p", stream);
310311
}

tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ limitations under the License.
1919
#include "tensorflow/compiler/xla/layout_util.h"
2020
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
2121
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
22+
#include "tensorflow/compiler/xla/service/hlo_module.h"
2223
#include "tensorflow/compiler/xla/shape_util.h"
2324
#include "tensorflow/compiler/xla/status_macros.h"
2425
#include "tensorflow/compiler/xla/util.h"

tensorflow/compiler/xla/service/hlo_instruction.cc

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2334,14 +2334,6 @@ string PrintName(const string& name, bool print_ids) {
23342334
}
23352335
}
23362336

2337-
string HloInstruction::NvtxNodeOpString() const { return metadata().op_type(); }
2338-
2339-
string HloInstruction::NvtxNodeNameString() const {
2340-
string cluster_name = GetModule()->name();
2341-
cluster_name = cluster_name.substr(0, cluster_name.find("__XlaCompile"));
2342-
return cluster_name + "_1/xla_run/" + metadata().op_name();
2343-
}
2344-
23452337
namespace {
23462338

23472339
using DFSStack = absl::InlinedVector<std::pair<int, HloInstruction*>, 16>;

tensorflow/compiler/xla/service/hlo_instruction.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,13 +1285,6 @@ class HloInstruction {
12851285
// function, e.g. the signature of an F32 add is (F32, F32) -> F32.
12861286
string SignatureString() const;
12871287

1288-
// Returns a string that is the node op for the node associated with this hlo
1289-
string NvtxNodeOpString() const;
1290-
1291-
// Returns a string that is the node name for the node associated with this
1292-
// hlo
1293-
string NvtxNodeNameString() const;
1294-
12951288
// Returns a debugging string that represents this instruction.
12961289
//
12971290
// (We express the default options using an overload rather than a default

tensorflow/core/BUILD

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2937,9 +2937,10 @@ tf_cuda_library(
29372937
"@com_google_absl//absl/strings",
29382938
"@com_google_absl//absl/time",
29392939
"//tensorflow/core/platform/default/build_config:platformlib",
2940-
"//tensorflow/core:framework/bfloat16",
2941-
"//tensorflow/core:framework/numeric_types",
2940+
"//tensorflow/core:framework/bfloat16",
2941+
"//tensorflow/core:framework/numeric_types",
29422942
"//tensorflow/core/kernels:bounds_check",
2943+
"//tensorflow/core/profiler:nvtx_utils",
29432944
"//tensorflow/core/profiler/lib:traceme",
29442945
"//third_party/eigen3",
29452946
] + if_static(
@@ -3286,8 +3287,8 @@ tf_cuda_library(
32863287
"@com_google_absl//absl/types:optional",
32873288
"//third_party/eigen3",
32883289
"//tensorflow/core/grappler/utils:functions",
3290+
"//tensorflow/core/profiler:nvtx_utils",
32893291
"//tensorflow/core/profiler/lib:traceme",
3290-
"@nvtx_archive//:nvtx",
32913292
"//tensorflow/core/profiler/internal:traceme_recorder",
32923293
] + mkl_deps(),
32933294
alwayslink = 1,

tensorflow/core/common_runtime/eager/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,8 @@ tf_cuda_library(
202202
"//tensorflow/core:lib_internal",
203203
"//tensorflow/core:protos_all_cc",
204204
"//tensorflow/core/profiler/lib:traceme",
205+
"//tensorflow/core/profiler:nvtx_utils",
205206
"//tensorflow/core/grappler/optimizers:meta_optimizer",
206-
"@nvtx_archive//:nvtx",
207207
],
208208
}),
209209
)

tensorflow/core/common_runtime/eager/kernel_and_device.cc

Lines changed: 9 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,13 @@ limitations under the License.
3939
#include "tensorflow/core/platform/mutex.h"
4040
#include "tensorflow/core/platform/tracing.h"
4141
#include "tensorflow/core/profiler/lib/traceme.h"
42+
#include "tensorflow/core/profiler/nvtx_utils.h"
4243
#include "tensorflow/core/public/version.h"
4344
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
4445
#if !defined(IS_MOBILE_PLATFORM)
4546
#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
4647
#endif // !IS_MOBILE_PLATFORM
4748

48-
#if GOOGLE_CUDA
49-
#include "tensorflow/core/platform/nvtx.h"
50-
#endif // GOOGLE_CUDA
51-
5249
namespace tensorflow {
5350

5451
std::function<void(std::function<void()>)>* KernelAndDevice::get_runner()
@@ -308,49 +305,14 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
308305

309306
OpKernelContext context(&params);
310307

311-
#if GOOGLE_CUDA
312-
string msg;
313-
if (nvtx::NvtxRangesEnabled() || nvtx::NvtxRangesDetailedEnabled()) {
314-
if (nvtx::NvtxRangesDetailedEnabled()) {
315-
std::vector<string> args_pieces;
316-
for (int i = 0; i < inputs.size(); i++) {
317-
if (i == 10) {
318-
// Truncate long arg lists and indicate with an ending null value.
319-
args_pieces.push_back("null");
320-
break;
321-
}
322-
const auto& shape = inputs[i].tensor->shape();
323-
string shape_str = shape.unknown_rank() ? "null" : shape.DebugString();
324-
args_pieces.push_back(
325-
strings::StrCat("{\"name\":\"", kernel_->def().input(i),
326-
"\",\"shape\":", shape_str, "}"));
327-
}
328-
std::vector<string> attrs_pieces;
329-
const auto& attrs = kernel_->def().attr();
330-
for (auto it = attrs.begin(); it != attrs.end(); ++it) {
331-
const string& key = it->first;
332-
const AttrValue& value = it->second;
333-
// Exclude types that aren't useful for profiling.
334-
if (value.value_case() == AttrValue::kFunc ||
335-
value.value_case() == AttrValue::kPlaceholder ||
336-
value.value_case() == AttrValue::VALUE_NOT_SET) {
337-
continue;
338-
}
339-
string value_str = nvtx::AttrValueToJson(value);
340-
attrs_pieces.push_back(strings::StrCat("\"", key, "\":", value_str));
341-
}
342-
msg = strings::StrCat("{\"op\":\"", kernel_->def().op(), "\",\"name\":\"",
343-
kernel_->name(), "\",\"args\":[",
344-
str_util::Join(args_pieces, ","), "],\"attrs\":{",
345-
str_util::Join(attrs_pieces, ","), "}}");
346-
} else {
347-
msg = kernel_->def().op() + ": " + kernel_->name();
348-
}
349-
}
350-
auto nvtx_range = nvtx::MaybeNvtxDomainRangeStartMsg(msg,
351-
kernel_->def().op());
352-
#endif // GOOGLE_CUDA
353-
308+
nvtx::ScopedRangeIfEnabled<nvtx::CoreDomain> nvtx_range(
309+
kernel_->def().op(), [&]() {
310+
return nvtx::GetNodeExecutionRangeMessage(
311+
kernel_.get(), inputs.size(), inputs,
312+
[](const TensorValue& tensor_value) {
313+
return tensor_value.tensor;
314+
});
315+
});
354316
if (kernel_->def().op() == "_Recv") {
355317
// TODO(apassos) do not special-case _Recv. Currently the GPU device fails
356318
// if trying to run _Recv->Compute(), specifically checking for _Recv. To go
@@ -397,10 +359,6 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
397359
UpdateStats(&context, step_stats_collector.get(), stats);
398360
}
399361

400-
#if GOOGLE_CUDA
401-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
402-
#endif // GOOGLE_CUDA
403-
404362
return Status::OK();
405363
}
406364

tensorflow/core/common_runtime/executor.cc

Lines changed: 9 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ limitations under the License.
2525
#include "absl/memory/memory.h"
2626
#include "absl/strings/string_view.h"
2727

28-
#if GOOGLE_CUDA
29-
#include "tensorflow/core/platform/nvtx.h"
30-
#endif // GOOGLE_CUDA
31-
3228
#include "tensorflow/core/common_runtime/costmodel_manager.h"
3329
#include "tensorflow/core/common_runtime/executor_factory.h"
3430
#include "tensorflow/core/common_runtime/pending_counts.h"
@@ -75,8 +71,8 @@ limitations under the License.
7571
#include "tensorflow/core/platform/types.h"
7672
#include "tensorflow/core/profiler/internal/traceme_recorder.h"
7773
#include "tensorflow/core/profiler/lib/traceme.h"
74+
#include "tensorflow/core/profiler/nvtx_utils.h"
7875
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
79-
#include "tensorflow/core/util/env_var.h"
8076

8177
namespace tensorflow {
8278
namespace {
@@ -1729,48 +1725,14 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
17291725
Entry* first_input = input_tensors + item.input_start;
17301726
outputs.clear();
17311727

1732-
#if GOOGLE_CUDA
1733-
string msg;
1734-
if (nvtx::NvtxRangesEnabled() || nvtx::NvtxRangesDetailedEnabled()) {
1735-
if (nvtx::NvtxRangesDetailedEnabled()) {
1736-
std::vector<string> args_pieces;
1737-
for (int i = 0; i < item.num_inputs; ++i) {
1738-
if (i == 10) {
1739-
// Truncate long arg lists and indicate with an ending null value.
1740-
args_pieces.push_back("null");
1741-
break;
1742-
}
1743-
const auto& shape = GetTensorValueForDump(first_input[i])->shape();
1744-
string shape_str =
1745-
shape.unknown_rank() ? "null" : shape.DebugString();
1746-
args_pieces.push_back(
1747-
strings::StrCat("{\"name\":\"", node->def().input(i),
1748-
"\",\"shape\":", shape_str, "}"));
1749-
}
1750-
std::vector<string> attrs_pieces;
1751-
const auto& attrs = node->def().attr();
1752-
for (auto it = attrs.begin(); it != attrs.end(); ++it) {
1753-
const string& key = it->first;
1754-
const AttrValue& value = it->second;
1755-
// Exclude types that aren't useful for profiling.
1756-
if (value.value_case() == AttrValue::kFunc ||
1757-
value.value_case() == AttrValue::kPlaceholder ||
1758-
value.value_case() == AttrValue::VALUE_NOT_SET) {
1759-
continue;
1760-
}
1761-
string value_str = nvtx::AttrValueToJson(value);
1762-
attrs_pieces.push_back(strings::StrCat("\"", key, "\":", value_str));
1763-
}
1764-
msg = strings::StrCat("{\"op\":\"", node->def().op(), "\",\"name\":\"",
1765-
node->name(), "\",\"args\":[",
1766-
str_util::Join(args_pieces, ","), "],\"attrs\":{",
1767-
str_util::Join(attrs_pieces, ","), "}}");
1768-
} else {
1769-
msg = node->def().op() + ": " + node->name();
1770-
}
1771-
}
1772-
auto nvtx_range = nvtx::MaybeNvtxDomainRangeStartMsg(msg, node->def().op());
1773-
#endif // GOOGLE_CUDA
1728+
nvtx::ScopedRangeIfEnabled<nvtx::CoreDomain> nvtx_range(
1729+
item.kernel->def().op(), [&]() {
1730+
return nvtx::GetNodeExecutionRangeMessage(
1731+
item.kernel, item.num_inputs, first_input,
1732+
[this](const Entry& entry) {
1733+
return GetTensorValueForDump(entry);
1734+
});
1735+
});
17741736

17751737
TensorReferenceVector accessed_tensors;
17761738
DeviceContext* device_context = nullptr;
@@ -1794,9 +1756,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
17941756
MaybeMarkCompleted(input_frame, input_iter, id);
17951757
// Continue to process the nodes in 'inline_ready'.
17961758
completed = NodeDone(s, item.node, ready, stats, &inline_ready);
1797-
#if GOOGLE_CUDA
1798-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
1799-
#endif // GOOGLE_CUDA
18001759
continue;
18011760
}
18021761

@@ -1816,11 +1775,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
18161775
AsyncState* state =
18171776
new AsyncState(params, tagged_node, &item, first_input, stats);
18181777

1819-
#if GOOGLE_CUDA
1820-
auto done = [this, state, nvtx_range]() {
1821-
#else
18221778
auto done = [this, state]() {
1823-
#endif // GOOGLE_CUDA
18241779
Device* device = impl_->params_.device;
18251780
NodeExecStatsInterface* stats = state->stats; // Shorthand
18261781
Entry* first_input = state->first_input; // Shorthand
@@ -1862,9 +1817,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
18621817
}
18631818
const bool completed =
18641819
NodeDone(s, state->item->node, ready, stats, nullptr);
1865-
#if GOOGLE_CUDA
1866-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
1867-
#endif // GOOGLE_CUDA
18681820
delete state;
18691821
if (completed) ScheduleFinish();
18701822
};
@@ -1951,9 +1903,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
19511903
}
19521904
// Postprocess.
19531905
completed = NodeDone(s, item.node, ready, stats, &inline_ready);
1954-
#if GOOGLE_CUDA
1955-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
1956-
#endif // GOOGLE_CUDA
19571906
}
19581907
} // while !inline_ready.empty()
19591908

tensorflow/core/kernels/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3893,7 +3893,9 @@ tf_kernel_library(
38933893
tf_kernel_library(
38943894
name = "segment_reduction_ops",
38953895
prefix = "segment_reduction_ops",
3896-
deps = MATH_DEPS + if_cuda_or_rocm([
3896+
deps = [
3897+
"//tensorflow/core/profiler:nvtx_utils",
3898+
] + MATH_DEPS + if_cuda_or_rocm([
38973899
":cuda_solvers",
38983900
]),
38993901
)

tensorflow/core/kernels/segment_reduction_ops.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ limitations under the License.
3535
#include "tensorflow/core/framework/types.h"
3636
#include "tensorflow/core/lib/core/status.h"
3737
#include "tensorflow/core/platform/logging.h"
38+
#include "tensorflow/core/profiler/nvtx_utils.h"
3839
#include "tensorflow/core/util/util.h"
3940

4041
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -268,8 +269,10 @@ class SegmentSumGPUOp : public AsyncOpKernel {
268269
done);
269270

270271
functor::SegmentSumFunctor<T, Index> functor_;
271-
auto create_and_check_output = [context, output_rows_host, &input,
272+
auto create_and_check_output = [this, context, output_rows_host, &input,
272273
&segment_ids, &functor_, done]() {
274+
nvtx::ScopedRangeIfEnabled<nvtx::CoreDomain> nvtx_range(this);
275+
273276
// Ensure that within the callback, the proper GPU settings are
274277
// configured.
275278
auto stream = context->op_device_context()->stream();

0 commit comments

Comments (0)