Skip to content

Commit a645904

Browse files
benbarsdell authored and nluehr committed
Change nvtx_archive to download from GitHub repo
- The NVTX headers are now on GitHub so we no longer need to download and unpack the .deb package.
1 parent 9e62e6e commit a645904

File tree

18 files changed

+414
-349
lines changed

18 files changed

+414
-349
lines changed

tensorflow/compiler/xla/service/gpu/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ cc_library(
586586
"//tensorflow/core:lib",
587587
"//tensorflow/core:lib_internal",
588588
"//tensorflow/core:stream_executor_no_cuda",
589+
"//tensorflow/core/profiler:nvtx_utils",
589590
"//tensorflow/core/profiler/lib:traceme",
590591
#"//tensorflow/core/profiler/lib:scoped_annotation",
591592
"//tensorflow/stream_executor",
@@ -605,7 +606,6 @@ cc_library(
605606
"@com_google_absl//absl/strings:str_format",
606607
"@com_google_absl//absl/types:optional",
607608
"@com_google_absl//absl/types:span",
608-
"@nvtx_archive//:nvtx",
609609
] + if_cuda_is_configured([
610610
"//tensorflow/stream_executor/cuda:cuda_stream",
611611
"//tensorflow/core/platform/default/build_config:cublas_plugin",

tensorflow/compiler/xla/service/gpu/convolution_thunk.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ limitations under the License.
2424
#include "tensorflow/compiler/xla/types.h"
2525
#include "tensorflow/compiler/xla/util.h"
2626
#include "tensorflow/core/platform/logging.h"
27-
#include "tensorflow/core/platform/nvtx.h"
2827
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
28+
#include "tensorflow/core/profiler/nvtx_utils.h"
2929

3030
namespace xla {
3131
namespace gpu {
@@ -58,12 +58,14 @@ Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) {
5858

5959
auto op_profiler =
6060
params.profiler->MakeScopedInstructionProfiler(hlo_instruction());
61-
auto nvtx_range = tensorflow::nvtx::MaybeNvtxRangeStart(
62-
hlo_instruction()->NvtxNodeOpString(),
63-
hlo_instruction()->NvtxNodeNameString());
61+
tensorflow::nvtx::ScopedRangeIfEnabled<tensorflow::nvtx::CoreDomain>
62+
nvtx_range(cudnn_call_->metadata().op_type(), [&]() {
63+
return tensorflow::nvtx::GetThunkExecutionRangeMessage(
64+
cudnn_call_->GetModule()->name(),
65+
cudnn_call_->metadata().op_name());
66+
});
6467
TF_RETURN_IF_ERROR(RunGpuConv(cudnn_call_, absl::MakeSpan(operand_se_buffers),
6568
result_buffer, scratch, params.stream));
66-
tensorflow::nvtx::MaybeNvtxRangeEnd(nvtx_range);
6769

6870
// Write the output tuple.
6971
const int kNumOutputs = 2;

tensorflow/compiler/xla/service/gpu/gemm_thunk.cc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ limitations under the License.
2626
#include "tensorflow/compiler/xla/util.h"
2727
#include "tensorflow/core/platform/logging.h"
2828
#include "tensorflow/core/platform/mutex.h"
29-
#include "tensorflow/core/platform/nvtx.h"
3029
#include "tensorflow/core/platform/types.h"
30+
#include "tensorflow/core/profiler/nvtx_utils.h"
3131
#include "tensorflow/stream_executor/blas.h"
3232
#include "tensorflow/stream_executor/device_memory.h"
3333

@@ -265,8 +265,11 @@ Status RunGemm(const HloInstruction *gemm,
265265
complex128 alpha = {backend_config.alpha_real(), backend_config.alpha_imag()};
266266
double beta = backend_config.beta();
267267

268-
auto nvtx_range = tensorflow::nvtx::MaybeNvtxRangeStart(
269-
gemm->NvtxNodeOpString(), gemm->NvtxNodeNameString());
268+
tensorflow::nvtx::ScopedRangeIfEnabled<tensorflow::nvtx::CoreDomain>
269+
nvtx_range(gemm->metadata().op_type(), [&]() {
270+
return tensorflow::nvtx::GetThunkExecutionRangeMessage(
271+
gemm->GetModule()->name(), gemm->metadata().op_name());
272+
});
270273

271274
bool launch_ok = [&]() {
272275
switch (output_shape.element_type()) {
@@ -303,8 +306,6 @@ Status RunGemm(const HloInstruction *gemm,
303306
}
304307
}();
305308

306-
tensorflow::nvtx::MaybeNvtxRangeEnd(nvtx_range);
307-
308309
if (!launch_ok) {
309310
return InternalError("Unable to launch cuBLAS gemm on stream %p", stream);
310311
}

tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ limitations under the License.
1919
#include "tensorflow/compiler/xla/layout_util.h"
2020
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
2121
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
22+
#include "tensorflow/compiler/xla/service/hlo_module.h"
2223
#include "tensorflow/compiler/xla/shape_util.h"
2324
#include "tensorflow/compiler/xla/status_macros.h"
2425
#include "tensorflow/compiler/xla/util.h"

tensorflow/compiler/xla/service/hlo_instruction.cc

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2334,14 +2334,6 @@ string PrintName(const string& name, bool print_ids) {
23342334
}
23352335
}
23362336

2337-
string HloInstruction::NvtxNodeOpString() const { return metadata().op_type(); }
2338-
2339-
string HloInstruction::NvtxNodeNameString() const {
2340-
string cluster_name = GetModule()->name();
2341-
cluster_name = cluster_name.substr(0, cluster_name.find("__XlaCompile"));
2342-
return cluster_name + "_1/xla_run/" + metadata().op_name();
2343-
}
2344-
23452337
namespace {
23462338

23472339
using DFSStack = absl::InlinedVector<std::pair<int, HloInstruction*>, 16>;

tensorflow/compiler/xla/service/hlo_instruction.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,13 +1285,6 @@ class HloInstruction {
12851285
// function, e.g. the signature of an F32 add is (F32, F32) -> F32.
12861286
string SignatureString() const;
12871287

1288-
// Returns a string that is the node op for the node associated with this hlo
1289-
string NvtxNodeOpString() const;
1290-
1291-
// Returns a string that is the node name for the node associated with this
1292-
// hlo
1293-
string NvtxNodeNameString() const;
1294-
12951288
// Returns a debugging string that represents this instruction.
12961289
//
12971290
// (We express the default options using an overload rather than a default

tensorflow/core/BUILD

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2937,9 +2937,10 @@ tf_cuda_library(
29372937
"@com_google_absl//absl/strings",
29382938
"@com_google_absl//absl/time",
29392939
"//tensorflow/core/platform/default/build_config:platformlib",
2940-
"//tensorflow/core:framework/bfloat16",
2941-
"//tensorflow/core:framework/numeric_types",
2940+
"//tensorflow/core:framework/bfloat16",
2941+
"//tensorflow/core:framework/numeric_types",
29422942
"//tensorflow/core/kernels:bounds_check",
2943+
"//tensorflow/core/profiler:nvtx_utils",
29432944
"//tensorflow/core/profiler/lib:traceme",
29442945
"//third_party/eigen3",
29452946
] + if_static(
@@ -3286,8 +3287,8 @@ tf_cuda_library(
32863287
"@com_google_absl//absl/types:optional",
32873288
"//third_party/eigen3",
32883289
"//tensorflow/core/grappler/utils:functions",
3290+
"//tensorflow/core/profiler:nvtx_utils",
32893291
"//tensorflow/core/profiler/lib:traceme",
3290-
"@nvtx_archive//:nvtx",
32913292
"//tensorflow/core/profiler/internal:traceme_recorder",
32923293
] + mkl_deps(),
32933294
alwayslink = 1,

tensorflow/core/common_runtime/eager/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,8 @@ tf_cuda_library(
202202
"//tensorflow/core:lib_internal",
203203
"//tensorflow/core:protos_all_cc",
204204
"//tensorflow/core/profiler/lib:traceme",
205+
"//tensorflow/core/profiler:nvtx_utils",
205206
"//tensorflow/core/grappler/optimizers:meta_optimizer",
206-
"@nvtx_archive//:nvtx",
207207
],
208208
}),
209209
)

tensorflow/core/common_runtime/eager/kernel_and_device.cc

Lines changed: 9 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,13 @@ limitations under the License.
3939
#include "tensorflow/core/platform/mutex.h"
4040
#include "tensorflow/core/platform/tracing.h"
4141
#include "tensorflow/core/profiler/lib/traceme.h"
42+
#include "tensorflow/core/profiler/nvtx_utils.h"
4243
#include "tensorflow/core/public/version.h"
4344
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
4445
#if !defined(IS_MOBILE_PLATFORM)
4546
#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
4647
#endif // !IS_MOBILE_PLATFORM
4748

48-
#if GOOGLE_CUDA
49-
#include "tensorflow/core/platform/nvtx.h"
50-
#endif // GOOGLE_CUDA
51-
5249
namespace tensorflow {
5350

5451
std::function<void(std::function<void()>)>* KernelAndDevice::get_runner()
@@ -308,49 +305,14 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
308305

309306
OpKernelContext context(&params);
310307

311-
#if GOOGLE_CUDA
312-
string msg;
313-
if (nvtx::NvtxRangesEnabled() || nvtx::NvtxRangesDetailedEnabled()) {
314-
if (nvtx::NvtxRangesDetailedEnabled()) {
315-
std::vector<string> args_pieces;
316-
for (int i = 0; i < inputs.size(); i++) {
317-
if (i == 10) {
318-
// Truncate long arg lists and indicate with an ending null value.
319-
args_pieces.push_back("null");
320-
break;
321-
}
322-
const auto& shape = inputs[i].tensor->shape();
323-
string shape_str = shape.unknown_rank() ? "null" : shape.DebugString();
324-
args_pieces.push_back(
325-
strings::StrCat("{\"name\":\"", kernel_->def().input(i),
326-
"\",\"shape\":", shape_str, "}"));
327-
}
328-
std::vector<string> attrs_pieces;
329-
const auto& attrs = kernel_->def().attr();
330-
for (auto it = attrs.begin(); it != attrs.end(); ++it) {
331-
const string& key = it->first;
332-
const AttrValue& value = it->second;
333-
// Exclude types that aren't useful for profiling.
334-
if (value.value_case() == AttrValue::kFunc ||
335-
value.value_case() == AttrValue::kPlaceholder ||
336-
value.value_case() == AttrValue::VALUE_NOT_SET) {
337-
continue;
338-
}
339-
string value_str = nvtx::AttrValueToJson(value);
340-
attrs_pieces.push_back(strings::StrCat("\"", key, "\":", value_str));
341-
}
342-
msg = strings::StrCat("{\"op\":\"", kernel_->def().op(), "\",\"name\":\"",
343-
kernel_->name(), "\",\"args\":[",
344-
str_util::Join(args_pieces, ","), "],\"attrs\":{",
345-
str_util::Join(attrs_pieces, ","), "}}");
346-
} else {
347-
msg = kernel_->def().op() + ": " + kernel_->name();
348-
}
349-
}
350-
auto nvtx_range = nvtx::MaybeNvtxDomainRangeStartMsg(msg,
351-
kernel_->def().op());
352-
#endif // GOOGLE_CUDA
353-
308+
nvtx::ScopedRangeIfEnabled<nvtx::CoreDomain> nvtx_range(
309+
kernel_->def().op(), [&]() {
310+
return nvtx::GetNodeExecutionRangeMessage(
311+
kernel_.get(), inputs.size(), inputs,
312+
[](const TensorValue& tensor_value) {
313+
return tensor_value.tensor;
314+
});
315+
});
354316
if (kernel_->def().op() == "_Recv") {
355317
// TODO(apassos) do not special-case _Recv. Currently the GPU device fails
356318
// if trying to run _Recv->Compute(), specifically checking for _Recv. To go
@@ -397,10 +359,6 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
397359
UpdateStats(&context, step_stats_collector.get(), stats);
398360
}
399361

400-
#if GOOGLE_CUDA
401-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
402-
#endif // GOOGLE_CUDA
403-
404362
return Status::OK();
405363
}
406364

tensorflow/core/common_runtime/executor.cc

Lines changed: 9 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ limitations under the License.
2525
#include "absl/memory/memory.h"
2626
#include "absl/strings/string_view.h"
2727

28-
#if GOOGLE_CUDA
29-
#include "tensorflow/core/platform/nvtx.h"
30-
#endif // GOOGLE_CUDA
31-
3228
#include "tensorflow/core/common_runtime/costmodel_manager.h"
3329
#include "tensorflow/core/common_runtime/executor_factory.h"
3430
#include "tensorflow/core/common_runtime/pending_counts.h"
@@ -75,8 +71,8 @@ limitations under the License.
7571
#include "tensorflow/core/platform/types.h"
7672
#include "tensorflow/core/profiler/internal/traceme_recorder.h"
7773
#include "tensorflow/core/profiler/lib/traceme.h"
74+
#include "tensorflow/core/profiler/nvtx_utils.h"
7875
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
79-
#include "tensorflow/core/util/env_var.h"
8076

8177
namespace tensorflow {
8278
namespace {
@@ -1729,48 +1725,14 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
17291725
Entry* first_input = input_tensors + item.input_start;
17301726
outputs.clear();
17311727

1732-
#if GOOGLE_CUDA
1733-
string msg;
1734-
if (nvtx::NvtxRangesEnabled() || nvtx::NvtxRangesDetailedEnabled()) {
1735-
if (nvtx::NvtxRangesDetailedEnabled()) {
1736-
std::vector<string> args_pieces;
1737-
for (int i = 0; i < item.num_inputs; ++i) {
1738-
if (i == 10) {
1739-
// Truncate long arg lists and indicate with an ending null value.
1740-
args_pieces.push_back("null");
1741-
break;
1742-
}
1743-
const auto& shape = GetTensorValueForDump(first_input[i])->shape();
1744-
string shape_str =
1745-
shape.unknown_rank() ? "null" : shape.DebugString();
1746-
args_pieces.push_back(
1747-
strings::StrCat("{\"name\":\"", node->def().input(i),
1748-
"\",\"shape\":", shape_str, "}"));
1749-
}
1750-
std::vector<string> attrs_pieces;
1751-
const auto& attrs = node->def().attr();
1752-
for (auto it = attrs.begin(); it != attrs.end(); ++it) {
1753-
const string& key = it->first;
1754-
const AttrValue& value = it->second;
1755-
// Exclude types that aren't useful for profiling.
1756-
if (value.value_case() == AttrValue::kFunc ||
1757-
value.value_case() == AttrValue::kPlaceholder ||
1758-
value.value_case() == AttrValue::VALUE_NOT_SET) {
1759-
continue;
1760-
}
1761-
string value_str = nvtx::AttrValueToJson(value);
1762-
attrs_pieces.push_back(strings::StrCat("\"", key, "\":", value_str));
1763-
}
1764-
msg = strings::StrCat("{\"op\":\"", node->def().op(), "\",\"name\":\"",
1765-
node->name(), "\",\"args\":[",
1766-
str_util::Join(args_pieces, ","), "],\"attrs\":{",
1767-
str_util::Join(attrs_pieces, ","), "}}");
1768-
} else {
1769-
msg = node->def().op() + ": " + node->name();
1770-
}
1771-
}
1772-
auto nvtx_range = nvtx::MaybeNvtxDomainRangeStartMsg(msg, node->def().op());
1773-
#endif // GOOGLE_CUDA
1728+
nvtx::ScopedRangeIfEnabled<nvtx::CoreDomain> nvtx_range(
1729+
item.kernel->def().op(), [&]() {
1730+
return nvtx::GetNodeExecutionRangeMessage(
1731+
item.kernel, item.num_inputs, first_input,
1732+
[this](const Entry& entry) {
1733+
return GetTensorValueForDump(entry);
1734+
});
1735+
});
17741736

17751737
TensorReferenceVector accessed_tensors;
17761738
DeviceContext* device_context = nullptr;
@@ -1794,9 +1756,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
17941756
MaybeMarkCompleted(input_frame, input_iter, id);
17951757
// Continue to process the nodes in 'inline_ready'.
17961758
completed = NodeDone(s, item.node, ready, stats, &inline_ready);
1797-
#if GOOGLE_CUDA
1798-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
1799-
#endif // GOOGLE_CUDA
18001759
continue;
18011760
}
18021761

@@ -1816,11 +1775,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
18161775
AsyncState* state =
18171776
new AsyncState(params, tagged_node, &item, first_input, stats);
18181777

1819-
#if GOOGLE_CUDA
1820-
auto done = [this, state, nvtx_range]() {
1821-
#else
18221778
auto done = [this, state]() {
1823-
#endif // GOOGLE_CUDA
18241779
Device* device = impl_->params_.device;
18251780
NodeExecStatsInterface* stats = state->stats; // Shorthand
18261781
Entry* first_input = state->first_input; // Shorthand
@@ -1862,9 +1817,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
18621817
}
18631818
const bool completed =
18641819
NodeDone(s, state->item->node, ready, stats, nullptr);
1865-
#if GOOGLE_CUDA
1866-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
1867-
#endif // GOOGLE_CUDA
18681820
delete state;
18691821
if (completed) ScheduleFinish();
18701822
};
@@ -1951,9 +1903,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
19511903
}
19521904
// Postprocess.
19531905
completed = NodeDone(s, item.node, ready, stats, &inline_ready);
1954-
#if GOOGLE_CUDA
1955-
nvtx::MaybeNvtxDomainRangeEnd(nvtx_range);
1956-
#endif // GOOGLE_CUDA
19571906
}
19581907
} // while !inline_ready.empty()
19591908

tensorflow/core/kernels/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3893,7 +3893,9 @@ tf_kernel_library(
38933893
tf_kernel_library(
38943894
name = "segment_reduction_ops",
38953895
prefix = "segment_reduction_ops",
3896-
deps = MATH_DEPS + if_cuda_or_rocm([
3896+
deps = [
3897+
"//tensorflow/core/profiler:nvtx_utils",
3898+
] + MATH_DEPS + if_cuda_or_rocm([
38973899
":cuda_solvers",
38983900
]),
38993901
)

tensorflow/core/kernels/segment_reduction_ops.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ limitations under the License.
3535
#include "tensorflow/core/framework/types.h"
3636
#include "tensorflow/core/lib/core/status.h"
3737
#include "tensorflow/core/platform/logging.h"
38+
#include "tensorflow/core/profiler/nvtx_utils.h"
3839
#include "tensorflow/core/util/util.h"
3940

4041
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -268,8 +269,10 @@ class SegmentSumGPUOp : public AsyncOpKernel {
268269
done);
269270

270271
functor::SegmentSumFunctor<T, Index> functor_;
271-
auto create_and_check_output = [context, output_rows_host, &input,
272+
auto create_and_check_output = [this, context, output_rows_host, &input,
272273
&segment_ids, &functor_, done]() {
274+
nvtx::ScopedRangeIfEnabled<nvtx::CoreDomain> nvtx_range(this);
275+
273276
// Ensure that within the callback, the proper GPU settings are
274277
// configured.
275278
auto stream = context->op_device_context()->stream();

0 commit comments

Comments (0)