Skip to content

Commit

Permalink
[Serving] Support NVTX for benchmarking (#2043)
Browse files Browse the repository at this point in the history
This PR supports MLC serve with NVTX, which helps analyze benchmarking
results.

**Note.** To enable NVTX, please add `set(USE_NVTX ON)` to file
`build/config.cmake`.
  • Loading branch information
MasterJH5574 authored Mar 28, 2024
1 parent cf8d458 commit 4255a45
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 16 deletions.
11 changes: 9 additions & 2 deletions cpp/serve/engine_actions/action_commons.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "action_commons.h"

#include <tvm/runtime/nvtx.h>

namespace mlc {
namespace llm {
namespace serve {
Expand All @@ -19,6 +21,7 @@ void RemoveRequestFromModel(EngineState estate, int64_t req_internal_id, Array<M
void ProcessFinishedRequestStateEntries(std::vector<RequestStateEntry> finished_rsentries,
EngineState estate, Array<Model> models,
int max_single_sequence_length) {
NVTXScopedRange nvtx_scope("Process finished requests");
// - Remove the finished request state entries.
for (const RequestStateEntry& rsentry : finished_rsentries) {
// The finished entry must be a leaf.
Expand Down Expand Up @@ -83,6 +86,7 @@ void ActionStepPostProcess(Array<Request> requests, EngineState estate, Array<Mo
const Tokenizer& tokenizer,
FRequestStreamCallback request_stream_callback,
int max_single_sequence_length) {
NVTXScopedRange nvtx_scope("EngineAction postproc");
std::vector<RequestStateEntry> finished_rsentries;
finished_rsentries.reserve(requests.size());

Expand Down Expand Up @@ -128,8 +132,11 @@ void ActionStepPostProcess(Array<Request> requests, EngineState estate, Array<Mo
}
}

// - Invoke the stream callback function once for all collected requests.
request_stream_callback(callback_delta_outputs);
{
NVTXScopedRange nvtx_scope("Call request stream callback");
// - Invoke the stream callback function once for all collected requests.
request_stream_callback(callback_delta_outputs);
}

ProcessFinishedRequestStateEntries(std::move(finished_rsentries), std::move(estate),
std::move(models), max_single_sequence_length);
Expand Down
18 changes: 12 additions & 6 deletions cpp/serve/engine_actions/batch_decode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* \file serve/engine_actions/batch_decode.cc
*/

#include <tvm/runtime/nvtx.h>

#include <numeric>

#include "../../random.h"
Expand Down Expand Up @@ -40,12 +42,16 @@ class BatchDecodeActionObj : public EngineActionObj {
}

// Preempt request state entries when decode cannot apply.
std::vector<RequestStateEntry> running_rsentries = GetRunningRequestStateEntries(estate);
while (!CanDecode(running_rsentries.size())) {
RequestStateEntry preempted =
PreemptLastRunningRequestStateEntry(estate, models_, trace_recorder_);
if (preempted.same_as(running_rsentries.back())) {
running_rsentries.pop_back();
std::vector<RequestStateEntry> running_rsentries;
{
NVTXScopedRange nvtx_scope("BatchDecode getting requests");
running_rsentries = GetRunningRequestStateEntries(estate);
while (!CanDecode(running_rsentries.size())) {
RequestStateEntry preempted =
PreemptLastRunningRequestStateEntry(estate, models_, trace_recorder_);
if (preempted.same_as(running_rsentries.back())) {
running_rsentries.pop_back();
}
}
}

Expand Down
17 changes: 13 additions & 4 deletions cpp/serve/engine_actions/new_request_prefill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* \file serve/engine_actions/new_request_prefill.cc
*/

#include <tvm/runtime/nvtx.h>

#include "../config.h"
#include "../model.h"
#include "../sampler/sampler.h"
Expand Down Expand Up @@ -33,10 +35,17 @@ class NewRequestPrefillActionObj : public EngineActionObj {

Array<Request> Step(EngineState estate) final {
// - Find the requests in `waiting_queue` that can prefill in this step.
auto [rsentries, prefill_lengths] = GetRequestStateEntriesToPrefill(estate);
ICHECK_EQ(rsentries.size(), prefill_lengths.size());
if (rsentries.empty()) {
return {};
Array<RequestStateEntry> rsentries;
std::vector<int> prefill_lengths;
{
NVTXScopedRange nvtx_scope("NewRequestPrefill getting requests");
auto tuple = GetRequestStateEntriesToPrefill(estate);
rsentries = std::move(std::get<0>(tuple));
prefill_lengths = std::move(std::get<1>(tuple));
ICHECK_EQ(rsentries.size(), prefill_lengths.size());
if (rsentries.empty()) {
return {};
}
}

int num_rsentries = rsentries.size();
Expand Down
3 changes: 3 additions & 0 deletions cpp/serve/logit_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "logit_processor.h"

#include <picojson.h>
#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/threading_backend.h>
Expand Down Expand Up @@ -69,6 +70,7 @@ class LogitProcessorImpl : public LogitProcessorObj {
const Array<String>& request_ids, //
const std::vector<int>* cum_num_token, //
const std::vector<std::vector<SampleResult>>* draft_tokens) final {
NVTXScopedRange nvtx_scope("Logit inplace update");
CHECK_EQ(logits->ndim, 2);
CHECK_EQ(logits->shape[1], vocab_size_);
CHECK(logits.DataType() == DataType::Float(32));
Expand Down Expand Up @@ -109,6 +111,7 @@ class LogitProcessorImpl : public LogitProcessorObj {
NDArray ComputeProbsFromLogits(NDArray logits, const Array<GenerationConfig>& generation_cfg,
const Array<String>& request_ids,
const std::vector<int>* cum_num_token) final {
NVTXScopedRange nvtx_scope("Compute probs from logits");
// logits: (n, v)
CHECK_EQ(logits->ndim, 2);
CHECK_LE(logits->shape[0], max_num_token_);
Expand Down
18 changes: 14 additions & 4 deletions cpp/serve/model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <picojson.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

Expand Down Expand Up @@ -72,13 +73,18 @@ class ModelImpl : public ModelObj {
/*********************** Model Computation ***********************/

ObjectRef TokenEmbed(IntTuple token_ids, ObjectRef* dst, int offset) final {
NVTXScopedRange nvtx_scope("TokenEmbed");
int num_tokens = token_ids.size();
// Copy input token ids to device.
DLDataType dtype(DataType::Int(32));
NDArray token_ids_nd = token_ids_storage_->AllocNDArray(offset * 4, {num_tokens}, dtype);
int* p_token_ids = static_cast<int*>(token_ids_nd->data) + (token_ids_nd->byte_offset) / 4;
for (int i = 0; i < num_tokens; ++i) {
p_token_ids[i] = token_ids[i];
NDArray token_ids_nd;
{
NVTXScopedRange nvtx_scope("Allocate token_ids at offset");
token_ids_nd = token_ids_storage_->AllocNDArray(offset * 4, {num_tokens}, dtype);
int* p_token_ids = static_cast<int*>(token_ids_nd->data) + (token_ids_nd->byte_offset) / 4;
for (int i = 0; i < num_tokens; ++i) {
p_token_ids[i] = token_ids[i];
}
}
ICHECK_EQ(token_ids_nd->ndim, 1);
ICHECK_EQ(token_ids_nd->shape[0], num_tokens);
Expand All @@ -96,6 +102,7 @@ class ModelImpl : public ModelObj {
}

ObjectRef ImageEmbed(const NDArray& image, ObjectRef* dst, int offset) final {
NVTXScopedRange nvtx_scope("ImageEmbed");
CHECK(ft_.image_embed_func_.defined()) << "`image_embed` function is not found in the model. ";
auto image_dref_or_nd = ft_.CopyToWorker0(image, "image", image.Shape());
ObjectRef embeddings = ft_.image_embed_func_(image_dref_or_nd, params_);
Expand All @@ -111,6 +118,7 @@ class ModelImpl : public ModelObj {

NDArray BatchPrefill(const ObjectRef& embeddings, const std::vector<int64_t>& seq_ids,
const std::vector<int>& lengths) final {
NVTXScopedRange nvtx_scope("BatchPrefill");
CHECK(!seq_ids.empty());
CHECK_EQ(seq_ids.size(), lengths.size());
int num_sequences = seq_ids.size();
Expand Down Expand Up @@ -180,6 +188,7 @@ class ModelImpl : public ModelObj {
}

NDArray BatchDecode(const ObjectRef& embeddings, const std::vector<int64_t>& seq_ids) final {
NVTXScopedRange nvtx_scope("BatchDecode");
int num_sequence = seq_ids.size();

CHECK(ft_.decode_func_.defined())
Expand Down Expand Up @@ -240,6 +249,7 @@ class ModelImpl : public ModelObj {

NDArray BatchVerify(const ObjectRef& embeddings, const std::vector<int64_t>& seq_ids,
const std::vector<int>& lengths) final {
NVTXScopedRange nvtx_scope("BatchVerify");
CHECK(!seq_ids.empty());
CHECK_EQ(seq_ids.size(), lengths.size());
int num_sequences = seq_ids.size();
Expand Down
2 changes: 2 additions & 0 deletions cpp/serve/sampler/gpu_sampler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* \brief The implementation for GPU sampler functions.
*/
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/packed_func.h>

#include "../../random.h"
Expand Down Expand Up @@ -61,6 +62,7 @@ class GPUSampler : public SamplerObj {
const Array<GenerationConfig>& generation_cfg, //
const std::vector<RandomGenerator*>& rngs, //
std::vector<NDArray>* output_prob_dist) final {
NVTXScopedRange nvtx_scope("BatchSampleTokens");
// probs_on_device: (n, v)
RECORD_EVENT(trace_recorder_, request_ids, "start sampling");
CHECK_EQ(probs_on_device->ndim, 2);
Expand Down

0 comments on commit 4255a45

Please sign in to comment.