Skip to content

Commit

Permalink
[Serving] Support NVTX for benchmarking (#2043)
Browse files Browse the repository at this point in the history
This PR supports MLC serve with NVTX, which helps analyze benchmarking
results.

**Note.** To enable NVTX, please add `set(USE_NVTX ON)` to file
`build/config.cmake`.
  • Loading branch information
MasterJH5574 authored Mar 28, 2024
1 parent cf8d458 commit 4255a45
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 16 deletions.
11 changes: 9 additions & 2 deletions cpp/serve/engine_actions/action_commons.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "action_commons.h"

#include <tvm/runtime/nvtx.h>

namespace mlc {
namespace llm {
namespace serve {
Expand All @@ -19,6 +21,7 @@ void RemoveRequestFromModel(EngineState estate, int64_t req_internal_id, Array<M
void ProcessFinishedRequestStateEntries(std::vector<RequestStateEntry> finished_rsentries,
EngineState estate, Array<Model> models,
int max_single_sequence_length) {
NVTXScopedRange nvtx_scope("Process finished requests");
// - Remove the finished request state entries.
for (const RequestStateEntry& rsentry : finished_rsentries) {
// The finished entry must be a leaf.
Expand Down Expand Up @@ -83,6 +86,7 @@ void ActionStepPostProcess(Array<Request> requests, EngineState estate, Array<Mo
const Tokenizer& tokenizer,
FRequestStreamCallback request_stream_callback,
int max_single_sequence_length) {
NVTXScopedRange nvtx_scope("EngineAction postproc");
std::vector<RequestStateEntry> finished_rsentries;
finished_rsentries.reserve(requests.size());

Expand Down Expand Up @@ -128,8 +132,11 @@ void ActionStepPostProcess(Array<Request> requests, EngineState estate, Array<Mo
}
}

// - Invoke the stream callback function once for all collected requests.
request_stream_callback(callback_delta_outputs);
{
NVTXScopedRange nvtx_scope("Call request stream callback");
// - Invoke the stream callback function once for all collected requests.
request_stream_callback(callback_delta_outputs);
}

ProcessFinishedRequestStateEntries(std::move(finished_rsentries), std::move(estate),
std::move(models), max_single_sequence_length);
Expand Down
18 changes: 12 additions & 6 deletions cpp/serve/engine_actions/batch_decode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* \file serve/engine_actions/batch_decode.cc
*/

#include <tvm/runtime/nvtx.h>

#include <numeric>

#include "../../random.h"
Expand Down Expand Up @@ -40,12 +42,16 @@ class BatchDecodeActionObj : public EngineActionObj {
}

// Preempt request state entries when decode cannot apply.
std::vector<RequestStateEntry> running_rsentries = GetRunningRequestStateEntries(estate);
while (!CanDecode(running_rsentries.size())) {
RequestStateEntry preempted =
PreemptLastRunningRequestStateEntry(estate, models_, trace_recorder_);
if (preempted.same_as(running_rsentries.back())) {
running_rsentries.pop_back();
std::vector<RequestStateEntry> running_rsentries;
{
NVTXScopedRange nvtx_scope("BatchDecode getting requests");
running_rsentries = GetRunningRequestStateEntries(estate);
while (!CanDecode(running_rsentries.size())) {
RequestStateEntry preempted =
PreemptLastRunningRequestStateEntry(estate, models_, trace_recorder_);
if (preempted.same_as(running_rsentries.back())) {
running_rsentries.pop_back();
}
}
}

Expand Down
17 changes: 13 additions & 4 deletions cpp/serve/engine_actions/new_request_prefill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* \file serve/engine_actions/new_request_prefill.cc
*/

#include <tvm/runtime/nvtx.h>

#include "../config.h"
#include "../model.h"
#include "../sampler/sampler.h"
Expand Down Expand Up @@ -33,10 +35,17 @@ class NewRequestPrefillActionObj : public EngineActionObj {

Array<Request> Step(EngineState estate) final {
// - Find the requests in `waiting_queue` that can prefill in this step.
auto [rsentries, prefill_lengths] = GetRequestStateEntriesToPrefill(estate);
ICHECK_EQ(rsentries.size(), prefill_lengths.size());
if (rsentries.empty()) {
return {};
Array<RequestStateEntry> rsentries;
std::vector<int> prefill_lengths;
{
NVTXScopedRange nvtx_scope("NewRequestPrefill getting requests");
auto tuple = GetRequestStateEntriesToPrefill(estate);
rsentries = std::move(std::get<0>(tuple));
prefill_lengths = std::move(std::get<1>(tuple));
ICHECK_EQ(rsentries.size(), prefill_lengths.size());
if (rsentries.empty()) {
return {};
}
}

int num_rsentries = rsentries.size();
Expand Down
3 changes: 3 additions & 0 deletions cpp/serve/logit_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "logit_processor.h"

#include <picojson.h>
#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/threading_backend.h>
Expand Down Expand Up @@ -69,6 +70,7 @@ class LogitProcessorImpl : public LogitProcessorObj {
const Array<String>& request_ids, //
const std::vector<int>* cum_num_token, //
const std::vector<std::vector<SampleResult>>* draft_tokens) final {
NVTXScopedRange nvtx_scope("Logit inplace update");
CHECK_EQ(logits->ndim, 2);
CHECK_EQ(logits->shape[1], vocab_size_);
CHECK(logits.DataType() == DataType::Float(32));
Expand Down Expand Up @@ -109,6 +111,7 @@ class LogitProcessorImpl : public LogitProcessorObj {
NDArray ComputeProbsFromLogits(NDArray logits, const Array<GenerationConfig>& generation_cfg,
const Array<String>& request_ids,
const std::vector<int>* cum_num_token) final {
NVTXScopedRange nvtx_scope("Compute probs from logits");
// logits: (n, v)
CHECK_EQ(logits->ndim, 2);
CHECK_LE(logits->shape[0], max_num_token_);
Expand Down
18 changes: 14 additions & 4 deletions cpp/serve/model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <picojson.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

Expand Down Expand Up @@ -72,13 +73,18 @@ class ModelImpl : public ModelObj {
/*********************** Model Computation ***********************/

ObjectRef TokenEmbed(IntTuple token_ids, ObjectRef* dst, int offset) final {
NVTXScopedRange nvtx_scope("TokenEmbed");
int num_tokens = token_ids.size();
// Copy input token ids to device.
DLDataType dtype(DataType::Int(32));
NDArray token_ids_nd = token_ids_storage_->AllocNDArray(offset * 4, {num_tokens}, dtype);
int* p_token_ids = static_cast<int*>(token_ids_nd->data) + (token_ids_nd->byte_offset) / 4;
for (int i = 0; i < num_tokens; ++i) {
p_token_ids[i] = token_ids[i];
NDArray token_ids_nd;
{
NVTXScopedRange nvtx_scope("Allocate token_ids at offset");
token_ids_nd = token_ids_storage_->AllocNDArray(offset * 4, {num_tokens}, dtype);
int* p_token_ids = static_cast<int*>(token_ids_nd->data) + (token_ids_nd->byte_offset) / 4;
for (int i = 0; i < num_tokens; ++i) {
p_token_ids[i] = token_ids[i];
}
}
ICHECK_EQ(token_ids_nd->ndim, 1);
ICHECK_EQ(token_ids_nd->shape[0], num_tokens);
Expand All @@ -96,6 +102,7 @@ class ModelImpl : public ModelObj {
}

ObjectRef ImageEmbed(const NDArray& image, ObjectRef* dst, int offset) final {
NVTXScopedRange nvtx_scope("ImageEmbed");
CHECK(ft_.image_embed_func_.defined()) << "`image_embed` function is not found in the model. ";
auto image_dref_or_nd = ft_.CopyToWorker0(image, "image", image.Shape());
ObjectRef embeddings = ft_.image_embed_func_(image_dref_or_nd, params_);
Expand All @@ -111,6 +118,7 @@ class ModelImpl : public ModelObj {

NDArray BatchPrefill(const ObjectRef& embeddings, const std::vector<int64_t>& seq_ids,
const std::vector<int>& lengths) final {
NVTXScopedRange nvtx_scope("BatchPrefill");
CHECK(!seq_ids.empty());
CHECK_EQ(seq_ids.size(), lengths.size());
int num_sequences = seq_ids.size();
Expand Down Expand Up @@ -180,6 +188,7 @@ class ModelImpl : public ModelObj {
}

NDArray BatchDecode(const ObjectRef& embeddings, const std::vector<int64_t>& seq_ids) final {
NVTXScopedRange nvtx_scope("BatchDecode");
int num_sequence = seq_ids.size();

CHECK(ft_.decode_func_.defined())
Expand Down Expand Up @@ -240,6 +249,7 @@ class ModelImpl : public ModelObj {

NDArray BatchVerify(const ObjectRef& embeddings, const std::vector<int64_t>& seq_ids,
const std::vector<int>& lengths) final {
NVTXScopedRange nvtx_scope("BatchVerify");
CHECK(!seq_ids.empty());
CHECK_EQ(seq_ids.size(), lengths.size());
int num_sequences = seq_ids.size();
Expand Down
2 changes: 2 additions & 0 deletions cpp/serve/sampler/gpu_sampler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* \brief The implementation for GPU sampler functions.
*/
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/packed_func.h>

#include "../../random.h"
Expand Down Expand Up @@ -61,6 +62,7 @@ class GPUSampler : public SamplerObj {
const Array<GenerationConfig>& generation_cfg, //
const std::vector<RandomGenerator*>& rngs, //
std::vector<NDArray>* output_prob_dist) final {
NVTXScopedRange nvtx_scope("BatchSampleTokens");
// probs_on_device: (n, v)
RECORD_EVENT(trace_recorder_, request_ids, "start sampling");
CHECK_EQ(probs_on_device->ndim, 2);
Expand Down

0 comments on commit 4255a45

Please sign in to comment.