From d2bc6960ac9b42889945622e50964c48820b7177 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Fri, 21 Feb 2025 16:41:26 -0500 Subject: [PATCH 1/7] Add constant generation setting --- loadgen/bindings/python_api.cc | 2 + loadgen/demos/py_demo_constant_gen.py | 75 +++++++++++++++++++++++++++ loadgen/loadgen.cc | 18 +++++-- loadgen/test_settings.h | 4 ++ loadgen/test_settings_internal.cc | 7 +++ loadgen/test_settings_internal.h | 1 + 6 files changed, 104 insertions(+), 3 deletions(-) create mode 100644 loadgen/demos/py_demo_constant_gen.py diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 96396dab92..f91fabf46a 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -310,6 +310,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::server_max_async_queries) .def_readwrite("server_num_issue_query_threads", &TestSettings::server_num_issue_query_threads) + .def_readwrite("server_constant_gen", + &TestSettings::server_constant_gen) .def_readwrite("offline_expected_qps", &TestSettings::offline_expected_qps) .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) diff --git a/loadgen/demos/py_demo_constant_gen.py b/loadgen/demos/py_demo_constant_gen.py new file mode 100644 index 0000000000..f12b724e8a --- /dev/null +++ b/loadgen/demos/py_demo_constant_gen.py @@ -0,0 +1,75 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + time.sleep(0.001) + responses = [] + for s in query_samples: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.server_target_qps = 100 + settings.server_target_latency_ns = 100000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + settings.server_constant_gen = True + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 42b2140de2..09bc781420 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -207,6 +207,13 @@ auto ScheduleDistribution(double qps) { }; } +auto ScheduleConstantDistribution(double qps){ + return [dist = std::uniform_real_distribution<>(1.0 / qps)](auto& gen) mutable { + return std::chrono::duration_cast( + std::chrono::duration(dist(gen))); + }; +} + /// \brief Selects samples for the accuracy mode. template auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) { @@ -310,8 +317,9 @@ std::vector GenerateQueries( auto sample_distribution_equal_issue = SampleDistributionEqualIssue( min_queries, loaded_samples.size(), &sample_rng); - auto schedule_distribution = - ScheduleDistribution(settings.target_qps); + TestScenario temp_scenario = scenario; + auto schedule_distribution = ScheduleDistribution(settings.target_qps); + auto schedule_constant_distribution = ScheduleConstantDistribution(settings.target_qps); // When sample_concatenate_permutation is turned on, pad to a multiple of the // complete dataset to ensure fairness. @@ -397,7 +405,11 @@ std::vector GenerateQueries( } queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); prev_timestamp = timestamp; - timestamp += schedule_distribution(schedule_rng); + if (settings.server_constant_gen && (scenario == TestScenario::Server)){ + timestamp += schedule_constant_distribution(schedule_rng); + } else { + timestamp += schedule_distribution(schedule_rng); + } // In equal_issue mode, the min_queries will be bumped up by a multiple of // the dataset size if the test time has not met the threshold. if (enable_equal_issue && (queries.size() >= min_queries) && diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 584d073bb8..7bb604d43c 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -169,6 +169,10 @@ struct TestSettings { /// StartTest() will be used to call IssueQuery(). See also /// mlperf::RegisterIssueQueryThread(). 
uint64_t server_num_issue_query_threads = 0;
+  /// \brief If set to true, the time between samples generated by LoadGen
+  /// in the Server scenario is constant. Otherwise, the time between
+  /// samples follows an exponential distribution.
+  bool server_constant_gen = false;
   /**@}*/
 
   // ==================================
diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc
index 3f2cd88473..b21436a6b4 100644
--- a/loadgen/test_settings_internal.cc
+++ b/loadgen/test_settings_internal.cc
@@ -53,6 +53,7 @@ TestSettingsInternal::TestSettingsInternal(
       use_token_latencies(requested.use_token_latencies),
       server_ttft_latency(requested.server_ttft_latency),
       server_tpot_latency(requested.server_tpot_latency),
+      server_constant_gen(requested.server_constant_gen),
       infer_token_latencies(requested.infer_token_latencies),
       token_latency_scaling_factor(requested.token_latency_scaling_factor) {
   // Target QPS, target latency, and max_async_queries.
@@ -305,6 +306,8 @@ void LogRequestedTestSettings(const TestSettings &s) {
                  s.server_max_async_queries);
       MLPERF_LOG(detail, "requested_server_num_issue_query_threads",
                  s.server_num_issue_query_threads);
+      MLPERF_LOG(detail, "requested_server_constant_gen",
+                 s.server_constant_gen);
       break;
     case TestScenario::Offline:
       MLPERF_LOG(detail, "requested_offline_expected_qps",
@@ -452,6 +455,8 @@ void TestSettingsInternal::LogEffectiveSettings() const {
              s.performance_sample_count);
   MLPERF_LOG(detail, "effective_sample_concatenate_permutation",
              s.sample_concatenate_permutation);
+  MLPERF_LOG(detail, "effective_server_constant_gen",
+             s.server_constant_gen);
 #else
   detail("");
   detail("Effective Settings:");
@@ -772,6 +777,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
     server_coalesce_queries = (val == 0) ? false : true;
   if (lookupkv(model, "Server", "max_async_queries", &val, nullptr))
     server_max_async_queries = int(val);
+  if (lookupkv(model, "Server", "constant_gen", &val, nullptr))
+    server_constant_gen = (val == 0) ?
false : true; lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr); lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr); diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index ab2773bd18..6356a48ccf 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -85,6 +85,7 @@ struct TestSettingsInternal { bool use_token_latencies = false; int64_t server_ttft_latency; int64_t server_tpot_latency; + bool server_constant_gen; bool infer_token_latencies = false; int64_t token_latency_scaling_factor; From f1e962062beef0f856d1049b2713e5f86f6601bb Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 19 Mar 2025 00:46:28 -0500 Subject: [PATCH 2/7] Implementation of benchmark with grouped queries --- loadgen/bindings/c_api.cc | 3 + loadgen/bindings/python_api.cc | 103 ++++++++++++++++++++++++- loadgen/loadgen.cc | 120 +++++++++++++++++++++++++----- loadgen/query_sample_library.h | 3 + loadgen/test_settings.h | 3 + loadgen/test_settings_internal.cc | 7 +- loadgen/test_settings_internal.h | 1 + 7 files changed, 217 insertions(+), 23 deletions(-) diff --git a/loadgen/bindings/c_api.cc b/loadgen/bindings/c_api.cc index 0248a1c163..86f03cfa0c 100644 --- a/loadgen/bindings/c_api.cc +++ b/loadgen/bindings/c_api.cc @@ -88,6 +88,9 @@ class QuerySampleLibraryTrampoline : public QuerySampleLibrary { const std::string& Name() override { return name_; } size_t TotalSampleCount() override { return total_sample_count_; } size_t PerformanceSampleCount() override { return performance_sample_count_; } + size_t GroupSize(size_t i) override { return 1; } + size_t GroupOf(size_t i) override { return i; } + size_t NumberOfGroups() override { return total_sample_count_; } void LoadSamplesToRam(const std::vector& samples) override { (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size()); diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index f91fabf46a..149573464a 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -28,6 +28,7 @@ limitations under the License. #include "pybind11/pybind11.h" #include "pybind11/stl.h" #include "pybind11/stl_bind.h" +#include "pybind11/numpy.h" namespace mlperf { @@ -109,8 +110,10 @@ class QuerySampleLibraryTrampoline : public QuerySampleLibrary { ~QuerySampleLibraryTrampoline() override = default; const std::string& Name() override { return name_; } - size_t TotalSampleCount() { return total_sample_count_; } - size_t PerformanceSampleCount() { return performance_sample_count_; } + size_t TotalSampleCount() override { return total_sample_count_; } + size_t PerformanceSampleCount() override { return performance_sample_count_; } + size_t GroupSize(size_t i) override { return 1; } + size_t NumberOfGroups() override { return total_sample_count_; } void LoadSamplesToRam(const std::vector& samples) override { pybind11::gil_scoped_acquire gil_acquirer; @@ -130,6 +133,65 @@ class QuerySampleLibraryTrampoline : public QuerySampleLibrary { UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; }; +// Forwards QuerySampleLibrary calls to relevant callbacks. 
+class GroupedQuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + GroupedQuerySampleLibraryTrampoline( + std::string name, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb, + pybind11::array_t& group_sizes) + : name_(std::move(name)), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) { + + total_sample_count_ = 0; + if(group_sizes.ndim() != 1){ + throw std::runtime_error("Group sizes should be a 1D Numpy array"); + } + auto buffer = group_sizes.request(); + size_t* ptr = (size_t*)buffer.ptr; + + for(ssize_t i = 0; i < group_sizes.shape()[0]; i++){ + group_sizes_.push_back(ptr[i]); + total_sample_count_ += ptr[i]; + for(ssize_t j = 0; j < ptr[i]; j++){ + group_idx_.push_back(i); + } + } + } + ~GroupedQuerySampleLibraryTrampoline() override = default; + + const std::string& Name() override { return name_; } + size_t TotalSampleCount() override { return total_sample_count_; } + size_t PerformanceSampleCount() override { return performance_sample_count_; } + size_t GroupSize(size_t i) override { return group_sizes_[i]; } + size_t GroupOf(size_t i) override { return group_idx_[i]; } + size_t NumberOfGroups() override { return group_sizes_.size(); } + + void LoadSamplesToRam(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + load_samples_to_ram_cb_(samples); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + unload_samples_from_ram_cb_(samples); + } + + private: + std::string name_; + std::vector group_sizes_; + std::vector group_idx_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + + // A QDL that allows defining callbacks for // IssueQuery, FlushQueries, and Name methods. 
class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary { @@ -226,6 +288,23 @@ void DestroyQDL(uintptr_t qdl) { delete qdl_cast; } +uintptr_t ConstructGroupedQSL( + pybind11::array_t& group_sizes, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + GroupedQuerySampleLibraryTrampoline* qsl = new GroupedQuerySampleLibraryTrampoline( + "PyQSL", performance_sample_count, + load_samples_to_ram_cb, unload_samples_from_ram_cb, group_sizes); + return reinterpret_cast(qsl); +} + +void DestroyGroupedQSL(uintptr_t qdl) { + QueryDispatchLibraryTrampoline* qdl_cast = + reinterpret_cast(qdl); + delete qdl_cast; +} + void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, const std::string& audit_config_filename) { pybind11::gil_scoped_release gil_releaser; @@ -251,6 +330,20 @@ void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl, audit_config_filename); } +void StartTestWithGroupedTest( + uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, + const std::string& audit_config_filename){ + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + GroupedQuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + assert(TestSettings.use_grouped_qsl); + mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings, + audit_config_filename); +} + using ResponseCallback = std::function; /// TODO: Get rid of copies. @@ -342,6 +435,7 @@ PYBIND11_MODULE(mlperf_loadgen, m) { .def_readwrite("test05_schedule_rng_seed", &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) + .def_readwrite("use_grouped_qsl", &TestSettings::use_grouped_qsl) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) .def_readwrite("infer_token_latencies", @@ -456,6 +550,11 @@ PYBIND11_MODULE(mlperf_loadgen, m) { m.def("DestroyQDL", &py::DestroyQDL, "Destroy the object created by ConstructQDL."); + m.def("ConstructGroupedQSL", &py::ConstructGroupedQSL, + "Construct the query sample library."); + m.def("DestroyGroupedQSL", &py::DestroyQSL, + "Destroy the object created by ConstructGroupedQSL."); + m.def("StartTest", &py::StartTest, "Run tests on a SUT created by ConstructSUT() with the provided QSL. " "Uses default log settings.", diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 09bc781420..7bd85c8e19 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -264,6 +264,7 @@ auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size, /// the QPS as scheduled is equal to the QPS as requested. 
template <TestMode mode, TestScenario scenario>
std::vector<QueryMetadata> GenerateQueries(
+    QuerySampleLibrary* qsl,
     const TestSettingsInternal& settings,
     const LoadableSampleSet& loaded_sample_set, SequenceGen* sequence_gen,
     ResponseDelegate* response_delegate) {
@@ -347,6 +348,29 @@ std::vector<QueryMetadata> GenerateQueries(
   // Choose a single sample to repeat when in performance_issue_same mode
   QuerySampleIndex same_sample = settings.performance_issue_same_index;
 
+  // Variables for handling group test
+  QuerySampleIndex global_idx = 0;
+  std::vector<size_t> groups;
+  std::vector<size_t> groups_first;
+  size_t number_of_groups = 0;
+  size_t g, group_size;
+
+  if (settings.use_grouped_qsl) {
+    size_t current_idx = 0;
+    while (current_idx < loaded_samples.size())
+    {
+      size_t current_group = qsl->GroupOf(loaded_samples[current_idx]);
+      groups.push_back(current_group);
+      groups_first.push_back(current_idx);
+      current_idx += qsl->GroupSize(loaded_samples[current_idx]);
+      number_of_groups++;
+    }
+
+  }
+
+  auto grouped_sample_distribution = SampleDistribution<mode>(
+      number_of_groups, sample_stride, &sample_rng);
+
   while (prev_timestamp < gen_duration || queries.size() < min_queries) {
     if (kIsMultiStream) {
       QuerySampleIndex sample_i = settings.performance_issue_unique
@@ -393,6 +417,9 @@ std::vector<QueryMetadata> GenerateQueries(
           assert(remainder == 0);
         }
       }
+    } else if (settings.use_grouped_qsl) {
+      g = grouped_sample_distribution(sample_rng);
+      group_size = qsl->GroupSize(loaded_samples[groups_first[g]]);
     } else {
       for (auto& s : samples) {
         s = loaded_samples[settings.performance_issue_unique
@@ -403,10 +430,21 @@ std::vector<QueryMetadata> GenerateQueries(
                                : sample_distribution(sample_rng)];
       }
     }
-    queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+    if (!settings.use_grouped_qsl) {
+      queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+    } else {
+      for (size_t i = 0; i < group_size; i++){
+        samples[0] = loaded_samples[groups_first[g]+i];
+        queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+      }
+    }
     prev_timestamp = timestamp;
     if (settings.server_constant_gen && (scenario == TestScenario::Server)){
-      timestamp += schedule_constant_distribution(schedule_rng);
+      if(!settings.use_grouped_qsl){
+        timestamp += schedule_constant_distribution(schedule_rng);
+      } else {
+        timestamp += group_size * schedule_constant_distribution(schedule_rng);
+      }
     } else {
       timestamp += schedule_distribution(schedule_rng);
     }
     // In equal_issue mode, the min_queries will be bumped up by a multiple of
     // the dataset size if the test time has not met the threshold.
@@ -453,7 +491,8 @@
 // no longer generates queries on the fly. Should we reduce the
 // use of templates?
 template <TestMode mode, TestScenario scenario>
-PerformanceResult IssueQueries(SystemUnderTest* sut,
+PerformanceResult IssueQueries(SystemUnderTest* sut,
+                               QuerySampleLibrary* qsl,
                                const TestSettingsInternal& settings,
                                const LoadableSampleSet& loaded_sample_set,
                                SequenceGen* sequence_gen) {
@@ -469,7 +508,7 @@
   // Generate queries.
   auto sequence_id_start = sequence_gen->CurrentSampleId();
   std::vector<QueryMetadata> queries = GenerateQueries<mode, scenario>(
-      settings, loaded_sample_set, sequence_gen, &response_logger);
+      qsl, settings, loaded_sample_set, sequence_gen, &response_logger);
 
   // Calculated expected number of queries
   uint64_t expected_queries =
@@ -644,12 +683,35 @@ std::vector<LoadableSampleSet> GenerateLoadableSets(
   // Generate indices for all available samples in the QSL.
   const size_t qsl_total_count = qsl->TotalSampleCount();
   std::vector<QuerySampleIndex> samples(qsl_total_count);
-  for (size_t i = 0; i < qsl_total_count; i++) {
-    samples[i] = static_cast<QuerySampleIndex>(i);
-  }
+  std::vector<size_t> groupIdx(qsl_total_count);
+  if (!settings.use_grouped_qsl){
+    for (size_t i = 0; i < qsl_total_count; i++) {
+      samples[i] = static_cast<QuerySampleIndex>(i);
+    }
 
-  // Randomize the order of the samples.
-  std::shuffle(samples.begin(), samples.end(), qsl_rng);
+    // Randomize the order of the samples.
+    std::shuffle(samples.begin(), samples.end(), qsl_rng);
+  } else {
+    // If using a grouped QSL, we randomize the order of the groups.
+    // The samples within a group maintain their order.
+    size_t number_of_groups = qsl->NumberOfGroups();
+    size_t acumCount = 0, idx = 0;
+    std::vector<size_t> groups(number_of_groups);
+    std::vector<size_t> acumSizes(number_of_groups);
+    for (size_t i = 0; i < number_of_groups; i++) {
+      groups[i] = static_cast<size_t>(i);
+      acumSizes[i] = acumCount;
+      acumCount += qsl->GroupSize(i);
+    }
+    std::shuffle(groups.begin(), groups.end(), qsl_rng);
+    for (size_t i = 0; i < number_of_groups; i++) {
+      for (size_t j = 0; j < qsl->GroupSize(groups[i]); j++) {
+        samples[idx] = acumSizes[groups[i]] + j;
+        groupIdx[idx] = groups[i];
+        idx++;
+      }
+    }
+  }
 
   // Partition the samples into loadable sets.
   const size_t set_size = settings.performance_sample_count;
@@ -659,12 +721,30 @@ std::vector<LoadableSampleSet> GenerateLoadableSets(
   std::vector<QuerySampleIndex> loadable_set;
   loadable_set.reserve(set_size + set_padding);
 
-  for (auto s : samples) {
-    loadable_set.push_back(s);
-    if (loadable_set.size() == set_size) {
-      result.push_back({std::move(loadable_set), set_size});
-      loadable_set.clear();
-      loadable_set.reserve(set_size + set_padding);
+  if (!settings.use_grouped_qsl){
+    for (auto s : samples) {
+      loadable_set.push_back(s);
+      if (loadable_set.size() == set_size) {
+        result.push_back({std::move(loadable_set), set_size});
+        loadable_set.clear();
+        loadable_set.reserve(set_size + set_padding);
+      }
+    }
+  } else {
+    size_t idx = 0;
+    size_t number_of_groups = qsl->NumberOfGroups();
+    for (size_t i = 0; i < number_of_groups; i++) {
+      size_t group_size = qsl->GroupSize(groupIdx[idx]);
+      if (loadable_set.size() + group_size < set_size) {
+        for (size_t j = 0; j < group_size; j++) {
+          loadable_set.push_back(samples[idx]);
+          idx++;
+        }
+      } else {
+        result.push_back({std::move(loadable_set), loadable_set.size()});
+        loadable_set.clear();
+        loadable_set.reserve(set_size + set_padding);
+      }
     }
   }
 
@@ -771,7 +851,7 @@ std::pair<PerformanceSummary, PerformanceSummary> FindBoundaries(
   LoadSamplesToRam(qsl, performance_set.set);
 
   PerformanceResult u_pr(IssueQueries<TestMode::FindPeakPerformance, TestScenario::Server>(
-      sut, u_settings, performance_set, sequence_gen));
+      sut, qsl, u_settings, performance_set, sequence_gen));
   PerformanceSummary u_perf_summary{sut->Name(), u_settings, std::move(u_pr)};
   qsl->UnloadSamplesFromRam(performance_set.set);
 
@@ -823,7 +903,7 @@ PerformanceSummary FindPeakPerformanceBinarySearch(
       });
 
   PerformanceResult m_pr(IssueQueries<TestMode::FindPeakPerformance, TestScenario::Server>(
-      sut, m_settings, performance_set, sequence_gen));
+      sut, qsl, m_settings, performance_set, sequence_gen));
   PerformanceSummary m_perf_summary{sut->Name(), m_settings, std::move(m_pr)};
 
   std::string tmp;
@@ -868,7 +948,7 @@ void RunPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
   }
 
   PerformanceResult pr(IssueQueries<TestMode::PerformanceOnly, scenario>(
-      sut, settings, performance_set, sequence_gen));
+      sut, qsl, settings, performance_set, sequence_gen));
 
   // Measure PerfClock/system_clock timer durations for comparison vs
   // external timer.
@@ -991,7 +1071,7 @@ void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl, LoadSamplesToRam(qsl, base_performance_set.set); PerformanceResult base_pr(IssueQueries( - sut, base_settings, base_performance_set, sequence_gen)); + sut, qsl, base_settings, base_performance_set, sequence_gen)); PerformanceSummary base_perf_summary{sut->Name(), base_settings, std::move(base_pr)}; @@ -1113,7 +1193,7 @@ void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl, } PerformanceResult pr(IssueQueries( - sut, settings, loadable_set, sequence_gen)); + sut, qsl, settings, loadable_set, sequence_gen)); { auto tracer = MakeScopedTracer( diff --git a/loadgen/query_sample_library.h b/loadgen/query_sample_library.h index 7258068cbc..03539d0d9a 100644 --- a/loadgen/query_sample_library.h +++ b/loadgen/query_sample_library.h @@ -66,6 +66,9 @@ class QuerySampleLibrary { /// * A previously unloaded sample will not be unloaded again. virtual void UnloadSamplesFromRam( const std::vector& samples) = 0; + virtual size_t GroupSize(size_t i) = 0; + virtual size_t GroupOf(size_t i) = 0; + virtual size_t NumberOfGroups() = 0; }; /// @} diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 7bb604d43c..2e0fee7548 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -239,6 +239,9 @@ struct TestSettings { uint64_t test05_sample_index_rng_seed = 0; uint64_t test05_schedule_rng_seed = 0; + /// \brief Flag to use a grouped qsl when running the benchmark test + bool use_grouped_qsl = false; + /// \brief Load mlperf parameter config from file. int FromConfig(const std::string &path, const std::string &model, const std::string &scenario, int conf_type = 1); diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index b21436a6b4..0b59563e0d 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -55,7 +55,8 @@ TestSettingsInternal::TestSettingsInternal( server_tpot_latency(requested.server_tpot_latency), server_constant_gen(requested.server_constant_gen), infer_token_latencies(requested.infer_token_latencies), - token_latency_scaling_factor(requested.token_latency_scaling_factor) { + token_latency_scaling_factor(requested.token_latency_scaling_factor), + use_grouped_qsl(requested.use_grouped_qsl) { // Target QPS, target latency, and max_async_queries. switch (requested.scenario) { case TestScenario::SingleStream: @@ -745,6 +746,10 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, lookupkv(model, scenario, "token_latency_scaling_factor", &token_latency_scaling_factor, nullptr, 1); } + // use_grouped_qsl + if (lookupkv(model, scenario, "use_grouped_qsl", &val, nullptr)) { + use_grouped_qsl = (val == 1) ? 
true : false; + } // keys that apply to SingleStream lookupkv(model, "SingleStream", "target_latency_percentile", nullptr, &single_stream_target_latency_percentile, 0.01); diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 6356a48ccf..3d54cf4c98 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -89,6 +89,7 @@ struct TestSettingsInternal { bool infer_token_latencies = false; int64_t token_latency_scaling_factor; + bool use_grouped_qsl = false; }; /// \brief A namespace of collections of FindPeakPerformance helper functions, From 983a5681c26f74f69080629cdf3294687a5b4ffa Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 20 Mar 2025 18:33:19 -0500 Subject: [PATCH 3/7] Small fixes for grouped benchmark run --- loadgen/benchmark/repro.cpp | 3 + loadgen/bindings/python_api.cc | 12 ++-- loadgen/demos/py_demo_grouped_qsl.py | 82 ++++++++++++++++++++++++++++ loadgen/loadgen.cc | 24 ++++---- 4 files changed, 105 insertions(+), 16 deletions(-) create mode 100644 loadgen/demos/py_demo_grouped_qsl.py diff --git a/loadgen/benchmark/repro.cpp b/loadgen/benchmark/repro.cpp index 44ff53efa9..e75805e479 100644 --- a/loadgen/benchmark/repro.cpp +++ b/loadgen/benchmark/repro.cpp @@ -33,6 +33,9 @@ class QSL : public mlperf::QuerySampleLibrary { const std::string& Name() override { return mName; } size_t TotalSampleCount() override { return 1000000; } size_t PerformanceSampleCount() override { return TotalSampleCount(); } + size_t GroupSize(size_t i) override { return 1; } + size_t GroupOf(size_t i) override { return i; } + size_t NumberOfGroups() override { return TotalSampleCount(); } void LoadSamplesToRam(const std::vector&) override { } void UnloadSamplesFromRam( diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 149573464a..42ec6024ab 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -113,6 +113,7 @@ class QuerySampleLibraryTrampoline : public QuerySampleLibrary { size_t TotalSampleCount() override { return total_sample_count_; } size_t PerformanceSampleCount() override { return performance_sample_count_; } size_t GroupSize(size_t i) override { return 1; } + size_t GroupOf(size_t i) override { return i; } size_t NumberOfGroups() override { return total_sample_count_; } void LoadSamplesToRam(const std::vector& samples) override { @@ -157,7 +158,7 @@ class GroupedQuerySampleLibraryTrampoline : public QuerySampleLibrary { for(ssize_t i = 0; i < group_sizes.shape()[0]; i++){ group_sizes_.push_back(ptr[i]); total_sample_count_ += ptr[i]; - for(ssize_t j = 0; j < ptr[i]; j++){ + for(size_t j = 0; j < ptr[i]; j++){ group_idx_.push_back(i); } } @@ -330,7 +331,7 @@ void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl, audit_config_filename); } -void StartTestWithGroupedTest( +void StartTestWithGroupedQSL( uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, const std::string& audit_config_filename){ pybind11::gil_scoped_release gil_releaser; @@ -339,7 +340,7 @@ void StartTestWithGroupedTest( GroupedQuerySampleLibraryTrampoline* qsl_cast = reinterpret_cast(qsl); LogSettings default_log_settings; - assert(TestSettings.use_grouped_qsl); + assert(test_settings.use_grouped_qsl); mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings, audit_config_filename); } @@ -551,7 +552,7 @@ PYBIND11_MODULE(mlperf_loadgen, m) { "Destroy the object created by ConstructQDL."); m.def("ConstructGroupedQSL", &py::ConstructGroupedQSL, - "Construct the query 
sample library."); + "Construct grouped query sample library."); m.def("DestroyGroupedQSL", &py::DestroyQSL, "Destroy the object created by ConstructGroupedQSL."); @@ -577,6 +578,9 @@ PYBIND11_MODULE(mlperf_loadgen, m) { "IssueQuery calls have finished.", pybind11::arg("responses"), pybind11::arg("response_cb") = ResponseCallback{}); + m.def("StartTestWithGroupedQSL", &py::StartTestWithGroupedQSL, + "Run tests on a SUT created by ConstructSUT() and a QSL created by" + "ConstructGroupedQSL"); } } // namespace py diff --git a/loadgen/demos/py_demo_grouped_qsl.py b/loadgen/demos/py_demo_grouped_qsl.py new file mode 100644 index 0000000000..02b98c486f --- /dev/null +++ b/loadgen/demos/py_demo_grouped_qsl.py @@ -0,0 +1,82 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +import numpy as np +from absl import app +import mlperf_loadgen + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + time.sleep(0.001) + responses = [] + for s in query_samples: + print(s.index) + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.server_target_qps = 100 + settings.server_target_latency_ns = 100000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + settings.server_constant_gen = True + settings.use_grouped_qsl = True + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructGroupedQSL( + np.array([16 for _ in range(64)] + [32 for _ in range(32)], dtype=np.uint64), 16*128, load_samples_to_ram, unload_samples_from_ram + ) + + # qsl = mlperf_loadgen.ConstructQSL( + # 1024, 128, load_samples_to_ram, unload_samples_from_ram + # ) + mlperf_loadgen.StartTestWithGroupedQSL(sut, qsl, settings, "") + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 7bd85c8e19..ce97fd590f 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -208,7 +208,7 @@ auto ScheduleDistribution(double qps) { } auto ScheduleConstantDistribution(double qps){ - return [dist = std::uniform_real_distribution<>(1.0 / qps)](auto& gen) mutable { + return [dist = std::uniform_real_distribution<>(1.0 / qps, 1.0 / qps)](auto& gen) 
mutable {
     return std::chrono::duration_cast<std::chrono::nanoseconds>(
         std::chrono::duration<double>(dist(gen)));
   };
@@ -349,11 +349,10 @@ std::vector<QueryMetadata> GenerateQueries(
   QuerySampleIndex same_sample = settings.performance_issue_same_index;
 
   // Variables for handling group test
-  QuerySampleIndex global_idx = 0;
   std::vector<size_t> groups;
   std::vector<size_t> groups_first;
   size_t number_of_groups = 0;
-  size_t g, group_size;
+  size_t g = 0, group_size = 1;
 
   if (settings.use_grouped_qsl) {
     size_t current_idx = 0;
@@ -362,10 +361,9 @@ std::vector<QueryMetadata> GenerateQueries(
       size_t current_group = qsl->GroupOf(loaded_samples[current_idx]);
       groups.push_back(current_group);
       groups_first.push_back(current_idx);
-      current_idx += qsl->GroupSize(loaded_samples[current_idx]);
+      current_idx += qsl->GroupSize(current_group);
       number_of_groups++;
     }
-
   }
 
   auto grouped_sample_distribution = SampleDistribution<mode>(
       number_of_groups, sample_stride, &sample_rng);
@@ -419,7 +417,7 @@ std::vector<QueryMetadata> GenerateQueries(
     }
     } else if (settings.use_grouped_qsl) {
       g = grouped_sample_distribution(sample_rng);
-      group_size = qsl->GroupSize(loaded_samples[groups_first[g]]);
+      group_size = qsl->GroupSize(qsl->GroupOf(groups_first[g]));
     } else {
       for (auto& s : samples) {
         s = loaded_samples[settings.performance_issue_unique
@@ -430,23 +428,24 @@ std::vector<QueryMetadata> GenerateQueries(
                                : sample_distribution(sample_rng)];
       }
     }
+    prev_timestamp = timestamp;
     if (!settings.use_grouped_qsl) {
       queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
     } else {
       for (size_t i = 0; i < group_size; i++){
         samples[0] = loaded_samples[groups_first[g]+i];
         queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+        timestamp += schedule_constant_distribution(schedule_rng);
       }
+      prev_timestamp = timestamp - schedule_constant_distribution(schedule_rng);
     }
-    prev_timestamp = timestamp;
-    if (settings.server_constant_gen && (scenario == TestScenario::Server)){
-      if(!settings.use_grouped_qsl){
+
+    if (!settings.use_grouped_qsl){
+      if (settings.server_constant_gen && (scenario == TestScenario::Server)){
         timestamp += schedule_constant_distribution(schedule_rng);
       } else {
-        timestamp += group_size * schedule_constant_distribution(schedule_rng);
+        timestamp += schedule_distribution(schedule_rng);
       }
-    } else {
-      timestamp += schedule_distribution(schedule_rng);
     }
     // In equal_issue mode, the min_queries will be bumped up by a multiple of
     // the dataset size if the test time has not met the threshold.
@@ -455,6 +454,7 @@ std::vector GenerateQueries( (scenario != TestScenario::Offline)) { min_queries += loaded_samples.size(); } + } // See if we need to create a "remainder" query for offline+accuracy to From c73bd63964c262fdd07569408248186e7e41ac00 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 26 Mar 2025 00:28:05 -0500 Subject: [PATCH 4/7] Add group sizes to PerformanceResults + report group latency results --- loadgen/bindings/python_api.cc | 2 +- loadgen/issue_query_controller.cc | 8 +++ loadgen/issue_query_controller.h | 2 + loadgen/loadgen.cc | 20 ++++++- loadgen/results.cc | 87 ++++++++++++++++++++++++++++++- loadgen/results.h | 12 +++++ loadgen/test_settings_internal.cc | 5 +- loadgen/test_settings_internal.h | 1 + 8 files changed, 132 insertions(+), 5 deletions(-) diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 42ec6024ab..55c5747b67 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -340,7 +340,7 @@ void StartTestWithGroupedQSL( GroupedQuerySampleLibraryTrampoline* qsl_cast = reinterpret_cast(qsl); LogSettings default_log_settings; - assert(test_settings.use_grouped_qsl); + assert(TestSettings.use_grouped_qsl); mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings, audit_config_filename); } diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index c1abea9d14..ec25cdbf7e 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -107,6 +107,14 @@ void QueryMetadata::CoalesceQueries(QueryMetadata* queries, size_t first, void QueryMetadata::Decoalesce() { query_to_send.resize(1); } +std::vector QueryMetadata::GetSampleIndices(){ + std::vector sample_indices; + for (auto s: this->samples_){ + sample_indices.push_back(s.sample_index); + } + return sample_indices; +} + /// \brief A base template that should never be used since each scenario has /// its own specialization. template diff --git a/loadgen/issue_query_controller.h b/loadgen/issue_query_controller.h index 5668c574ed..bedf4301ab 100644 --- a/loadgen/issue_query_controller.h +++ b/loadgen/issue_query_controller.h @@ -110,6 +110,8 @@ class QueryMetadata { /// \brief Set a coalesced query back to its original state. 
  void Decoalesce();
 
+  std::vector<QuerySampleIndex> GetSampleIndices();
+
  public:
   std::vector<QuerySample> query_to_send;
   const std::chrono::nanoseconds scheduled_delta;
diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc
index ce97fd590f..81bd357c67 100644
--- a/loadgen/loadgen.cc
+++ b/loadgen/loadgen.cc
@@ -417,7 +417,7 @@ std::vector<QueryMetadata> GenerateQueries(
     } else if (settings.use_grouped_qsl) {
       g = grouped_sample_distribution(sample_rng);
-      group_size = qsl->GroupSize(qsl->GroupOf(groups_first[g]));
+      group_size = qsl->GroupSize(qsl->GroupOf(loaded_samples[groups_first[g]]));
     } else {
       for (auto& s : samples) {
         s = loaded_samples[settings.performance_issue_unique
@@ -639,6 +639,19 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
                                    queries[i].all_samples_done_time);
     }
   }
+  std::vector<size_t> group_sizes;
+  std::vector<QuerySampleIndex> sample_index;
+  if (settings.use_grouped_qsl){
+    for (size_t i = 0; i < queries.size(); i++){
+      for (auto s: queries[i].GetSampleIndices()){
+        sample_index.push_back(s);
+      }
+    }
+  }
+
+  for (size_t i = 0; i < qsl->NumberOfGroups(); i++) {
+    group_sizes.push_back(qsl->GroupSize(i));
+  }
 
   return PerformanceResult{
       std::move(sample_latencies),
@@ -649,7 +662,10 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
       final_query_issued_time,
       final_query_all_samples_done_time,
       TokenPerformanceResults{first_token_latencies, time_per_output_token_arr,
-                              tokens_per_sample}};
+                              tokens_per_sample},
+      std::move(group_sizes),
+      std::move(sample_index)
+  };
 }
 
 void LoadSamplesToRam(QuerySampleLibrary* qsl,
diff --git a/loadgen/results.cc b/loadgen/results.cc
index 5e940793e3..90f8b9dc2b 100644
--- a/loadgen/results.cc
+++ b/loadgen/results.cc
@@ -146,6 +146,50 @@ void PerformanceSummary::ProcessTokenLatencies() {
   }
 }
 
+void PerformanceSummary::ProcessGroupLatencies(){
+  if (pr.sample_latencies.empty() || pr.group_sizes.empty() ||
+      (!settings.use_grouped_qsl) || (group_latencies_processed)) {
+    return;
+  }
+  sample_count = pr.sample_latencies.size();
+  std::vector<size_t> group_initial_idx;
+  std::vector<QuerySampleLatency> group_latencies;
+  size_t acum_group_idx = 0;
+
+  for(size_t i = 0; i < pr.group_sizes.size(); i++){
+    group_initial_idx.push_back(acum_group_idx);
+    acum_group_idx += pr.group_sizes[i];
+  }
+  size_t i = 0;
+  QuerySampleLatency accumulated_sample_latency = 0;
+
+  while (i < pr.sample_index.size()) {
+    auto sample_index = pr.sample_index[i];
+    auto low = std::lower_bound(group_initial_idx.begin(),
+                                group_initial_idx.end(), sample_index);
+    size_t idx = low - group_initial_idx.begin();
+    if (group_initial_idx[idx] == sample_index){
+      group_count++;
+      QuerySampleLatency q = 0;
+      for (size_t j = 0; j < pr.group_sizes[idx]; j++){
+        q += pr.sample_latencies[i + j];
+      }
+      group_latencies.push_back(q);
+      accumulated_sample_latency += q;
+      i += pr.group_sizes[idx];
+    } else {
+      // Not at a group boundary: the remaining trace is not group-aligned,
+      // so stop accumulating.
+      i = pr.sample_index.size();
+    }
+  }
+  std::sort(group_latencies.begin(), group_latencies.end());
+  group_latency_min = group_latencies.front();
+  group_latency_max = group_latencies.back();
+  group_latency_mean = accumulated_sample_latency / group_count;
+
+  for (auto& lp : group_latency_percentiles) {
+    lp.query_latency = group_latencies[group_count * lp.percentile];
+  }
+  group_latencies_processed = true;
+}
+
 bool PerformanceSummary::EarlyStopping(
     std::string* recommendation, int64_t queries_issued,
     std::vector<QuerySampleLatency>* sample_latencies,
@@ -380,6 +424,9 @@ bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) {
 }
 
 void PerformanceSummary::LogSummary(AsyncSummary& summary) {
+  if (settings.use_grouped_qsl) {
+    ProcessGroupLatencies();
+  }
   ProcessLatencies();
 
   summary(
@@ -480,6 +527,15 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) {
     }
   }
 
+  if (settings.use_grouped_qsl) {
+    double gps_as_completed =
+        group_count / pr.final_query_all_samples_done_time;
+    summary("Groups per second: ", group_count / pr.max_latency);
+    summary("Completed groups per second: ",
+            DoubleToString(gps_as_completed));
+
+  }
+
   std::string min_duration_recommendation;
   std::string perf_constraints_recommendation;
   std::string early_stopping_recommendation;
@@ -630,6 +686,17 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) {
     }
   }
 
+  if (settings.use_grouped_qsl) {
+    summary("Min group latency (ns) : ", group_latency_min);
+    summary("Max group latency (ns) : ", group_latency_max);
+    summary("Mean group latency (ns) : ", group_latency_mean);
+    for (auto& lp : group_latency_percentiles) {
+      summary(DoubleToString(lp.percentile * 100) +
+                  " group percentile latency (ns) : ",
+              lp.query_latency);
+    }
+  }
+
   summary(
       "\n"
       "================================================\n"
@@ -640,6 +707,9 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) {
 
 void PerformanceSummary::LogDetail(AsyncDetail& detail) {
 #if USE_NEW_LOGGING_FORMAT
+  if (settings.use_grouped_qsl) {
+    ProcessGroupLatencies();
+  }
   ProcessLatencies();
 
   // General validity checking
@@ -848,8 +918,23 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
         break;
       }
     }
-#endif
-}
+
+  if (settings.use_grouped_qsl) {
+    MLPERF_LOG(detail, "result_group_min_latency_ns",
+               group_latency_min);
+    MLPERF_LOG(detail, "result_group_max_latency_ns",
+               group_latency_max);
+    MLPERF_LOG(detail, "result_group_mean_latency_ns",
+               group_latency_mean);
+    for (auto& lp : group_latency_percentiles) {
+      MLPERF_LOG(detail,
+                 "result_group_" + DoubleToString(lp.percentile * 100) +
+                     "_percentile_latency_ns",
+                 lp.query_latency);
+    }
+  }
+#endif
+}
 }  // namespace loadgen
 }  // namespace mlperf
diff --git a/loadgen/results.h b/loadgen/results.h
index 6befea2c04..da597a3fce 100644
--- a/loadgen/results.h
+++ b/loadgen/results.h
@@ -44,6 +44,8 @@ struct PerformanceResult {
   double final_query_issued_time;            // seconds from start.
   double final_query_all_samples_done_time;  // seconds from start.
   TokenPerformanceResults token_results;
+  std::vector<size_t> group_sizes;
+  std::vector<QuerySampleIndex> sample_index;
 };
 
@@ -99,6 +101,15 @@ struct PerformanceSummary {
   PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95},
                                          {.97}, {.99}, {.999}};
 
+  // Set by ProcessGroupLatencies.
+  size_t group_count = 0;
+  bool group_latencies_processed = false;
+  QuerySampleLatency group_latency_min = 0;
+  QuerySampleLatency group_latency_max = 0;
+  QuerySampleLatency group_latency_mean = 0;
+
+  PercentileEntry group_latency_percentiles[6] = {{.50}, {.90}, {.95},
+                                                  {.97}, {.99}, {.999}};
 #if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
   // MSVC complains if there is no explicit constructor.
// (target_latency_percentile above depends on construction with settings) @@ -109,6 +120,7 @@ struct PerformanceSummary { #endif void ProcessLatencies(); void ProcessTokenLatencies(); + void ProcessGroupLatencies(); bool MinDurationMet(std::string* recommendation); bool EarlyStopping(std::string* recommendation, int64_t queries_issued, diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 0b59563e0d..21bde70a1f 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -342,6 +342,9 @@ void LogRequestedTestSettings(const TestSettings &s) { s.performance_sample_count_override); MLPERF_LOG(detail, "requested_sample_concatenate_permutation", s.sample_concatenate_permutation); + MLPERF_LOG(detail, "requested_server_constant_gen", + s.server_constant_gen); + MLPERF_LOG(detail, "requested_use_grouped_qsl", s.use_grouped_qsl); // Token latencies specific values if (s.use_token_latencies) { MLPERF_LOG(detail, "requested_use_token_latencies", @@ -458,6 +461,7 @@ void TestSettingsInternal::LogEffectiveSettings() const { s.sample_concatenate_permutation); MLPERF_LOG(detail, "effective_server_constant_gen", s.server_constant_gen); + MLPERF_LOG(detail, "effective_use_grouped_qsl", s.use_grouped_qsl); #else detail(""); detail("Effective Settings:"); @@ -531,7 +535,6 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { "samples_per_query value"); } } - } // namespace loadgen int TestSettings::FromConfig(const std::string &path, const std::string &model, diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 3d54cf4c98..762f26beb9 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "logging.h" #include "test_settings.h" +#include "query_sample_library.h" namespace mlperf { From 1f4ae664c52e2d0ecaf69b90f85e76da314a67f4 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Mon, 31 Mar 2025 18:30:09 -0500 Subject: [PATCH 5/7] Quick fixes for automotive setup --- loadgen/bindings/python_api.cc | 14 +++++++------- loadgen/demos/py_demo_grouped_qsl.py | 2 +- loadgen/test_settings_internal.cc | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 55c5747b67..692966b387 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -142,7 +142,7 @@ class GroupedQuerySampleLibraryTrampoline : public QuerySampleLibrary { size_t performance_sample_count, LoadSamplesToRamCallback load_samples_to_ram_cb, UnloadSamplesFromRamCallback unload_samples_from_ram_cb, - pybind11::array_t& group_sizes) + pybind11::array_t group_sizes) : name_(std::move(name)), performance_sample_count_(performance_sample_count), load_samples_to_ram_cb_(load_samples_to_ram_cb), @@ -290,7 +290,7 @@ void DestroyQDL(uintptr_t qdl) { } uintptr_t ConstructGroupedQSL( - pybind11::array_t& group_sizes, + pybind11::array_t group_sizes, size_t performance_sample_count, LoadSamplesToRamCallback load_samples_to_ram_cb, UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { @@ -300,10 +300,10 @@ uintptr_t ConstructGroupedQSL( return reinterpret_cast(qsl); } -void DestroyGroupedQSL(uintptr_t qdl) { - QueryDispatchLibraryTrampoline* qdl_cast = - reinterpret_cast(qdl); - delete qdl_cast; +void DestroyGroupedQSL(uintptr_t qsl) { + GroupedQuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; } void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, @@ -553,7 +553,7 @@ PYBIND11_MODULE(mlperf_loadgen, m) { m.def("ConstructGroupedQSL", &py::ConstructGroupedQSL, "Construct grouped query sample library."); - m.def("DestroyGroupedQSL", &py::DestroyQSL, + m.def("DestroyGroupedQSL", &py::DestroyGroupedQSL, "Destroy the object created by ConstructGroupedQSL."); m.def("StartTest", &py::StartTest, diff --git a/loadgen/demos/py_demo_grouped_qsl.py b/loadgen/demos/py_demo_grouped_qsl.py index 02b98c486f..0f2d85914d 100644 --- a/loadgen/demos/py_demo_grouped_qsl.py +++ b/loadgen/demos/py_demo_grouped_qsl.py @@ -74,7 +74,7 @@ def main(argv): # 1024, 128, load_samples_to_ram, unload_samples_from_ram # ) mlperf_loadgen.StartTestWithGroupedQSL(sut, qsl, settings, "") - mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroyGroupedQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 21bde70a1f..daf121618a 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -509,6 +509,7 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { summary("ttft_latency (ns): ", server_ttft_latency); summary("tpot_latency (ns): ", server_tpot_latency); } + summary("target_latency_percentile : ", target_latency_percentile); summary("max_async_queries : ", max_async_queries); summary("min_duration (ms): ", min_duration.count()); summary("max_duration (ms): ", max_duration.count()); From 2664da2e949197e45d73eb917475fc6137f89fa7 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 2 Apr 2025 17:16:37 -0500 Subject: [PATCH 6/7] Add grouped QSL c API --- loadgen/bindings/c_api.cc | 89 ++++++++++++++++++++++++++++++++++ loadgen/bindings/python_api.cc | 2 +- 2 files 
changed, 90 insertions(+), 1 deletion(-)

diff --git a/loadgen/bindings/c_api.cc b/loadgen/bindings/c_api.cc
index 86f03cfa0c..4361ae27ca 100644
--- a/loadgen/bindings/c_api.cc
+++ b/loadgen/bindings/c_api.cc
@@ -129,6 +129,83 @@ void DestroyQSL(void* qsl) {
   delete qsl_cast;
 }
 
+namespace {
+
+// Forwards QuerySampleLibrary calls to relevant callbacks, keeping
+// per-group bookkeeping for grouped QSLs.
+class GroupedQuerySampleLibraryTrampoline : public QuerySampleLibrary {
+ public:
+  GroupedQuerySampleLibraryTrampoline(
+      ClientData client_data, std::string name,
+      size_t performance_sample_count,
+      LoadSamplesToRamCallback load_samples_to_ram_cb,
+      UnloadSamplesFromRamCallback unload_samples_from_ram_cb,
+      std::vector<size_t>& group_sizes)
+      : name_(std::move(name)),
+        client_data_(client_data),
+        performance_sample_count_(performance_sample_count),
+        load_samples_to_ram_cb_(load_samples_to_ram_cb),
+        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {
+    total_sample_count_ = 0;
+
+    for(ssize_t i = 0; i < group_sizes.size(); i++){
+      group_sizes_.push_back(group_sizes[i]);
+      total_sample_count_ += group_sizes[i];
+      for(size_t j = 0; j < group_sizes[i]; j++){
+        group_idx_.push_back(i);
+      }
+    }
+  }
+  ~GroupedQuerySampleLibraryTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+  size_t TotalSampleCount() override { return total_sample_count_; }
+  size_t PerformanceSampleCount() override { return performance_sample_count_; }
+  size_t GroupSize(size_t i) override { return group_sizes_[i]; }
+  size_t GroupOf(size_t i) override { return group_idx_[i]; }
+  size_t NumberOfGroups() override { return group_sizes_.size(); }
+
+  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
+    (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size());
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) override {
+    (*unload_samples_from_ram_cb_)(client_data_, samples.data(),
+                                   samples.size());
+  }
+
+ private:
+  std::string name_;
+  ClientData client_data_;
+  std::vector<size_t> group_sizes_;
+  std::vector<size_t> group_idx_;
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  LoadSamplesToRamCallback load_samples_to_ram_cb_;
+  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
+};
+
+}  // namespace
+
+void* ConstructGroupedQSL(ClientData client_data, const char* name,
+                          size_t name_length, size_t total_sample_count,
+                          size_t performance_sample_count,
+                          LoadSamplesToRamCallback load_samples_to_ram_cb,
+                          UnloadSamplesFromRamCallback unload_samples_from_ram_cb,
+                          std::vector<size_t>& group_sizes) {
+  GroupedQuerySampleLibraryTrampoline* qsl = new GroupedQuerySampleLibraryTrampoline(
+      client_data, std::string(name, name_length), total_sample_count,
+      performance_sample_count, load_samples_to_ram_cb,
+      unload_samples_from_ram_cb, group_sizes);
+  return reinterpret_cast<void*>(qsl);
+}
+
+void DestroyGroupedQSL(void* qsl) {
+  GroupedQuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<GroupedQuerySampleLibraryTrampoline*>(qsl);
+  delete qsl_cast;
+}
+
 // mlperf::c::StartTest just forwards to mlperf::StartTest after doing the
 // proper cast.
void StartTest(void* sut, void* qsl, const TestSettings& settings, @@ -142,6 +219,18 @@ void StartTest(void* sut, void* qsl, const TestSettings& settings, audit_config_filename); } +void StartTestWithGroupedQSL(void* sut, void* qsl, const TestSettings& settings, + const std::string& audit_config_filename = "audit.config") { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + GroupedQuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + assert(settings.use_grouped_qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings, + audit_config_filename); +} + void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count) { mlperf::QuerySamplesComplete(responses, response_count); diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 692966b387..2db6004d74 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -340,7 +340,7 @@ void StartTestWithGroupedQSL( GroupedQuerySampleLibraryTrampoline* qsl_cast = reinterpret_cast(qsl); LogSettings default_log_settings; - assert(TestSettings.use_grouped_qsl); + assert(test_settings.use_grouped_qsl); mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings, audit_config_filename); } From c85c58917d33032f69f167e3fc43bb807c6e9160 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 3 Apr 2025 10:18:46 -0500 Subject: [PATCH 7/7] Quick fixes: accuracy mode & C API --- loadgen/bindings/c_api.cc | 5 +++-- loadgen/loadgen.cc | 12 +++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/loadgen/bindings/c_api.cc b/loadgen/bindings/c_api.cc index 4361ae27ca..e533330bfd 100644 --- a/loadgen/bindings/c_api.cc +++ b/loadgen/bindings/c_api.cc @@ -13,6 +13,7 @@ limitations under the License. 
#include "c_api.h" #include +#include #include "../loadgen.h" #include "../query_sample.h" @@ -148,7 +149,7 @@ class GroupedQuerySampleLibraryTrampoline : public QuerySampleLibrary { total_sample_count_ = 0; - for(ssize_t i = 0; i < group_sizes.size(); i++){ + for(size_t i = 0; i < group_sizes.size(); i++){ group_sizes_.push_back(group_sizes[i]); total_sample_count_ += group_sizes[i]; for(size_t j = 0; j < group_sizes[i]; j++){ @@ -193,7 +194,7 @@ void* ConstructGroupedQSL(ClientData client_data, const char* name, size_t name_ UnloadSamplesFromRamCallback unload_samples_from_ram_cb, std::vector& group_sizes) { GroupedQuerySampleLibraryTrampoline* qsl = new GroupedQuerySampleLibraryTrampoline( - client_data, std::string(name, name_length), total_sample_count, + client_data, std::string(name, name_length), performance_sample_count, load_samples_to_ram_cb, unload_samples_from_ram_cb, group_sizes); return reinterpret_cast(qsl); diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 81bd357c67..3ef486ee2e 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -318,7 +318,6 @@ std::vector GenerateQueries( auto sample_distribution_equal_issue = SampleDistributionEqualIssue( min_queries, loaded_samples.size(), &sample_rng); - TestScenario temp_scenario = scenario; auto schedule_distribution = ScheduleDistribution(settings.target_qps); auto schedule_constant_distribution = ScheduleConstantDistribution(settings.target_qps); @@ -751,12 +750,11 @@ std::vector GenerateLoadableSets( size_t number_of_groups = qsl->NumberOfGroups(); for (size_t i = 0; i < number_of_groups; i++) { size_t group_size = qsl->GroupSize(groupIdx[idx]); - if (loadable_set.size() + group_size < set_size) { - for (size_t j = 0; j < group_size; j++) { - loadable_set.push_back(samples[idx]); - idx++; - } - } else { + for (size_t j = 0; j < group_size; j++) { + loadable_set.push_back(samples[idx]); + idx++; + } + if (loadable_set.size() >= set_size) { result.push_back({std::move(loadable_set), loadable_set.size()}); loadable_set.clear(); loadable_set.reserve(set_size + set_padding);