From 775f862067b305f8dcba3ff1d65d47c15119c30d Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Fri, 27 Aug 2021 13:29:52 -0700 Subject: [PATCH] Add new option to disable cpu sync for tensors (#8490) * add options to disable cpu copy back * null check properties * only affect gpu outputs * change name to disabletensorcpusync * slight refactoring * Globally enable ms-experimental ops * change meaning of ms_experimental to mean *all* ms_experimental ops. Some experimental ops will still be enabled globally without this flag like audio ops. * remove changes incorrectly merged * bad merge * add test Co-authored-by: Sheil Kumar --- winml/lib/Api/impl/TensorBase.h | 23 +++++++++++++++---- .../cppwinrt/scenariotestscppwinrt.cpp | 4 +++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h index 181f3f68d49c4..318b0ab4bc1b2 100644 --- a/winml/lib/Api/impl/TensorBase.h +++ b/winml/lib/Api/impl/TensorBase.h @@ -336,6 +336,17 @@ struct TensorBase : TBase { return S_OK; } + bool GetDisableTensorCpuSyncFromMetadata(const wfc::IPropertySet& properties) { + if (properties != nullptr && properties.HasKey(L"DisableTensorCpuSync")) { + if (auto disableTensorCpuSyncInspectable = properties.Lookup(L"DisableTensorCpuSync")) { + auto disableTensorCpuSyncValue = disableTensorCpuSyncInspectable.as(); + return disableTensorCpuSyncValue.GetBoolean(); + } + } + + return false; + } + // ILotusValueProviderPrivate::UpdateSourceResourceData STDMETHOD(UpdateSourceResourceData) (BindingContext& context, IValue* value) { @@ -350,13 +361,17 @@ struct TensorBase : TBase { // get the shape RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!"); + bool is_cpu; + bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu; + bool disableTensorCpuSyncProperty = GetDisableTensorCpuSyncFromMetadata(context.properties); + bool disableCpuSync = !isCpuOutput && 
disableTensorCpuSyncProperty; + // make sure we always have a CPU resource - if (CpuTensor() == nullptr) { + if (!disableCpuSync && CpuTensor() == nullptr) { CpuTensor() = std::make_shared<_winml::Tensor>(shape_); } - bool is_cpu; - if (SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu) { + if (isCpuOutput) { // Get the data pointer and size auto buffer = CpuTensor()->buffer(false); @@ -371,7 +386,7 @@ struct TensorBase : TBase { // In that case the underlying buffers will not match the engine output, and they need to be flushed. CpuTensor()->flush(); } - } else { + } else if (!disableCpuSync) { // If we got a gpu resource, we should move the data to the cpu so accessors can retrieve the data. // We don't need to copy the engine provided dx resource into a local copy since we always preallocate gpu // resources for tensors. Therefore we are certain that the returned dxresource is the same as the one we passed in diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index 418bfe865c7fd..96fae01ff12f7 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -729,7 +729,9 @@ static void Scenario21RunModel2ChainZ() { std::vector shape = {1, 3, 720, 720}; auto outputValue = TensorFloat::Create(shape); // FeatureValueFromFeatureValueDescriptor(input, nullptr); // now bind the(empty) output so we have a marker to chain with - binding1.Bind(output.Name(), outputValue); + PropertySet outputBindProperties; + outputBindProperties.Insert(L"DisableTensorCpuSync", wf::PropertyValue::CreateBoolean(true)); + binding1.Bind(output.Name(), outputValue, outputBindProperties); // and leave the output unbound on the second model, we will fetch it later // run both models async WINML_EXPECT_NO_THROW(session1.EvaluateAsync(binding1, L""));