Add new option to disable cpu sync for tensors (#8490)

* add options to disable cpu copy back * null check properties * only affect gpu outputs * change name to DisableTensorCpuSync * slight refactoring * Globally enable ms-experimental ops * change meaning of ms_experimental to mean *all* ms_experimental ops. Some experimental ops will still be enabled globally without this flag like audio ops. * remove changes incorrectly merged * bad merge * add test Co-authored-by: Sheil Kumar <[email protected]>
microsoft · Aug 27, 2021 · 775f862 · 775f862
1 parent 6a477ac
commit 775f862
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 5 deletions.
diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h
@@ -336,6 +336,17 @@ struct TensorBase : TBase {
     return S_OK;
   }
 
+  bool GetDisableTensorCpuSyncFromMetadata(const wfc::IPropertySet& properties) {  // Reads the optional "DisableTensorCpuSync" boolean from the given property set; returns false when it cannot be found.
+    if (properties != nullptr && properties.HasKey(L"DisableTensorCpuSync")) {  // A null property set or a missing key falls through to the default below.
+      if (auto disableTensorCpuSyncInspectable = properties.Lookup(L"DisableTensorCpuSync")) {  // A key that maps to a null value is treated the same as a missing key.
+        auto disableTensorCpuSyncValue = disableTensorCpuSyncInspectable.as<wf::IPropertyValue>();  // NOTE(review): presumably fails (throws) if the stored value is not an IPropertyValue holding a Boolean - confirm.
+        return disableTensorCpuSyncValue.GetBoolean();
+      }
+    }
+
+    return false;  // Default: the flag is considered not set.
+  }
+
   // ILotusValueProviderPrivate::UpdateSourceResourceData
   STDMETHOD(UpdateSourceResourceData)
   (BindingContext& context, IValue* value) {
@@ -350,13 +361,17 @@ struct TensorBase : TBase {
     // get the shape
     RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!");
 
+    bool is_cpu;
+    bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu;
+    bool disableTensorCpuSyncProperty = GetDisableTensorCpuSyncFromMetadata(context.properties);
+    bool disableCpuSync = !isCpuOutput && disableTensorCpuSyncProperty;
+
     // make sure we always have a CPU resource
-    if (CpuTensor() == nullptr) {
+    if (!disableCpuSync && CpuTensor() == nullptr) {
       CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
     }
 
-    bool is_cpu;
-    if (SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu) {
+    if (isCpuOutput) {
       // Get the data pointer and size
       auto buffer = CpuTensor()->buffer(false);
 
@@ -371,7 +386,7 @@ struct TensorBase : TBase {
         // In that case the underlying buffers will not match the engine output, and they need to be flushed.
         CpuTensor()->flush();
       }
-    } else {
+    } else if (!disableCpuSync) {
       // If we got a gpu resource, we should move the data to the cpu so accessors can retrieve the data.
       // We don't need to copy the engine provided dx resource into a local copy since we always preallocate gpu
       // resources for tensors. Therefore we are certain that the returned dxresource is the same as the one we passed in

diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
@@ -729,7 +729,9 @@ static void Scenario21RunModel2ChainZ() {
   std::vector<int64_t> shape = {1, 3, 720, 720};
   auto outputValue = TensorFloat::Create(shape);  //   FeatureValueFromFeatureValueDescriptor(input, nullptr);
                                                   // now bind the(empty) output so we have a marker to chain with
-  binding1.Bind(output.Name(), outputValue);
+  PropertySet outputBindProperties;
+  outputBindProperties.Insert(L"DisableTensorCpuSync", wf::PropertyValue::CreateBoolean(true));
+  binding1.Bind(output.Name(), outputValue, outputBindProperties);
   // and leave the output unbound on the second model, we will fetch it later
   // run both models async
   WINML_EXPECT_NO_THROW(session1.EvaluateAsync(binding1, L""));