From 775f862067b305f8dcba3ff1d65d47c15119c30d Mon Sep 17 00:00:00 2001
From: Sheil Kumar <smk2007@gmail.com>
Date: Fri, 27 Aug 2021 13:29:52 -0700
Subject: [PATCH] Add new option to disable cpu sync for tensors  (#8490)

* add options to disable cpu copy back

* null check properties

* only affect gpu outputs

* change name to DisableTensorCpuSync

* slight refactoring

* Globally enable ms-experimental ops

* change meaning of ms_experimental to mean *all* ms_experimental ops. Some experimental ops, such as the audio ops, will still be enabled globally without this flag.

* remove changes incorrectly merged

* bad merge

* add test

Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
---
 winml/lib/Api/impl/TensorBase.h               | 23 +++++++++++++++----
 .../cppwinrt/scenariotestscppwinrt.cpp        |  4 +++-
 2 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h
index 181f3f68d49c4..318b0ab4bc1b2 100644
--- a/winml/lib/Api/impl/TensorBase.h
+++ b/winml/lib/Api/impl/TensorBase.h
@@ -336,6 +336,17 @@ struct TensorBase : TBase {
     return S_OK;
   }
 
+  // Reads the optional "DisableTensorCpuSync" boolean from the bind properties; false when unset.
+  bool GetDisableTensorCpuSyncFromMetadata(const wfc::IPropertySet& properties) {
+    const bool hasEntry = properties != nullptr && properties.HasKey(L"DisableTensorCpuSync");
+    if (!hasEntry) {
+      return false;
+    }
+    // A key that maps to a null value is treated the same as a missing key.
+    const auto boxed = properties.Lookup(L"DisableTensorCpuSync");
+    return boxed ? boxed.as<wf::IPropertyValue>().GetBoolean() : false;
+  }
+
   // ILotusValueProviderPrivate::UpdateSourceResourceData
   STDMETHOD(UpdateSourceResourceData)
   (BindingContext& context, IValue* value) {
@@ -350,13 +361,17 @@ struct TensorBase : TBase {
     // get the shape
     RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!");
 
+    bool is_cpu;
+    bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu;
+    bool disableTensorCpuSyncProperty = GetDisableTensorCpuSyncFromMetadata(context.properties);
+    bool disableCpuSync = !isCpuOutput && disableTensorCpuSyncProperty;
+
     // make sure we always have a CPU resource
-    if (CpuTensor() == nullptr) {
+    if (!disableCpuSync && CpuTensor() == nullptr) {
       CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
     }
 
-    bool is_cpu;
-    if (SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu) {
+    if (isCpuOutput) {
       // Get the data pointer and size
       auto buffer = CpuTensor()->buffer(false);
 
@@ -371,7 +386,7 @@ struct TensorBase : TBase {
         // In that case the underlying buffers will not match the engine output, and they need to be flushed.
         CpuTensor()->flush();
       }
-    } else {
+    } else if (!disableCpuSync) {
       // If we got a gpu resource, we should move the data to the cpu so accessors can retrieve the data.
       // We don't need to copy the engine provided dx resource into a local copy since we always preallocate gpu
       // resources for tensors. Therefore we are certain that the returned dxresource is the same as the one we passed in
diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
index 418bfe865c7fd..96fae01ff12f7 100644
--- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
+++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
@@ -729,7 +729,9 @@ static void Scenario21RunModel2ChainZ() {
   std::vector<int64_t> shape = {1, 3, 720, 720};
   auto outputValue = TensorFloat::Create(shape);  //   FeatureValueFromFeatureValueDescriptor(input, nullptr);
                                                   // now bind the(empty) output so we have a marker to chain with
-  binding1.Bind(output.Name(), outputValue);
+  PropertySet outputBindProperties;
+  outputBindProperties.Insert(L"DisableTensorCpuSync", wf::PropertyValue::CreateBoolean(true));
+  binding1.Bind(output.Name(), outputValue, outputBindProperties);
   // and leave the output unbound on the second model, we will fetch it later
   // run both models async
   WINML_EXPECT_NO_THROW(session1.EvaluateAsync(binding1, L""));