
Commit 534763d

mklimenk authored and preetha-intel committed

Fix model copying for QDQ stripping (#784)

* Reintroduce #768 with a small fix
* Fix model copying with help from microsoft#25761
* Remove unused debug variables

1 parent 5a52271 · commit 534763d

3 files changed: 68 additions, 20 deletions

onnxruntime/core/providers/openvino/backend_manager.cc

Lines changed: 55 additions & 1 deletion

@@ -375,6 +375,56 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }
 
+static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+  for (std::size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+    for (auto& output : node->OutputDefs()) {
+      if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
+  const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
+  return type_proto && type_proto->has_tensor_type() &&
+         (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+          type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+}
+
+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // Check quantized input tensor and optional zero point
+        if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      } else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        const auto& output_defs = node->OutputDefs();
+        if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                 [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                 [[maybe_unused]] const onnxruntime::Node& fused_node) {

@@ -433,6 +483,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
   }
 #endif
 
+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
   if ((session_context_.device_type.find("NPU") != std::string::npos) &&

@@ -446,7 +500,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
   } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
     // Create a copy of the model
     std::unique_ptr<onnxruntime::Model> model;
     Status status = qdq_scales_fix::Transform(subgraph, logger, model);

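The net effect of these hunks: the GPU branch of BackendManager::GetModelProtoFromFusedNode() no longer keys off the enable_ovep_qdq_optimizer option alone, but off what the fused subgraph actually contains. A condensed sketch of the resulting control flow (identifiers as in the diff above; unrelated branches elided, so this is an illustration rather than the full function body):

    // Detect 16-bit Q/DQ up front; the result gates only the GPU path.
    bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);

    if (session_context_.device_type.find("NPU") != std::string::npos /* && stripping enabled */) {
      // NPU keeps the existing QDQ-stripping path (unchanged by this commit).
    } else if (session_context_.device_type.find("GPU") != std::string::npos &&
               is_qdq_graph_uint16_or_int16) {
      // GPU now applies the scales fix only when uint16/int16 Q/DQ nodes are
      // actually present, instead of whenever the optimizer flag was set.
      std::unique_ptr<onnxruntime::Model> model;
      Status status = qdq_scales_fix::Transform(subgraph, logger, model);
      ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
    }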
onnxruntime/core/providers/openvino/ov_versions/data_ops.cc

Lines changed: 6 additions & 3 deletions

@@ -555,6 +555,12 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) {
     return false;
   }
 
+  auto dtype = type_proto->tensor_type().elem_type();
+  // Enable bfloat16 -> float16 on-the-fly conversion
+  if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16 ||
+      dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 ||
+      dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)
+    return true;
   if (is_initializer) {
     auto dtype = type_proto->tensor_type().elem_type();
     for (auto const& var : supported_types_initializer_) {

@@ -609,9 +615,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) {
           (var.second == dtype)) {
         return true;
       }
-      // experimentally for GPU and qdq stripping mode allow int16 types
-      if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16))
-        return true;
     }
 #ifndef NDEBUG
     if (openvino_ep::backend_utils::IsDebugEnabled()) {

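For readers tracing type_is_supported(): the added early return moves bfloat16/int16/uint16 acceptance ahead of the per-device supported-type tables, replacing the removed rule that was gated on npu_qdq_optimizer_enabled_. A minimal standalone restatement of the new rule (the helper name is hypothetical; the enum values are the real ONNX TensorProto_DataType constants):

    // Hypothetical free-function version of the early-accept rule added above.
    static bool IsAlwaysAcceptedElemType(int32_t dtype) {
      return dtype == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16 ||  // converted to fp16 on the fly
             dtype == ONNX_NAMESPACE::TensorProto_DataType_INT16 ||     // 16-bit QDQ, handled downstream
             dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT16;
    }

One consequence worth noting: the acceptance is no longer tied to any optimizer flag, so these element types are reported as supported regardless of device type.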
onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp

Lines changed: 7 additions & 16 deletions

@@ -3,6 +3,8 @@
 
 #include "qdq_scales_fix.h"
 #include "core/providers/openvino/ov_protobuf_utils.h"
+#include "core/framework/ort_value.h"
+#include "core/framework/float16.h"
 
 #include <fstream>
 #include <list>

@@ -903,22 +905,11 @@ Status copy_model(const GraphViewer& src_graph_viewer,
   }
 
   for (auto& [name, tensor_proto] : src_graph.GetAllInitializedTensors()) {
-    dst_graph.AddInitializedTensor(*tensor_proto);
-  }
-
-  for (auto node_arg : src_graph.GetInputsIncludingInitializers()) {
-    auto check_inputs = [node_arg](auto input_node_arg) {
-      return input_node_arg->Name() == node_arg->Name();
-    };
-    if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end())
-      continue;
-
-    auto src_tensor_proto = src_graph.GetConstantInitializer(node_arg->Name(), true);
-    if (src_tensor_proto) {
-      auto dst_tensor_proto = onnx::TensorProto::Create();
-      dst_tensor_proto->copy_from(src_tensor_proto);
-      dst_graph.AddInitializedTensor(*dst_tensor_proto);
-    }
+    auto ort_value = OrtValue();
+    if (src_graph.GetOrtValueInitializer(name, ort_value))
+      ORT_RETURN_IF_ERROR(dst_graph.AddInitializedOrtValue(*tensor_proto, ort_value));
+    else
+      dst_graph.AddInitializedTensor(*tensor_proto);
   }
 
   ORT_RETURN_IF_ERROR(dst_graph.Resolve());

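This last hunk is the actual model-copying fix named in the commit title. The old code copied every initializer as a bare TensorProto and then re-scanned graph inputs for constant initializers; per the commit message, the new loop follows microsoft#25761, which allows initializer data to live in an OrtValue rather than inline in the proto. An annotated sketch of the new loop in copy_model() (names as in the diff; the comments are interpretation, not source comments):

    for (auto& [name, tensor_proto] : src_graph.GetAllInitializedTensors()) {
      auto ort_value = OrtValue();
      if (src_graph.GetOrtValueInitializer(name, ort_value)) {
        // Initializer data is held in an OrtValue; copy the proto and the
        // value together so dst_graph does not reference buffers it does not own.
        ORT_RETURN_IF_ERROR(dst_graph.AddInitializedOrtValue(*tensor_proto, ort_value));
      } else {
        // Classic case: the tensor data is embedded in the TensorProto itself.
        dst_graph.AddInitializedTensor(*tensor_proto);
      }
    }

The fallback branch keeps older in-proto initializers working, so the copy is correct whichever representation the source graph uses.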