From 9fa0d8392a35e8fa0a7a94bd7e14d6c438c44e5b Mon Sep 17 00:00:00 2001
From: Suffian Khan <sukha@microsoft.com>
Date: Sat, 21 Aug 2021 00:40:12 -0700
Subject: [PATCH] Extend node debugging utilities to push tensors and node
 placement to SQL database (#8672)

* adding support for tracing to sqldb instead of files

* use compiled statements

* script to pull tensors from db

* link sqlite3

* remove node info redundant with onnx graph

* addressing PR comments

* address PR comments and include program counter

* third party notice

* use find_pacakge

* add to cgmanifests.json

* address thread safety and add pid suffix

* build fi

* python script to select on devicetype

* remove unpopulated and redundant Shape and Type fields

* comment

* comment

* PR comments

* add graph execution counter to session state

* move increment to inference session

* std::endl to \n

* ifdef on graph execution counter

* add ifdef to inference session

* move DEBUG_NODE_INPUTS_OUTPUTS to CMakeLists.txt
---
 ThirdPartyNotices.txt                         |  28 ++
 cgmanifests/cgmanifest.json                   |  12 +-
 cmake/CMakeLists.txt                          |   6 +
 cmake/onnxruntime_framework.cmake             |  12 +-
 cmake/onnxruntime_session.cmake               |   1 +
 .../debug_node_inputs_outputs_utils.cc        | 319 +++++++++++++++++-
 .../debug_node_inputs_outputs_utils.h         |  56 ++-
 .../framework/orttraining_partial_executor.cc |  10 +-
 .../core/framework/sequential_executor.cc     |  11 +-
 onnxruntime/core/framework/session_state.h    |  16 +
 onnxruntime/core/session/inference_session.cc |   8 +
 .../debug_node_inputs_outputs_utils_test.cc   |   2 +-
 orttraining/tools/scripts/sqldb_to_tensors.py |  19 ++
 13 files changed, 463 insertions(+), 37 deletions(-)
 create mode 100644 orttraining/tools/scripts/sqldb_to_tensors.py

diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt
index c3417695adb03..f429f35751963 100644
--- a/ThirdPartyNotices.txt
+++ b/ThirdPartyNotices.txt
@@ -4747,3 +4747,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+_____
+
+SQLite Is Public Domain
+
+All of the code and documentation in SQLite has been dedicated to the public
+domain by the authors. All code authors, and representatives of the companies
+they work for, have signed affidavits dedicating their contributions to the
+public domain and originals of those signed affidavits are stored in a firesafe
+at the main offices of Hwaci. Anyone is free to copy, modify, publish, use,
+compile, sell, or distribute the original SQLite code, either in source code
+form or as a compiled binary, for any purpose, commercial or non-commercial,
+and by any means.
+
+The previous paragraph applies to the deliverable code and documentation in
+SQLite - those parts of the SQLite library that you actually bundle and ship
+with a larger application. Some scripts used as part of the build process (for
+example the "configure" scripts generated by autoconf) might fall under other
+open-source licenses. Nothing from these build scripts ever reaches the final
+deliverable SQLite library, however, and so the licenses associated with those
+scripts should not be a factor in assessing your rights to copy and use the
+SQLite library.
+
+All of the deliverable code in SQLite has been written from scratch. No code
+has been taken from other projects or from the open internet. Every line of
+code can be traced back to its original author, and all of those authors have
+public domain dedications on file. So the SQLite code base is clean and is
+uncontaminated with licensed code from other projects.
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index e4cf9747c0427..c8ff5d7c74ae5 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -469,7 +469,17 @@
             },
             "comments": "dlpack"
          }
+      },
+      {
+         "component": {
+            "Type": "other",
+            "Other": {
+               "Name": "SQLite3",
+               "Version": "3.22.0",
+               "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
+            }
+         }
       }
    ],
    "Version": 1
-}
\ No newline at end of file
+}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index e0e09afbfcd9f..541604282bb03 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -83,6 +83,7 @@ cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instea
 option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
 option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
 option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
+cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)
 option(onnxruntime_USE_DML "Build with DirectML support" OFF)
 option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
 option(onnxruntime_USE_WINML "Build with WinML support" OFF)
@@ -1835,3 +1836,8 @@ if (onnxruntime_BUILD_OPSCHEMA_LIB AND onnxruntime_ENABLE_TRAINING)
   # opschema library requires training ops as well
   include(onnxruntime_opschema_lib.cmake)
 endif()
+
+if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
+  add_compile_definitions(DEBUG_NODE_INPUTS_OUTPUTS)
+endif()
+
diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake
index 6bad01cc0b5fa..dee3ff2bdae34 100644
--- a/cmake/onnxruntime_framework.cmake
+++ b/cmake/onnxruntime_framework.cmake
@@ -75,9 +75,15 @@ if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_BUI
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
 endif()
 
-if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
-  target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS)
+if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
+  find_package (SQLite3)
+  if (SQLITE3_FOUND)
+    include_directories(${SQLite3_INCLUDE_DIR})
+    target_link_libraries (onnxruntime_framework ${SQLite3_LIBRARY})
+  else()
+    message( FATAL_ERROR "Could not locate SQLite3 package." )
+  endif (SQLITE3_FOUND)
+  target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
 endif()
 
-
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core)
diff --git a/cmake/onnxruntime_session.cmake b/cmake/onnxruntime_session.cmake
index edbebcc6874ca..eeedc004ee678 100644
--- a/cmake/onnxruntime_session.cmake
+++ b/cmake/onnxruntime_session.cmake
@@ -40,6 +40,7 @@ endif()
 if (onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
   target_include_directories(onnxruntime_session PRIVATE ${ORTTRAINING_ROOT})
 endif()
+
 if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
   onnxruntime_add_include_to_target(onnxruntime_session Python::Module) 
 endif()
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
index a1e32b4856662..a48032d4fbab5 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
@@ -7,6 +7,11 @@
 
 #include <iomanip>
 #include <cctype>
+#include <string>
+
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
+#include <sqlite3.h>
+#endif
 
 #include "core/common/path_utils.h"
 #include "core/framework/tensorprotoutils.h"
@@ -18,6 +23,15 @@ namespace utils {
 
 namespace {
 
+struct TensorMetadata {
+
+  std::string name;
+  std::string producer;
+  std::string consumer;
+  std::string device_type;
+  size_t step;
+};
+
 bool FilterNode(const NodeDumpOptions& dump_options, const Node& node) {
   auto match_pattern =
       [](const std::string& value, const std::string& delimited_patterns) {
@@ -117,17 +131,230 @@ void DumpTensorToFile(const Tensor& tensor, const std::string& tensor_name, cons
   ORT_THROW_IF_ERROR(Env::Default().FileClose(output_fd));
 }
 
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
+sqlite3* SqliteConnection() {
+
+  static thread_local std::unique_ptr<sqlite3, decltype(&sqlite3_close)> sqlite_db(
+    []() {
+
+      std::stringstream ss;
+      ss << "-pid" << Env::Default().GetSelfPid() << ".db";
+      const auto& opt = NodeDumpOptionsFromEnvironmentVariables();
+      auto sqlite_db_prefix = opt.sqlite_db_prefix;
+      auto sqlite_db_path = sqlite_db_prefix.Concat(ss.str()).ToPathString();
+  
+      sqlite3 *db;
+      int rc = sqlite3_open(sqlite_db_path.c_str(), &db);
+      ORT_ENFORCE(rc == SQLITE_OK, "Failed to connect to sqlite3 db ", sqlite_db_path.c_str());
+     
+      const char *sql_create_tensor_table = 
+        "Create table if not exists Tensors ( " 
+        "  Step int not null, " 
+        "  Name text not null, " 
+        "  Value TensorProto, "
+        "  DeviceType text, " 
+        "  TracedProducer NodeArg, " 
+        "  TracedConsumers NodeArgList, " 
+        "  primary key (Step, Name) " 
+        ");";
+        
+      const char *error_message = nullptr;
+      rc = sqlite3_exec(db, sql_create_tensor_table, nullptr, 0, (char**)&error_message);
+      ORT_ENFORCE(rc == SQLITE_OK, 
+        "Failed to create Tensors table in sqlite3 db ", sqlite_db_path.c_str(), 
+        " on ", error_message);
+
+      const char *sql_create_node_table = 
+        "Create table if not exists Nodes ( "
+        "  ExecutionCounter int, "
+        "  Name text primary key not null, " 
+        "  OpType text not null, " 
+        "  ExecutionProvider text " 
+        ");";
+        
+      rc = sqlite3_exec(db, sql_create_node_table, nullptr, 0, (char**)&error_message);
+      ORT_ENFORCE(rc == SQLITE_OK, 
+        "Failed to create Nodes table in sqlite3 db ", sqlite_db_path.c_str(), 
+        " on ", error_message);
+ 
+      return db;
+    }(), &sqlite3_close);
+
+  return sqlite_db.get();
+}
+
+#define SQL_OK(command) \
+  ORT_ENFORCE((command) == SQLITE_OK, "Failed sql operation on ", sqlite3_errmsg(SqliteConnection()))
+
+void SqlStepWithRetry(sqlite3_stmt* stmt, int sql_expected) {
+
+  int attempt = 0;
+  while (true) {
+
+    int rc = sqlite3_step(stmt);
+    if (rc == sql_expected) {
+      return;
+    }
+
+    if (rc == SQLITE_BUSY || rc == SQLITE_LOCKED) {
+      if (attempt % 10000 == 0) {
+        std::cerr << "Warning: Pid " << Env::Default().GetSelfPid() 
+	          << " gently spinning on sql db busy or locked\n";
+      }
+      Env::Default().SleepForMicroseconds(100);  
+      attempt++;
+      continue;
+    }
+
+    ORT_THROW("Failed sql step for ", sqlite3_expanded_sql(stmt), " on ", sqlite3_errmsg(SqliteConnection()));
+  }
+}
+
+bool TensorExistsInSqlDb(const TensorMetadata& tensor_metadata) {
+
+  static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
+  
+    sqlite3 *db = SqliteConnection();
+
+    const char *sql_tensor_exists = 
+      "select count(name) from Tensors where Name == ? and Step == ?;";
+
+    sqlite3_stmt *stmt = nullptr;
+    SQL_OK(sqlite3_prepare_v2(db, sql_tensor_exists, -1, &stmt, nullptr));
+
+    return stmt;
+  }(), &sqlite3_finalize);
+
+  sqlite3_stmt* stmt = stmt_uptr.get();
+
+  SQL_OK(sqlite3_reset(stmt));
+  SQL_OK(sqlite3_bind_text(stmt, 1, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_int(stmt, 2, (int)tensor_metadata.step));
+  SqlStepWithRetry(stmt, SQLITE_ROW);
+  bool exists = sqlite3_column_int(stmt, 0) > 0;
+  SqlStepWithRetry(stmt, SQLITE_DONE);
+
+  return exists;
+}
+
+void InsertTensorInSqlDb(const Tensor& tensor, const TensorMetadata& tensor_metadata) {
+
+  static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
+  
+    sqlite3 *db = SqliteConnection();
+
+    const char *sql_insert_tensor = 
+      "Insert into Tensors (Step, Name, Value, DeviceType, TracedProducer, TracedConsumers) " 
+      " values (?, ?, ?, ?, \"\", \"\"); ";
+
+    sqlite3_stmt *stmt = nullptr;
+    SQL_OK(sqlite3_prepare_v2(db, sql_insert_tensor, -1, &stmt, nullptr));
+
+    return stmt;
+  }(), &sqlite3_finalize);
+
+  sqlite3_stmt* stmt = stmt_uptr.get();
+
+  SQL_OK(sqlite3_reset(stmt));
+  SQL_OK(sqlite3_bind_int(stmt, 1, tensor_metadata.step));
+  SQL_OK(sqlite3_bind_text(stmt, 2, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
+
+  auto tensor_proto = utils::TensorToTensorProto(tensor, tensor_metadata.name);
+  std::string bytes = tensor_proto.SerializeAsString();
+  const char* data = bytes.data();
+  int size = bytes.size();
+    
+  SQL_OK(sqlite3_bind_blob(stmt, 3, data, size, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_text(stmt, 4, tensor_metadata.device_type.c_str(), -1, SQLITE_TRANSIENT));
+ 
+  SqlStepWithRetry(stmt, SQLITE_DONE);
+}
+
+void UpdateTensorUsageInSqlDb(const TensorMetadata& tensor_metadata) {
+
+  static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
+  
+    sqlite3 *db = SqliteConnection();
+
+    const char *sql_update_tensor = 
+      "Update Tensors set " 
+      "  TracedProducer = TracedProducer || ?, "
+      "  TracedConsumers = TracedConsumers || ? " 
+      "where Name = ? and Step = ?;";
+
+    sqlite3_stmt *stmt = nullptr;
+    SQL_OK(sqlite3_prepare_v2(db, sql_update_tensor, -1, &stmt, nullptr));
+
+    return stmt;
+  }(), &sqlite3_finalize);
+
+  sqlite3_stmt* stmt = stmt_uptr.get();
+
+  SQL_OK(sqlite3_reset(stmt));
+  SQL_OK(sqlite3_bind_text(stmt, 1, tensor_metadata.producer.c_str(), -1, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_text(stmt, 2, tensor_metadata.consumer.c_str(), -1, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_text(stmt, 3, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_int(stmt, 4, tensor_metadata.step));
+  
+  SqlStepWithRetry(stmt, SQLITE_DONE);
+}
+
+void DumpTensorToSqliteDb(const Tensor& tensor, const TensorMetadata& tensor_metadata) {
+
+  if (!TensorExistsInSqlDb(tensor_metadata)) {
+    InsertTensorInSqlDb(tensor, tensor_metadata);
+  }
+
+  UpdateTensorUsageInSqlDb(tensor_metadata);
+}
+
+void InsertNodePlacementToSqliteDb(const NodeDumpContext& dump_context, const Node& node) {
+
+  static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
+  
+    sqlite3 *db = SqliteConnection();
+
+    const char *sql_insert_node = 
+      "Insert or Ignore into Nodes (ExecutionCounter, Name, OpType, ExecutionProvider) "
+      " values (?, ?, ?, ?);";
+
+    sqlite3_stmt *stmt = nullptr;
+    SQL_OK(sqlite3_prepare_v2(db, sql_insert_node, -1, &stmt, nullptr));
+
+    return stmt;
+  }(), &sqlite3_finalize);
+
+  sqlite3_stmt* stmt = stmt_uptr.get();
+ 
+  SQL_OK(sqlite3_reset(stmt));
+  SQL_OK(sqlite3_bind_int(stmt, 1, dump_context.program_counter)); 
+  SQL_OK(sqlite3_bind_text(stmt, 2, node.Name().c_str(), -1, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_text(stmt, 3, node.OpType().c_str(), -1, SQLITE_TRANSIENT));
+  SQL_OK(sqlite3_bind_text(stmt, 4, node.GetExecutionProviderType().c_str(), -1, SQLITE_TRANSIENT)); 
+ 
+  SqlStepWithRetry(stmt, SQLITE_DONE);
+}
+#endif // DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
+
 void DumpCpuTensor(
     const NodeDumpOptions& dump_options,
-    const Tensor& tensor, const std::string& tensor_name) {
+    const Tensor& tensor, const TensorMetadata& tensor_metadata) {
   switch (dump_options.data_destination) {
     case NodeDumpOptions::DataDestination::StdOut: {
       DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor);
       break;
     }
     case NodeDumpOptions::DataDestination::TensorProtoFiles: {
-      const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_name, dump_options));
-      DumpTensorToFile(tensor, tensor_name, tensor_file);
+      const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_metadata.name, dump_options));
+      DumpTensorToFile(tensor, tensor_metadata.name, tensor_file);
+      break;
+    }
+    case NodeDumpOptions::DataDestination::SqliteDb: {
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
+      DumpTensorToSqliteDb(tensor, tensor_metadata);
+#else
+      ORT_THROW("Recompile with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB=1");
+#endif
       break;
     }
     default:
@@ -137,14 +364,15 @@ void DumpCpuTensor(
 
 void DumpTensor(
     const NodeDumpOptions& dump_options,
-    const Tensor& tensor, const std::string& tensor_name,
+    const Tensor& tensor, TensorMetadata& tensor_metadata,
     const SessionState& session_state) {
   // check tensor is on CPU before dumping it
   auto& tensor_location = tensor.Location();
   if (tensor_location.device.Type() == OrtDevice::CPU ||
       tensor_location.mem_type == OrtMemTypeCPUInput ||
       tensor_location.mem_type == OrtMemTypeCPUOutput) {
-    DumpCpuTensor(dump_options, tensor, tensor_name);
+    tensor_metadata.device_type = "CPU";
+    DumpCpuTensor(dump_options, tensor, tensor_metadata);
   } else {
     std::cout << tensor_location << "\n";
 
@@ -159,7 +387,8 @@ void DumpTensor(
       const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
       auto status = data_transfer_mgr.CopyTensor(tensor, cpu_tensor);
       if (status == common::Status::OK()) {
-        DumpCpuTensor(dump_options, cpu_tensor, tensor_name);
+        tensor_metadata.device_type = "GPU";
+        DumpCpuTensor(dump_options, cpu_tensor, tensor_metadata);
       } else {
         std::cout << " failed to transfer data to cpu.\n";
       }
@@ -191,13 +420,25 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
     if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpOutputData, false)) {
       opts.dump_flags |= NodeDumpOptions::DumpFlags::OutputData;
     }
+    if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpNodePlacement, true)) {
+      opts.dump_flags |= NodeDumpOptions::DumpFlags::NodePlacement;
+    }
 
     opts.filter.name_pattern = Env::Default().GetEnvironmentVar(env_vars::kNameFilter);
     opts.filter.op_type_pattern = Env::Default().GetEnvironmentVar(env_vars::kOpTypeFilter);
 
-    if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpDataToFiles, false)) {
+    const std::string destination = ParseEnvironmentVariableWithDefault<std::string>(
+        env_vars::kDumpDataDestination, "stdout");
+
+    if (destination == "files") {
       opts.data_destination = NodeDumpOptions::DataDestination::TensorProtoFiles;
     }
+    else if (destination == "sqlite") {
+      opts.data_destination = NodeDumpOptions::DataDestination::SqliteDb;
+    }
+    else if (destination != "stdout") {
+      ORT_THROW("Unsupported data destination type: ", destination);
+    }
 
     if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kAppendRankToFileName, false)) {
       std::string rank = Env::Default().GetEnvironmentVar("OMPI_COMM_WORLD_RANK");
@@ -210,6 +451,10 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
 
     opts.output_dir = Path::Parse(ToPathString(Env::Default().GetEnvironmentVar(env_vars::kOutputDir)));
 
+    std::string sqlite_db_prefix = 
+	ParseEnvironmentVariableWithDefault<std::string>(env_vars::kSqliteDbPrefix, "execution-trace");
+    opts.sqlite_db_prefix = Path::Parse(ToPathString(sqlite_db_prefix));
+
     // check for confirmation for dumping data to files for all nodes
     const bool is_input_or_output_requested = ((opts.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) ||
                                               ((opts.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0);
@@ -241,8 +486,11 @@ static void PrintIf(bool boolean_expression, const std::string& message) {
 }
 
 void DumpNodeInputs(
-    const NodeDumpOptions& dump_options,
-    const OpKernelContext& context, const Node& node, const SessionState& session_state) {
+    const NodeDumpOptions& dump_options, 
+    const NodeDumpContext& dump_context, 
+    const OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state) {
   const bool is_any_output_dumped = IsAnyOutputDumped(dump_options);
   if (!is_any_output_dumped) {
     return;
@@ -250,14 +498,23 @@ void DumpNodeInputs(
 
   if (!FilterNode(dump_options, node)) return;
 
+  bool should_dump_node_placement = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::NodePlacement) != 0;
+  if (dump_context.iteration == 1 && should_dump_node_placement) {
+    PrintIf(should_dump_node_placement, MakeString(" Placement: ", node.GetExecutionProviderType(), "\n"));
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB    
+    InsertNodePlacementToSqliteDb(dump_context, node);
+#endif
+  }
+
   std::cout << "-----------\n";
   std::cout << node.OpType() << " node: " << node.Name() << "\n";
 
   const auto& input_defs = node.InputDefs();
+  TensorMetadata tensor_metadata;
 
   for (auto i = 0, end = context.InputCount(); i < end; ++i) {
     if (input_defs[i]->Exists()) {
-      std::cout << "Input " << i << " Name: " << input_defs[i]->Name();
+      std::cout << "Input " << i << " Name: " << input_defs[i]->Name() << "\n";
 
       const auto* type = context.InputType(i);
 
@@ -270,7 +527,10 @@ void DumpNodeInputs(
           PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
 
           if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
-            DumpTensor(dump_options, tensor, input_defs[i]->Name(), session_state);
+            tensor_metadata.name = input_defs[i]->Name();
+            tensor_metadata.step = dump_context.iteration;
+            tensor_metadata.consumer = node.Name() + ":" + std::to_string(i);
+            DumpTensor(dump_options, tensor, tensor_metadata, session_state);
           }
         } else {
           std::cout << " is non-tensor type.\n";
@@ -286,11 +546,19 @@ void DumpNodeInputs(
 }
 
 void DumpNodeInputs(
-    const OpKernelContext& context, const Node& node, const SessionState& session_state) {
-  DumpNodeInputs(NodeDumpOptionsFromEnvironmentVariables(), context, node, session_state);
+    const NodeDumpContext& dump_context, 
+    const OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state) {
+  DumpNodeInputs(NodeDumpOptionsFromEnvironmentVariables(), dump_context, context, node, session_state);
 }
 
-void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& context, const Node& node, const SessionState& session_state) {
+void DumpNodeOutputs(
+   const NodeDumpOptions& dump_options, 
+   const NodeDumpContext& dump_context, 
+   OpKernelContext& context, 
+   const Node& node, 
+   const SessionState& session_state) {
   const bool is_any_output_dumped = IsAnyOutputDumped(dump_options);
   if (!is_any_output_dumped) {
     return;
@@ -298,12 +566,21 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
 
   if (!FilterNode(dump_options, node)) return;
 
+  bool should_dump_node_placement = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::NodePlacement) != 0;
+  if (dump_context.iteration == 1 && should_dump_node_placement) {
+    PrintIf(should_dump_node_placement, MakeString(" Placement: ", node.GetExecutionProviderType(), "\n"));
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB    
+    InsertNodePlacementToSqliteDb(dump_context, node);
+#endif
+  }
+
   std::cout << "-----------\n";
   const auto& output_defs = node.OutputDefs();
+  TensorMetadata tensor_metadata;
 
   for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
     if (output_defs[i]->Exists()) {
-      std::cout << "Output " << i << " Name: " << output_defs[i]->Name();
+      std::cout << "Output " << i << " Name: " << output_defs[i]->Name() << "\n";
 
       const auto* type = context.OutputType(i);
       if (type) {
@@ -315,7 +592,10 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
           PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
 
           if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
-            DumpTensor(dump_options, tensor, output_defs[i]->Name(), session_state);
+            tensor_metadata.name = output_defs[i]->Name();
+            tensor_metadata.step = dump_context.iteration;
+            tensor_metadata.producer = node.Name() + ":" + std::to_string(i);
+            DumpTensor(dump_options, tensor, tensor_metadata, session_state);
           }
         } else {
           std::cout << " is non-tensor type.\n";
@@ -333,8 +613,11 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
 }
 
 void DumpNodeOutputs(
-    OpKernelContext& context, const Node& node, const SessionState& session_state) {
-  DumpNodeOutputs(NodeDumpOptionsFromEnvironmentVariables(), context, node, session_state);
+    const NodeDumpContext& dump_context,
+    OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state) {
+  DumpNodeOutputs(NodeDumpOptionsFromEnvironmentVariables(), dump_context, context, node, session_state);
 }
 
 }  // namespace utils
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
index a7b1699bfeeda..1d5797f0e887d 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
@@ -4,6 +4,14 @@
 // to create a build with these enabled run the build script with:
 //   --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1
 
+// to enable redirect to sqlite database run the build script with:
+//   --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB=1
+//
+// see orttraining/tools/scripts/sqldb_to_tensors.py for retrieval
+//
+// select data dump destination using
+//   ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION= one of {stdout, files, sqlite}
+
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
 
 #pragma once
@@ -18,10 +26,12 @@ namespace utils {
 
 // environment variables that control debug node dumping behavior
 namespace debug_node_inputs_outputs_env_vars {
-// Shape is printed by default unless it's turned OFF by setting environment
-// variable ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA to 0.
-// set to non-zero to dump shape data
+// Tensor shape and Node placement is printed by default unless it's turned OFF
+// by setting the respective environment variables to 0
+// set to non-zero to dump tensor shape data
 constexpr const char* kDumpShapeData = "ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA";
+// set to non-zero to dump node placement data
+constexpr const char* kDumpNodePlacement = "ORT_DEBUG_NODE_IO_DUMP_NODE_PLACEMENT";
 // set to non-zero to dump node input data
 constexpr const char* kDumpInputData = "ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA";
 // set to non-zero to dump node output data
@@ -32,12 +42,14 @@ constexpr const char* kNameFilter = "ORT_DEBUG_NODE_IO_NAME_FILTER";
 // specify a node op type filter to limit the nodes that are dumped
 // see NodeDumpOptions::FilterOptions
 constexpr const char* kOpTypeFilter = "ORT_DEBUG_NODE_IO_OP_TYPE_FILTER";
-// set to non-zero to dump data to files instead of stdout
-constexpr const char* kDumpDataToFiles = "ORT_DEBUG_NODE_IO_DUMP_DATA_TO_FILES";
+// set to "stdout" or "files" or "sqlite" to select dump destination
+constexpr const char* kDumpDataDestination = "ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION";
 // set to non-zero to append OpenMPI world rank to filename
 constexpr const char* kAppendRankToFileName = "ORT_DEBUG_NODE_IO_APPEND_RANK_TO_FILE_NAME";
 // specify the output directory for any data files produced
 constexpr const char* kOutputDir = "ORT_DEBUG_NODE_IO_OUTPUT_DIR";
+// specify the file prefix for sqlite3 db (process id will be appended)
+constexpr const char* kSqliteDbPrefix = "ORT_DEBUG_NODE_IO_SQLITE_DB_PREFIX";
 // set to non-zero to confirm that dumping data files for all nodes is acceptable
 constexpr const char* kDumpingDataToFilesForAllNodesIsOk =
     "ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK";
@@ -51,7 +63,8 @@ struct NodeDumpOptions {
     Shape = 1 << 0,
     InputData = 1 << 1,
     OutputData = 1 << 2,
-    AllData = Shape | InputData | OutputData,
+    NodePlacement = 1 << 3,
+    AllData = Shape | InputData | OutputData | NodePlacement,
   };
 
   // specifies the information to dump per node
@@ -80,11 +93,22 @@ struct NodeDumpOptions {
     StdOut,
     // write to one file per tensor input/output as a TensorProto
     TensorProtoFiles,
+    // write to one row per tensor input/output in Sqlite table
+    SqliteDb
   } data_destination{DataDestination::StdOut};
 
   std::string file_suffix;
   // the output directory for dumped data files
   Path output_dir;
+  // the sqlite3 db to append dumped data
+  Path sqlite_db_prefix;
+};
+
+struct NodeDumpContext {
+  // which execution pass are we on?
+  size_t iteration;
+  // which node are we on?
+  size_t program_counter;
 };
 
 // gets NodeDumpOptions instance configured from environment variable values
@@ -93,18 +117,30 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables();
 // dumps inputs for a node
 void DumpNodeInputs(
     const NodeDumpOptions& dump_options,
-    const OpKernelContext& context, const Node& node, const SessionState& session_state);
+    const NodeDumpContext& dump_context,
+    const OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state);
 
 void DumpNodeInputs(
-    const OpKernelContext& context, const Node& node, const SessionState& session_state);
+    const NodeDumpContext& dump_context,
+    const OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state);
 
 // dumps outputs for a node
 void DumpNodeOutputs(
     const NodeDumpOptions& dump_options,
-    OpKernelContext& context, const Node& node, const SessionState& session_state);
+    const NodeDumpContext& dump_context,
+    OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state);
 
 void DumpNodeOutputs(
-    OpKernelContext& context, const Node& node, const SessionState& session_state);
+    const NodeDumpContext& dump_context,
+    OpKernelContext& context, 
+    const Node& node, 
+    const SessionState& session_state);
 
 }  // namespace utils
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/orttraining_partial_executor.cc b/onnxruntime/core/framework/orttraining_partial_executor.cc
index 6cd76363737ec..f5f83ae6e976d 100644
--- a/onnxruntime/core/framework/orttraining_partial_executor.cc
+++ b/onnxruntime/core/framework/orttraining_partial_executor.cc
@@ -184,6 +184,11 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
       profile::Color::Black);
 #endif
 
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS
+    utils::NodeDumpContext dump_context { session_state.GetGraphExecutionCounter(), 0 };
+#endif
+
+
   for (size_t program_counter = state_.GetProgramCounterStart();
        program_counter < state_.GetProgramCounterEnd();
        program_counter += 1) {
@@ -287,7 +292,8 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
       }
     }
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-    utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node(), session_state);
+    dump_context.program_counter = program_counter; 
+    utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
 #endif
 
     const std::string node_name_for_profiling = [&]() -> std::string {
@@ -440,7 +446,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
     }
 
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-    utils::DumpNodeOutputs(op_kernel_context, p_op_kernel->Node(), session_state);
+    utils::DumpNodeOutputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
 #endif
 
     // free ml-values corresponding to this node
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 13c8ad1454e68..4ddcc9a71e3b0 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -186,6 +186,12 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       profile::Color::Black);
 #endif
 
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS
+    size_t program_counter = 0;
+    utils::NodeDumpContext dump_context { session_state.GetGraphExecutionCounter(), program_counter };
+#endif
+
+
   for (const auto& node_exec_plan : exec_plan_vec) {
     if (terminate_flag_) {
       LOGS(logger, WARNING) << "Exiting due to terminate flag being set to true.";
@@ -271,7 +277,8 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       }
     }
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-    utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node(), session_state);
+    dump_context.program_counter = program_counter++;
+    utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
 #endif
 
     const std::string node_name_for_profiling = [&]() -> std::string {
@@ -418,7 +425,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     }
 
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-    utils::DumpNodeOutputs(op_kernel_context, p_op_kernel->Node(), session_state);
+    utils::DumpNodeOutputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
 #endif
 
     // free ml-values corresponding to this node
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index 90b6f7e70c103..d238b656aa686 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -101,6 +101,7 @@ class SessionState {
         use_deterministic_compute_(use_deterministic_compute),
         enable_mem_reuse_(enable_mem_reuse),
         prepacked_weights_container_(prepacked_weights_container) {
+
     SetupAllocators();
   }
 
@@ -317,6 +318,16 @@ class SessionState {
     return used_shared_pre_packed_weights_counter_;
   }
 
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS
+  void IncrementGraphExecutionCounter() {
+    ++graph_executions_counter_;
+  }
+
+  size_t GetGraphExecutionCounter() const {
+    return graph_executions_counter_;
+  }
+#endif
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState);
 
@@ -502,6 +513,11 @@ class SessionState {
   // Counter for number of times a shared version of the pre-packed weight corresponding to
   // a constant initialized weight was used by the session state
   size_t used_shared_pre_packed_weights_counter_ = 0;
+
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS
+  // Counter for number of times the session graph has been executed
+  size_t graph_executions_counter_ = 0;
+#endif
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 84a1ce5602cea..8e5400c0e1c7f 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1691,6 +1691,11 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options,
     ORT_ENFORCE(session_options_.execution_mode == ExecutionMode::ORT_SEQUENTIAL, "Only sequential mode is supported.");
 
     // execute the graph
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS
+    if (state.GetProgramCounterStart() == 0) {
+      session_state_->IncrementGraphExecutionCounter();
+    }
+#endif
     ORT_CHECK_AND_SET_RETVAL(utils::ExecutePartialGraph(*session_state_, feeds_fetches_manager, feeds, fetches,
                                                         run_logger, state, cache));
   }
@@ -1801,6 +1806,9 @@ Status InferenceSession::Run(const RunOptions& run_options,
 #endif
 
     // execute the graph
+#ifdef DEBUG_NODE_INPUTS_OUTPUTS
+    session_state_->IncrementGraphExecutionCounter();
+#endif
     ORT_CHECK_AND_SET_RETVAL(utils::ExecuteGraph(*session_state_, feeds_fetches_manager, feeds, *p_fetches,
                                                  session_options_.execution_mode, run_options.terminate, run_logger,
                                                  run_options.only_execute_path_to_fetches));
diff --git a/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc b/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc
index 7d5017683fa9c..afb81a2aec88f 100644
--- a/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc
+++ b/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc
@@ -43,7 +43,7 @@ TEST(DebugNodeInputsOutputs, BasicFileOutput) {
           {env_vars::kDumpOutputData, "1"},
           {env_vars::kNameFilter, nullopt},
           {env_vars::kOpTypeFilter, nullopt},
-          {env_vars::kDumpDataToFiles, "1"},
+          {env_vars::kDumpDataDestination, "files"},
           {env_vars::kAppendRankToFileName, nullopt},
           {env_vars::kOutputDir, ToMBString(temp_dir.Path())},
           {env_vars::kDumpingDataToFilesForAllNodesIsOk, "1"},
diff --git a/orttraining/tools/scripts/sqldb_to_tensors.py b/orttraining/tools/scripts/sqldb_to_tensors.py
new file mode 100644
index 0000000000000..0a512a52c3683
--- /dev/null
+++ b/orttraining/tools/scripts/sqldb_to_tensors.py
@@ -0,0 +1,19 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import sqlite3
+import onnx
+from onnx import numpy_helper
+
+connection = sqlite3.connect('<path-to-sqldb-from-tracing>', detect_types=sqlite3.PARSE_DECLTYPES)
+
+def convert_tensor_proto_to_numpy_array(blob):
+    tensor_proto = onnx.TensorProto()
+    tensor_proto.ParseFromString(blob)
+    return numpy_helper.to_array(tensor_proto)
+
+sqlite3.register_converter("TensorProto", convert_tensor_proto_to_numpy_array)
+
+for step, name, value, device, producer, consumers in connection.execute(
+        'Select Step, Name, Value, DeviceType, TracedProducer, TracedConsumers from Tensors'):
+    print(step, name, value.shape, consumers)