
Commit

Add post-install command to build PyTorch CPP extensions from within onnxruntime package (#8027)

ORTModule requires two PyTorch CPP extensions that are currently JIT-compiled. Runtime compilation can cause issues in environments that lack the full set of build requirements, or when multiple instances of ORTModule run in parallel.

This PR adds a custom command that compiles these extensions; it must be run manually before ORTModule is used for the first time. If users try to use ORTModule before the extensions are compiled, an error with instructions is raised.

PyTorch CPP Extensions for ORTModule can be compiled by running:
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install

A full build environment is needed for this step.
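
As an illustration only (not part of this PR), a build-on-demand pattern an application could use, assuming the training package is installed and a full build environment is available; the module path is the one named above and the error type comes from the diff below:

    import subprocess
    import sys

    try:
        from onnxruntime.training.ortmodule import ORTModule
    except EnvironmentError:
        # ORTModule is packaged but its C++ extensions are not compiled yet:
        # run the post-install build step from this commit, then import again.
        subprocess.check_call([
            sys.executable, '-m',
            'onnxruntime.training.ortmodule.torch_cpp_extensions.install'])
        from onnxruntime.training.ortmodule import ORTModule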
Thiago Crepaldi authored Jun 29, 2021
1 parent 25db570 commit 83be375
Showing 36 changed files with 333 additions and 174 deletions.
21 changes: 21 additions & 0 deletions cmake/onnxruntime_python.cmake
@@ -194,6 +194,15 @@ if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_ortmodule_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/aten_op_executor/*"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_torch_gpu_allocator_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/torch_gpu_allocator/*"
)
file(GLOB onnxruntime_python_train_tools_srcs CONFIGURE_DEPENDS
"${REPO_ROOT}/tools/python/register_custom_ops_pytorch_exporter.py"
)
@@ -394,6 +403,9 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/amp
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/optim
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/aten_op_executor
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/torch_gpu_allocator
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_capi_training_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/training/
@@ -409,6 +421,15 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/aten_op_executor/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_torch_gpu_allocator_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/torch_gpu_allocator/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_train_tools_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/
4 changes: 4 additions & 0 deletions onnxruntime/python/onnxruntime_validation.py
@@ -65,6 +65,10 @@ def validate_build_package_info():
has_ortmodule = True
except ImportError:
has_ortmodule = False
except EnvironmentError:
# ORTModule is present but not ready to run yet
has_ortmodule = True
pass
except Exception as e:
# this may happen if Cuda is not installed, we want to raise it after
# for any exception other than not having ortmodule, we want to continue
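A minimal, self-contained sketch of the classification this hunk introduces (probe_ortmodule is an illustrative name, not part of the change):

    def probe_ortmodule():
        # ImportError      -> ORTModule is not part of this onnxruntime build
        # EnvironmentError -> ORTModule is packaged, but its PyTorch C++
        #                     extensions have not been compiled yet
        try:
            from onnxruntime.training.ortmodule import ORTModule  # noqa: F401
        except ImportError:
            return False, 'not packaged'
        except EnvironmentError:
            return True, 'packaged, extensions not built'
        return True, 'ready'
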
10 changes: 9 additions & 1 deletion orttraining/orttraining/python/training/__init__.py
@@ -3,11 +3,19 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------


from onnxruntime.capi._pybind_state import TrainingParameters
from onnxruntime.capi._pybind_state import PropagateCastOpsStrategy
from onnxruntime.capi.training.training_session import TrainingSession

from .orttrainer_options import ORTTrainerOptions
from .orttrainer import ORTTrainer, TrainStepInfo
from . import amp, checkpoint, optim, model_desc_validation

try:
from .ortmodule import ORTModule
except ImportError:
# Not a ORTModule training package
pass
except EnvironmentError:
# Not a ORTModule training package
pass
60 changes: 22 additions & 38 deletions orttraining/orttraining/python/training/ortmodule/__init__.py
@@ -4,57 +4,41 @@
# --------------------------------------------------------------------------

import os
import sys

from glob import glob
from packaging import version


################################################################################
# All global constant goes here, before ORTModule is imported ##################
################################################################################
ONNX_OPSET_VERSION = 12
MINIMUM_TORCH_VERSION_STR = '1.8.1'

# Use one of the available directories as Torch CPP extension in the following order:
# 1) Path at listed at TORCH_EXTENSIONS_DIR environment variable
# 2) Default Python package dir
# 3) <Home directory>/.cache
home_dir = os.path.expanduser("~")
python_package_dir = os.path.dirname(__file__)
torch_extensions_dir = os.environ.get('TORCH_EXTENSIONS_DIR')
MINIMUM_RUNTIME_PYTORCH_VERSION_STR = '1.8.1'
TORCH_CPP_DIR = os.path.join(os.path.dirname(__file__),
'torch_cpp_extensions')

TORCH_CPP_BUILD_DIR = os.path.join(python_package_dir,'torch_inline_extensions')
TORCH_CPP_BUILD_DIR_BACKUP = os.path.join(home_dir, '.cache', 'torch_ort_extensions')

if torch_extensions_dir is not None and os.access(torch_extensions_dir, os.X_OK | os.W_OK):
TORCH_CPP_BUILD_DIR = torch_extensions_dir
elif not os.access(python_package_dir, os.X_OK | os.W_OK):
if os.access(home_dir, os.X_OK | os.W_OK):
TORCH_CPP_BUILD_DIR = TORCH_CPP_BUILD_DIR_BACKUP
else:
extra_message = ''
if torch_extensions_dir:
extra_message = 'or the path pointed by the TORCH_EXTENSIONS_DIR environment variable '
raise PermissionError('ORTModule could not find a writable directory to cache its internal files.',
f'Make {python_package_dir} or {home_dir} {extra_message}writable and try again.')

# Check whether Torch C++ extension compilation was aborted in previous runs
if not os.path.exists(TORCH_CPP_BUILD_DIR):
os.makedirs(TORCH_CPP_BUILD_DIR, exist_ok = True)
elif os.path.exists(os.path.join(TORCH_CPP_BUILD_DIR,'lock')):
print("WARNING: ORTModule detected PyTorch's CPP extension lock file during initialization, "
"which can cause the script to stop responding. "
f"Delete {os.path.join(TORCH_CPP_BUILD_DIR,'lock')} if a hang occurs.")

# Verify proper PyTorch is installed before proceding to ONNX Runtime initialization
# Verify minimum PyTorch version is installed before proceding to ONNX Runtime initialization
try:
import torch
torch_version = version.parse(torch.__version__.split('+')[0])
minimum_torch_version = version.parse(MINIMUM_TORCH_VERSION_STR)
if torch_version < minimum_torch_version:
runtime_pytorch_version = version.parse(torch.__version__.split('+')[0])
minimum_runtime_pytorch_version = version.parse(MINIMUM_RUNTIME_PYTORCH_VERSION_STR)
if runtime_pytorch_version < minimum_runtime_pytorch_version:
raise RuntimeError(
f'ONNX Runtime ORTModule frontend requires PyTorch version greater or equal to {MINIMUM_TORCH_VERSION_STR}, '
f'ONNX Runtime ORTModule frontend requires PyTorch version greater or equal to {MINIMUM_RUNTIME_PYTORCH_VERSION_STR}, '
f'but version {torch.__version__} was found instead.')
except:
raise(f'PyTorch {MINIMUM_TORCH_VERSION_STR} must be installed in order to run ONNX Runtime ORTModule frontend!')
raise RuntimeError(f'PyTorch {MINIMUM_RUNTIME_PYTORCH_VERSION_STR} must be installed in order to run ONNX Runtime ORTModule frontend!')

# Verify whether PyTorch C++ extensions are already compiled
torch_cpp_exts = glob(os.path.join(TORCH_CPP_DIR, '*.so'))
torch_cpp_exts.extend(glob(os.path.join(TORCH_CPP_DIR, '*.dll')))
torch_cpp_exts.extend(glob(os.path.join(TORCH_CPP_DIR, '*.dylib')))
if not torch_cpp_exts and '-m' not in sys.argv:
raise EnvironmentError(f"ORTModule's extensions were not detected at '{TORCH_CPP_DIR}' folder. "
"Run `python -m torch_ort.configure` before using `ORTModule` frontend.")

# PyTorch custom Autograd function support
from ._custom_autograd_function import enable_custom_autograd_support
enable_custom_autograd_support()

@@ -3,7 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from . import _utils, _io, _logger, _cpp_extensions as _cpp_ext
from . import _utils, _io, _logger, torch_cpp_extensions as _cpp_ext
from ._custom_autograd_function_exporter import _post_process_after_export
from onnxruntime.training.ortmodule import ONNX_OPSET_VERSION

@@ -117,12 +117,11 @@ def __init__(self, module):
self.is_rocm_pytorch = (True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False)

self._use_external_gpu_allocator = True
if self._use_external_gpu_allocator:
if self._use_external_gpu_allocator and torch.cuda.is_available():
# CPP extension to get torch GPU allocator's alloc and free function addresses
self._torch_gpu_allocator = _cpp_ext._load_torch_gpu_allocator_cpp_extension(self._loglevel < _logger.LogLevel.WARNING,
self.is_rocm_pytorch)
self._torch_alloc = self._torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
self._torch_free = self._torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()

# WIP feature to enable caching in Gradient accumulation scenario.
self._enable_grad_acc_optimization = False
@@ -217,7 +216,7 @@ def _export_model(self, *inputs, **kwargs):

self._set_device_from_module(inputs, kwargs)
self._onnx_model = self._get_exported_model(*inputs, **kwargs)
_cpp_ext._load_aten_op_executor_cpp_extension_if_needed(self._onnx_model, self._loglevel < _logger.LogLevel.WARNING, self.is_rocm_pytorch)
_cpp_ext._load_aten_op_executor_cpp_extension_if_needed(self._onnx_model, self._loglevel < _logger.LogLevel.WARNING)
if self._save_onnx:
onnx.save(self._onnx_model, self._save_onnx_prefix + '_torch_exporter.onnx')
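
For context, a hedged sketch of how the prebuilt torch_gpu_allocator extension from the diff above is consumed (it only exposes two raw function addresses; this assumes the extension was compiled and CUDA is available):

    import torch

    if torch.cuda.is_available():
        from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator

        # Raw alloc/free addresses of torch's CUDA caching allocator, later
        # handed to ONNX Runtime so both frameworks share one GPU allocator.
        alloc_address = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
        free_address = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
        print(hex(alloc_address), hex(free_address))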

@@ -0,0 +1,45 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

"""Support for PyTorch C++ extensions within ORTModule
TODO: Implement mechanism to register extensions and prevent issues with incorrect/missing flags
for each :meth:`torch.utils.cpp_extension.*` call
"""

import threading
from functools import wraps
from onnxruntime.capi import _pybind_state as C


def run_once_aten_op_executor(f):
"""
Decorator to run a function only once.
:param f: function to be run only once during execution time despite the number of calls
:return: The original function with the params passed to it if it hasn't already been run before
"""
@wraps(f)
def aten_op_executor_wrapper(*args, **kwargs):
if not aten_op_executor_wrapper.has_run:
with aten_op_executor_wrapper.lock:
if not aten_op_executor_wrapper.has_run:
aten_op_executor_wrapper.has_run = True
return f(*args, **kwargs)

aten_op_executor_wrapper.lock = threading.Lock()
aten_op_executor_wrapper.has_run = False
return aten_op_executor_wrapper

@run_once_aten_op_executor
def _load_aten_op_executor_cpp_extension(verbosity):
from onnxruntime.training.ortmodule.torch_cpp_extensions import aten_op_executor
C.register_aten_op_executor(str(aten_op_executor.execute_aten_operator_address()))

def _load_aten_op_executor_cpp_extension_if_needed(onnx_model, verbosity):
for node in onnx_model.graph.node:
if node.op_type == 'ATenOp' and node.domain == 'com.microsoft':
_load_aten_op_executor_cpp_extension(verbosity)
break
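
The run-once decorator above uses double-checked locking so that concurrent callers cannot trigger a second registration. A standalone usage sketch of the same pattern (run_once and expensive_setup are illustrative names only, not part of this commit):

    import threading
    from functools import wraps

    def run_once(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # Fast path without the lock; re-check under the lock so only one
            # thread ever executes f, even if several arrive at the same time.
            if not wrapper.has_run:
                with wrapper.lock:
                    if not wrapper.has_run:
                        wrapper.has_run = True
                        return f(*args, **kwargs)
        wrapper.lock = threading.Lock()
        wrapper.has_run = False
        return wrapper

    @run_once
    def expensive_setup():
        print('setup runs exactly once')

    threads = [threading.Thread(target=expensive_setup) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # the message is printed a single time
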
@@ -1,72 +1,6 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

"""Support for PyTorch C++ extensions within ORTModule
IMPORTANT: All extensions must explicitly use TORCH_CPP_BUILD_DIR as `build_directory`
to allow ORTModule to monitor TORCH_CPP_BUILD_DIR/lock and warn the user
when abnormal initialization occurs
TODO: Implement mechanism to register extensions and prevent issues with incorrect/missing flags
for each :meth:`torch.utils.cpp_extension.load_inline` call
"""

import threading
from functools import wraps
from torch.utils.cpp_extension import load_inline

from onnxruntime.capi import _pybind_state as C
from onnxruntime.training.ortmodule import TORCH_CPP_BUILD_DIR


def _load_torch_gpu_allocator_cpp_extension(verbosity, is_rocm_pytorch):
gpu_identifier = "hip" if is_rocm_pytorch else "cuda"
gpu_allocator_header = "HIPCachingAllocator" if is_rocm_pytorch else "CUDACachingAllocator"
torch_gpu_allocator_addresses_cpp_source = f'''
#include <torch/extension.h>
#include <c10/{gpu_identifier}/{gpu_allocator_header}.h>
size_t gpu_caching_allocator_raw_alloc_address() {{
return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_alloc);
}}
size_t gpu_caching_allocator_raw_delete_address() {{
return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_delete);
}}
'''

return load_inline(name='torch_allocator',
cpp_sources=[torch_gpu_allocator_addresses_cpp_source],
extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
functions=['gpu_caching_allocator_raw_alloc_address',
'gpu_caching_allocator_raw_delete_address'],
verbose=verbosity,
with_cuda=True,
build_directory=TORCH_CPP_BUILD_DIR)

def run_once_aten_op_executor(f):
"""
Decorator to run a function only once.
:param f: function to be run only once during execution time despite the number of calls
:return: The original function with the params passed to it if it hasn't already been run before
"""
@wraps(f)
def aten_op_executor_wrapper(*args, **kwargs):
if not aten_op_executor_wrapper.has_run:
with aten_op_executor_wrapper.lock:
if not aten_op_executor_wrapper.has_run:
aten_op_executor_wrapper.has_run = True
return f(*args, **kwargs)

aten_op_executor_wrapper.lock = threading.Lock()
aten_op_executor_wrapper.has_run = False
return aten_op_executor_wrapper

@run_once_aten_op_executor
def _load_aten_op_executor_cpp_extension(verbosity, is_rocm_pytorch):
aten_op_executor_cpp_source = """
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <torch/torch.h>
#include <ATen/DLConvertor.h>
#include <unordered_map>
@@ -100,22 +34,18 @@ class ATenOperatorCache {
break;
}
}
TORCH_INTERNAL_ASSERT(found);
const auto& schema = aten_op.op->schema();
aten_op.argument_size = schema.arguments().size();
for (const auto& argument : schema.arguments()) {
aten_op.is_optional_arguments.emplace_back(argument.type()->kind() == c10::TypeKind::OptionalType);
}
aten_op.return_size = schema.returns().size();
for (const auto& ret : schema.returns()) {
TORCH_INTERNAL_ASSERT(ret.type()->kind() == c10::TypeKind::TensorType);
}
ops_[op_name] = aten_op;
}
return ops_.at(op_name);
}

@@ -131,6 +61,7 @@ class ATenOperatorCache {
// weight: embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse)
// the 3rd argument (index 2) is weight.size(0), we add this processing here.
using TensorTransformFunc = std::function<c10::IValue(const at::Tensor&)>;

static const TensorTransformFunc embedding_num_weights = [](const at::Tensor& tensor) {
return c10::IValue(tensor.size(0));
};
@@ -166,7 +97,6 @@ class ATenOperatorCache {
for (T elem : raw_argument.second) {
list.emplace_back(elem);
}
ivalue_arguments[index] =
is_optional_arguments[index] ? c10::IValue(c10::optional<c10::List<T>>(list)) : c10::IValue(list);
}
@@ -183,14 +113,15 @@
const std::vector<std::pair<size_t, std::vector<bool>>>& bool_array_arguments) {
std::string op_name_str(op_name);
const auto& aten_op = ATenOperatorCache::Instance().GetOperator(op_name_str);
// TODO: need to handle optional argument and arguments with default values.
std::vector<c10::IValue> arguments;
arguments.resize(aten_op.argument_size);

for (const auto& tensor_argument : tensor_arguments) {
size_t index = tensor_argument.first;
at::Tensor tensor = at::fromDLPack(tensor_argument.second);
bool has_transform_func = false;

auto op_it = TENSOR_TRANSFORM_FUNCS.find(op_name_str);
if (op_it != TENSOR_TRANSFORM_FUNCS.end()) {
auto func_it = op_it->second.find(index);
@@ -229,18 +160,7 @@
}

size_t execute_aten_operator_address() { return reinterpret_cast<size_t>(&ExecuteATenOperator); }
"""

aten_op_executor_cpp_extension = load_inline(name='aten_op_executor', cpp_sources=[aten_op_executor_cpp_source],
extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
functions=['execute_aten_operator_address'],
verbose=verbosity, with_cuda=True,
build_directory=TORCH_CPP_BUILD_DIR)

C.register_aten_op_executor(str(aten_op_executor_cpp_extension.execute_aten_operator_address()))

def _load_aten_op_executor_cpp_extension_if_needed(onnx_model, verbosity, is_rocm_pytorch):
for node in onnx_model.graph.node:
if node.op_type == 'ATenOp' and node.domain == 'com.microsoft':
_load_aten_op_executor_cpp_extension(verbosity, is_rocm_pytorch)
break
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("execute_aten_operator_address", &execute_aten_operator_address, "Address of Aten operator executor");
}

