
Commit

Add post-install command to build PyTorch CPP extensions from within onnxruntime package (#8027)

ORTModule requires two PyTorch CPP extensions that are currently JIT-compiled. Runtime compilation can cause issues in environments that lack the full set of build requirements, or when multiple instances of ORTModule run in parallel.

This PR adds a custom command that compiles these extensions; it must be run manually before ORTModule is used for the first time. If users try to use ORTModule before the extensions are compiled, an error with instructions is raised.

PyTorch CPP Extensions for ORTModule can be compiled by running:
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install

A full build environment is needed for this step.
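
As an illustration only (not part of this PR), a build-on-demand pattern an application could use, assuming the training package is installed and a full build environment is available; the module path is the one named above and the error type comes from the diff below:

    import subprocess
    import sys

    try:
        from onnxruntime.training.ortmodule import ORTModule
    except EnvironmentError:
        # ORTModule is packaged but its C++ extensions are not compiled yet:
        # run the post-install build step from this commit, then import again.
        subprocess.check_call([
            sys.executable, '-m',
            'onnxruntime.training.ortmodule.torch_cpp_extensions.install'])
        from onnxruntime.training.ortmodule import ORTModule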
Thiago Crepaldi authored Jun 29, 2021
1 parent 25db570 commit 83be375
Showing 36 changed files with 333 additions and 174 deletions.
21 changes: 21 additions & 0 deletions cmake/onnxruntime_python.cmake
@@ -194,6 +194,15 @@ if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_ortmodule_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/aten_op_executor/*"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_torch_gpu_allocator_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/torch_gpu_allocator/*"
)
file(GLOB onnxruntime_python_train_tools_srcs CONFIGURE_DEPENDS
"${REPO_ROOT}/tools/python/register_custom_ops_pytorch_exporter.py"
)
@@ -394,6 +403,9 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/amp
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/optim
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/aten_op_executor
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/torch_gpu_allocator
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_capi_training_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/training/
@@ -409,6 +421,15 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/aten_op_executor/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_torch_gpu_allocator_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/torch_gpu_allocator/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_train_tools_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/
4 changes: 4 additions & 0 deletions onnxruntime/python/onnxruntime_validation.py
@@ -65,6 +65,10 @@ def validate_build_package_info():
has_ortmodule = True
except ImportError:
has_ortmodule = False
except EnvironmentError:
# ORTModule is present but not ready to run yet
has_ortmodule = True
pass
except Exception as e:
# this may happen if Cuda is not installed, we want to raise it after
# for any exception other than not having ortmodule, we want to continue
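A minimal, self-contained sketch of the classification this hunk introduces (probe_ortmodule is an illustrative name, not part of the change):

    def probe_ortmodule():
        # ImportError      -> ORTModule is not part of this onnxruntime build
        # EnvironmentError -> ORTModule is packaged, but its PyTorch C++
        #                     extensions have not been compiled yet
        try:
            from onnxruntime.training.ortmodule import ORTModule  # noqa: F401
        except ImportError:
            return False, 'not packaged'
        except EnvironmentError:
            return True, 'packaged, extensions not built'
        return True, 'ready'
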
10 changes: 9 additions & 1 deletion orttraining/orttraining/python/training/__init__.py
@@ -3,11 +3,19 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------


from onnxruntime.capi._pybind_state import TrainingParameters
from onnxruntime.capi._pybind_state import PropagateCastOpsStrategy
from onnxruntime.capi.training.training_session import TrainingSession

from .orttrainer_options import ORTTrainerOptions
from .orttrainer import ORTTrainer, TrainStepInfo
from . import amp, checkpoint, optim, model_desc_validation

try:
from .ortmodule import ORTModule
except ImportError:
# Not a ORTModule training package
pass
except EnvironmentError:
# Not a ORTModule training package
pass
60 changes: 22 additions & 38 deletions orttraining/orttraining/python/training/ortmodule/__init__.py
@@ -4,57 +4,41 @@
# --------------------------------------------------------------------------

import os
import sys

from glob import glob
from packaging import version


################################################################################
# All global constant goes here, before ORTModule is imported ##################
################################################################################
ONNX_OPSET_VERSION = 12
MINIMUM_TORCH_VERSION_STR = '1.8.1'

# Use one of the available directories as Torch CPP extension in the following order:
# 1) Path at listed at TORCH_EXTENSIONS_DIR environment variable
# 2) Default Python package dir
# 3) <Home directory>/.cache
home_dir = os.path.expanduser("~")
python_package_dir = os.path.dirname(__file__)
torch_extensions_dir = os.environ.get('TORCH_EXTENSIONS_DIR')
MINIMUM_RUNTIME_PYTORCH_VERSION_STR = '1.8.1'
TORCH_CPP_DIR = os.path.join(os.path.dirname(__file__),
'torch_cpp_extensions')

TORCH_CPP_BUILD_DIR = os.path.join(python_package_dir,'torch_inline_extensions')
TORCH_CPP_BUILD_DIR_BACKUP = os.path.join(home_dir, '.cache', 'torch_ort_extensions')

if torch_extensions_dir is not None and os.access(torch_extensions_dir, os.X_OK | os.W_OK):
TORCH_CPP_BUILD_DIR = torch_extensions_dir
elif not os.access(python_package_dir, os.X_OK | os.W_OK):
if os.access(home_dir, os.X_OK | os.W_OK):
TORCH_CPP_BUILD_DIR = TORCH_CPP_BUILD_DIR_BACKUP
else:
extra_message = ''
if torch_extensions_dir:
extra_message = 'or the path pointed by the TORCH_EXTENSIONS_DIR environment variable '
raise PermissionError('ORTModule could not find a writable directory to cache its internal files.',
f'Make {python_package_dir} or {home_dir} {extra_message}writable and try again.')

# Check whether Torch C++ extension compilation was aborted in previous runs
if not os.path.exists(TORCH_CPP_BUILD_DIR):
os.makedirs(TORCH_CPP_BUILD_DIR, exist_ok = True)
elif os.path.exists(os.path.join(TORCH_CPP_BUILD_DIR,'lock')):
print("WARNING: ORTModule detected PyTorch's CPP extension lock file during initialization, "
"which can cause the script to stop responding. "
f"Delete {os.path.join(TORCH_CPP_BUILD_DIR,'lock')} if a hang occurs.")

# Verify proper PyTorch is installed before proceding to ONNX Runtime initialization
# Verify minimum PyTorch version is installed before proceding to ONNX Runtime initialization
try:
import torch
torch_version = version.parse(torch.__version__.split('+')[0])
minimum_torch_version = version.parse(MINIMUM_TORCH_VERSION_STR)
if torch_version < minimum_torch_version:
runtime_pytorch_version = version.parse(torch.__version__.split('+')[0])
minimum_runtime_pytorch_version = version.parse(MINIMUM_RUNTIME_PYTORCH_VERSION_STR)
if runtime_pytorch_version < minimum_runtime_pytorch_version:
raise RuntimeError(
f'ONNX Runtime ORTModule frontend requires PyTorch version greater or equal to {MINIMUM_TORCH_VERSION_STR}, '
f'ONNX Runtime ORTModule frontend requires PyTorch version greater or equal to {MINIMUM_RUNTIME_PYTORCH_VERSION_STR}, '
f'but version {torch.__version__} was found instead.')
except:
raise(f'PyTorch {MINIMUM_TORCH_VERSION_STR} must be installed in order to run ONNX Runtime ORTModule frontend!')
raise RuntimeError(f'PyTorch {MINIMUM_RUNTIME_PYTORCH_VERSION_STR} must be installed in order to run ONNX Runtime ORTModule frontend!')

# Verify whether PyTorch C++ extensions are already compiled
torch_cpp_exts = glob(os.path.join(TORCH_CPP_DIR, '*.so'))
torch_cpp_exts.extend(glob(os.path.join(TORCH_CPP_DIR, '*.dll')))
torch_cpp_exts.extend(glob(os.path.join(TORCH_CPP_DIR, '*.dylib')))
if not torch_cpp_exts and '-m' not in sys.argv:
raise EnvironmentError(f"ORTModule's extensions were not detected at '{TORCH_CPP_DIR}' folder. "
"Run `python -m torch_ort.configure` before using `ORTModule` frontend.")

# PyTorch custom Autograd function support
from ._custom_autograd_function import enable_custom_autograd_support
enable_custom_autograd_support()

@@ -3,7 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from . import _utils, _io, _logger, _cpp_extensions as _cpp_ext
from . import _utils, _io, _logger, torch_cpp_extensions as _cpp_ext
from ._custom_autograd_function_exporter import _post_process_after_export
from onnxruntime.training.ortmodule import ONNX_OPSET_VERSION

@@ -117,12 +117,11 @@ def __init__(self, module):
self.is_rocm_pytorch = (True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False)

self._use_external_gpu_allocator = True
if self._use_external_gpu_allocator:
if self._use_external_gpu_allocator and torch.cuda.is_available():
# CPP extension to get torch GPU allocator's alloc and free function addresses
self._torch_gpu_allocator = _cpp_ext._load_torch_gpu_allocator_cpp_extension(self._loglevel < _logger.LogLevel.WARNING,
self.is_rocm_pytorch)
self._torch_alloc = self._torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
self._torch_free = self._torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
self._torch_alloc = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
self._torch_free = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()

# WIP feature to enable caching in Gradient accumulation scenario.
self._enable_grad_acc_optimization = False
@@ -217,7 +216,7 @@ def _export_model(self, *inputs, **kwargs):

self._set_device_from_module(inputs, kwargs)
self._onnx_model = self._get_exported_model(*inputs, **kwargs)
_cpp_ext._load_aten_op_executor_cpp_extension_if_needed(self._onnx_model, self._loglevel < _logger.LogLevel.WARNING, self.is_rocm_pytorch)
_cpp_ext._load_aten_op_executor_cpp_extension_if_needed(self._onnx_model, self._loglevel < _logger.LogLevel.WARNING)
if self._save_onnx:
onnx.save(self._onnx_model, self._save_onnx_prefix + '_torch_exporter.onnx')
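
For context, a hedged sketch of how the prebuilt torch_gpu_allocator extension from the diff above is consumed (it only exposes two raw function addresses; this assumes the extension was compiled and CUDA is available):

    import torch

    if torch.cuda.is_available():
        from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator

        # Raw alloc/free addresses of torch's CUDA caching allocator, later
        # handed to ONNX Runtime so both frameworks share one GPU allocator.
        alloc_address = torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address()
        free_address = torch_gpu_allocator.gpu_caching_allocator_raw_delete_address()
        print(hex(alloc_address), hex(free_address))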

@@ -0,0 +1,45 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

"""Support for PyTorch C++ extensions within ORTModule
TODO: Implement mechanism to register extensions and prevent issues with incorrect/missing flags
for each :meth:`torch.utils.cpp_extension.*` call
"""

import threading
from functools import wraps
from onnxruntime.capi import _pybind_state as C


def run_once_aten_op_executor(f):
"""
Decorator to run a function only once.
:param f: function to be run only once during execution time despite the number of calls
:return: The original function with the params passed to it if it hasn't already been run before
"""
@wraps(f)
def aten_op_executor_wrapper(*args, **kwargs):
if not aten_op_executor_wrapper.has_run:
with aten_op_executor_wrapper.lock:
if not aten_op_executor_wrapper.has_run:
aten_op_executor_wrapper.has_run = True
return f(*args, **kwargs)

aten_op_executor_wrapper.lock = threading.Lock()
aten_op_executor_wrapper.has_run = False
return aten_op_executor_wrapper

@run_once_aten_op_executor
def _load_aten_op_executor_cpp_extension(verbosity):
from onnxruntime.training.ortmodule.torch_cpp_extensions import aten_op_executor
C.register_aten_op_executor(str(aten_op_executor.execute_aten_operator_address()))

def _load_aten_op_executor_cpp_extension_if_needed(onnx_model, verbosity):
for node in onnx_model.graph.node:
if node.op_type == 'ATenOp' and node.domain == 'com.microsoft':
_load_aten_op_executor_cpp_extension(verbosity)
break
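
The run-once decorator above uses double-checked locking so that concurrent callers cannot trigger a second registration. A standalone usage sketch of the same pattern (run_once and expensive_setup are illustrative names only, not part of this commit):

    import threading
    from functools import wraps

    def run_once(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # Fast path without the lock; re-check under the lock so only one
            # thread ever executes f, even if several arrive at the same time.
            if not wrapper.has_run:
                with wrapper.lock:
                    if not wrapper.has_run:
                        wrapper.has_run = True
                        return f(*args, **kwargs)
        wrapper.lock = threading.Lock()
        wrapper.has_run = False
        return wrapper

    @run_once
    def expensive_setup():
        print('setup runs exactly once')

    threads = [threading.Thread(target=expensive_setup) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # the message is printed a single time
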
@@ -1,72 +1,6 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

"""Support for PyTorch C++ extensions within ORTModule
IMPORTANT: All extensions must explicitly use TORCH_CPP_BUILD_DIR as `build_directory`
to allow ORTModule to monitor TORCH_CPP_BUILD_DIR/lock and warn the user
when abnormal initialization occurs
TODO: Implement mechanism to register extensions and prevent issues with incorrect/missing flags
for each :meth:`torch.utils.cpp_extension.load_inline` call
"""

import threading
from functools import wraps
from torch.utils.cpp_extension import load_inline

from onnxruntime.capi import _pybind_state as C
from onnxruntime.training.ortmodule import TORCH_CPP_BUILD_DIR


def _load_torch_gpu_allocator_cpp_extension(verbosity, is_rocm_pytorch):
gpu_identifier = "hip" if is_rocm_pytorch else "cuda"
gpu_allocator_header = "HIPCachingAllocator" if is_rocm_pytorch else "CUDACachingAllocator"
torch_gpu_allocator_addresses_cpp_source = f'''
#include <torch/extension.h>
#include <c10/{gpu_identifier}/{gpu_allocator_header}.h>
size_t gpu_caching_allocator_raw_alloc_address() {{
return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_alloc);
}}
size_t gpu_caching_allocator_raw_delete_address() {{
return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_delete);
}}
'''

return load_inline(name='torch_allocator',
cpp_sources=[torch_gpu_allocator_addresses_cpp_source],
extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
functions=['gpu_caching_allocator_raw_alloc_address',
'gpu_caching_allocator_raw_delete_address'],
verbose=verbosity,
with_cuda=True,
build_directory=TORCH_CPP_BUILD_DIR)

def run_once_aten_op_executor(f):
"""
Decorator to run a function only once.
:param f: function to be run only once during execution time despite the number of calls
:return: The original function with the params passed to it if it hasn't already been run before
"""
@wraps(f)
def aten_op_executor_wrapper(*args, **kwargs):
if not aten_op_executor_wrapper.has_run:
with aten_op_executor_wrapper.lock:
if not aten_op_executor_wrapper.has_run:
aten_op_executor_wrapper.has_run = True
return f(*args, **kwargs)

aten_op_executor_wrapper.lock = threading.Lock()
aten_op_executor_wrapper.has_run = False
return aten_op_executor_wrapper

@run_once_aten_op_executor
def _load_aten_op_executor_cpp_extension(verbosity, is_rocm_pytorch):
aten_op_executor_cpp_source = """
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <torch/torch.h>
#include <ATen/DLConvertor.h>
#include <unordered_map>
@@ -100,22 +34,18 @@ class ATenOperatorCache {
break;
}
}
TORCH_INTERNAL_ASSERT(found);
const auto& schema = aten_op.op->schema();
aten_op.argument_size = schema.arguments().size();
for (const auto& argument : schema.arguments()) {
aten_op.is_optional_arguments.emplace_back(argument.type()->kind() == c10::TypeKind::OptionalType);
}
aten_op.return_size = schema.returns().size();
for (const auto& ret : schema.returns()) {
TORCH_INTERNAL_ASSERT(ret.type()->kind() == c10::TypeKind::TensorType);
}
ops_[op_name] = aten_op;
}
return ops_.at(op_name);
}

@@ -131,6 +61,7 @@ class ATenOperatorCache {
// weight: embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse)
// the 3rd argument (index 2) is weight.size(0), we add this processing here.
using TensorTransformFunc = std::function<c10::IValue(const at::Tensor&)>;

static const TensorTransformFunc embedding_num_weights = [](const at::Tensor& tensor) {
return c10::IValue(tensor.size(0));
};
@@ -166,7 +97,6 @@ class ATenOperatorCache {
for (T elem : raw_argument.second) {
list.emplace_back(elem);
}
ivalue_arguments[index] =
is_optional_arguments[index] ? c10::IValue(c10::optional<c10::List<T>>(list)) : c10::IValue(list);
}
@@ -183,14 +113,15 @@
const std::vector<std::pair<size_t, std::vector<bool>>>& bool_array_arguments) {
std::string op_name_str(op_name);
const auto& aten_op = ATenOperatorCache::Instance().GetOperator(op_name_str);
// TODO: need to handle optional argument and arguments with default values.
std::vector<c10::IValue> arguments;
arguments.resize(aten_op.argument_size);

for (const auto& tensor_argument : tensor_arguments) {
size_t index = tensor_argument.first;
at::Tensor tensor = at::fromDLPack(tensor_argument.second);
bool has_transform_func = false;

auto op_it = TENSOR_TRANSFORM_FUNCS.find(op_name_str);
if (op_it != TENSOR_TRANSFORM_FUNCS.end()) {
auto func_it = op_it->second.find(index);
@@ -229,18 +160,7 @@
}

size_t execute_aten_operator_address() { return reinterpret_cast<size_t>(&ExecuteATenOperator); }
"""

aten_op_executor_cpp_extension = load_inline(name='aten_op_executor', cpp_sources=[aten_op_executor_cpp_source],
extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
functions=['execute_aten_operator_address'],
verbose=verbosity, with_cuda=True,
build_directory=TORCH_CPP_BUILD_DIR)

C.register_aten_op_executor(str(aten_op_executor_cpp_extension.execute_aten_operator_address()))

def _load_aten_op_executor_cpp_extension_if_needed(onnx_model, verbosity, is_rocm_pytorch):
for node in onnx_model.graph.node:
if node.op_type == 'ATenOp' and node.domain == 'com.microsoft':
_load_aten_op_executor_cpp_extension(verbosity, is_rocm_pytorch)
break
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("execute_aten_operator_address", &execute_aten_operator_address, "Address of Aten operator executor");
}

