Fix python manylinux to not load cuda if it fails to load dependencies (#8882)

* Fix python manylinux to not load cuda if it fails to load dependencies
RyanUnderhill authored Sep 7, 2021
1 parent 0bb56a1 commit b797157
Showing 3 changed files with 85 additions and 82 deletions.
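
For context, here is a minimal sketch of the block that the new _rewrite_ld_preload_cuda helper in setup.py appends to onnxruntime/capi/_ld_preload.py when a CUDA manylinux wheel is built. The library names below are hypothetical placeholders; the real list comes from the dependency scan in setup.py. The point of the change is that all CUDA dependencies are preloaded inside one try block, and a failure sets ORT_CUDA_UNAVAILABLE instead of letting a later load of the CUDA provider crash.

# Illustrative sketch of the generated section of _ld_preload.py; library names are assumptions.
from ctypes import CDLL, RTLD_GLOBAL
try:
    _libcublas = CDLL("libcublas.so.11", mode=RTLD_GLOBAL)
    _libcudnn = CDLL("libcudnn.so.8", mode=RTLD_GLOBAL)
except OSError:
    import os
    os.environ["ORT_CUDA_UNAVAILABLE"] = "1"

The C++ change in onnxruntime_pybind_state.cc below then checks this variable and skips creating the CUDA execution provider when it is set.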
3 changes: 0 additions & 3 deletions onnxruntime/python/_ld_preload.py
@@ -5,6 +5,3 @@

# This file can be modified by setup.py when building a manylinux2010 wheel
# When modified, it will preload some libraries needed for the python C extension
# Do not remove or move the following comment

# LD_PRELOAD_BEGIN_MARK
140 changes: 70 additions & 70 deletions onnxruntime/python/onnxruntime_pybind_state.cc
@@ -36,7 +36,6 @@ const OrtDevice::DeviceType OrtDevice::GPU;

namespace onnxruntime {


} // namespace onnxruntime

#if defined(_MSC_VER)
@@ -110,8 +109,8 @@ void CustomOpLibrary::UnloadLibrary() {

template <typename T>
static py::object AddNonTensor(const OrtValue& val,
const DataTransferManager* /*data_transfer_manager*/,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* /*mem_cpy_to_host_functions*/) {
const DataTransferManager* /*data_transfer_manager*/,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* /*mem_cpy_to_host_functions*/) {
return py::cast(val.Get<T>());
}

@@ -218,8 +217,8 @@ py::object GetPyObjectFromSparseTensor(size_t pos, const OrtValue& ort_value, co

template <>
py::object AddNonTensor<TensorSeq>(const OrtValue& val,
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const auto& seq_tensors = val.Get<TensorSeq>();
py::list py_list;
for (const auto& rtensor : seq_tensors) {
@@ -235,12 +234,12 @@ py::object AddNonTensor<TensorSeq>(const OrtValue& val,
}

py::object AddNonTensorAsPyObj(const OrtValue& val,
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
// Should be in sync with core/framework/datatypes.h
auto val_type = val.Type();
if (val_type->IsTensorSequenceType()) {
return AddNonTensor<TensorSeq>(val, data_transfer_manager, mem_cpy_to_host_functions);
return AddNonTensor<TensorSeq>(val, data_transfer_manager, mem_cpy_to_host_functions);
} else {
#if !defined(DISABLE_ML_OPS)
utils::ContainerChecker c_checker(val_type);
@@ -276,7 +275,7 @@ py::object AddNonTensorAsPyObj(const OrtValue& val,
}

py::object AddTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const Tensor& rtensor = val.Get<Tensor>();
py::object obj;
GetPyObjFromTensor(rtensor, obj, data_transfer_manager, mem_cpy_to_host_functions);
@@ -303,13 +302,13 @@ static std::unique_ptr<onnxruntime::IExecutionProvider> LoadExecutionProvider(

#ifdef USE_CUDA
const CUDAExecutionProviderInfo GetCudaExecutionProviderInfo(ProviderInfo_CUDA* cuda_provider_info,
const ProviderOptionsMap& provider_options_map){
const ProviderOptionsMap& provider_options_map) {
ORT_ENFORCE(cuda_provider_info);
const auto it = provider_options_map.find(kCudaExecutionProvider);
CUDAExecutionProviderInfo info;
if (it != provider_options_map.end())
cuda_provider_info->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
else{
else {
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
@@ -322,28 +321,29 @@ const CUDAExecutionProviderInfo GetCudaExecutionProviderInfo(ProviderInfo_CUDA*
#endif

#ifdef USE_ROCM
const ROCMExecutionProviderInfo GetROCMExecutionProviderInfo(const ProviderOptionsMap& provider_options_map){
const ROCMExecutionProviderInfo GetROCMExecutionProviderInfo(const ProviderOptionsMap& provider_options_map) {
const auto it = provider_options_map.find(kRocmExecutionProvider);
return it != provider_options_map.end()
? ROCMExecutionProviderInfo::FromProviderOptions(it->second)
: [&]() {
ROCMExecutionProviderInfo info{};
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
info.external_allocator_info = external_allocator_info;
return info;
}();
? ROCMExecutionProviderInfo::FromProviderOptions(it->second)
: [&]() {
ROCMExecutionProviderInfo info{};
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
info.external_allocator_info = external_allocator_info;
return info;
}();
}
#endif

std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
const SessionOptions& session_options,
const std::string& type,
const ProviderOptionsMap& provider_options_map){
const SessionOptions& session_options,
const std::string& type,
const ProviderOptionsMap& provider_options_map) {
if (type == kCpuExecutionProvider) {
return onnxruntime::CreateExecutionProviderFactory_CPU(
session_options.enable_cpu_mem_arena)->CreateProvider();
session_options.enable_cpu_mem_arena)
->CreateProvider();
} else if (type == kTensorrtExecutionProvider) {
#ifdef USE_TENSORRT
std::string calibration_table, cache_path, lib_path;
@@ -499,21 +499,22 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
#endif
} else if (type == kCudaExecutionProvider) {
#ifdef USE_CUDA
if(auto* cuda_provider_info = TryGetProviderInfo_CUDA())
{
const CUDAExecutionProviderInfo info = GetCudaExecutionProviderInfo(cuda_provider_info,
provider_options_map);

// This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still
// exist are are in-use. Neverthless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
// since FromProviderOptions might contain external CUDA allocator.
external_allocator_info = info.external_allocator_info;
return cuda_provider_info->CreateExecutionProviderFactory(info)->CreateProvider();
}
else
{
if(!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
ORT_THROW("CUDA_PATH is set but CUDA wasn't able to be loaded. Please install the correct version of CUDA and cuDNN as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported.");
// If the environment variable 'ORT_CUDA_UNAVAILABLE' exists, then we do not load CUDA. This is set by _ld_preload for the manylinux case
// as in that case, trying to load the library itself will result in a crash due to the way that auditwheel strips dependencies.
if (Env::Default().GetEnvironmentVar("ORT_CUDA_UNAVAILABLE").empty()) {
if (auto* cuda_provider_info = TryGetProviderInfo_CUDA()) {
const CUDAExecutionProviderInfo info = GetCudaExecutionProviderInfo(cuda_provider_info,
provider_options_map);

// This variable is never initialized because the APIs by which it should be initialized are deprecated, however they still
// exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
// since FromProviderOptions might contain external CUDA allocator.
external_allocator_info = info.external_allocator_info;
return cuda_provider_info->CreateExecutionProviderFactory(info)->CreateProvider();
} else {
if (!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
ORT_THROW("CUDA_PATH is set but CUDA wasn't able to be loaded. Please install the correct version of CUDA and cuDNN as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.");
}
}
}
#endif
@@ -530,7 +531,8 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
} else if (type == kDnnlExecutionProvider) {
#ifdef USE_DNNL
return onnxruntime::CreateExecutionProviderFactory_Dnnl(
session_options.enable_cpu_mem_arena)->CreateProvider();
session_options.enable_cpu_mem_arena)
->CreateProvider();
#endif
} else if (type == kOpenVINOExecutionProvider) {
#ifdef USE_OPENVINO
@@ -622,18 +624,21 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
}
}
return onnxruntime::CreateExecutionProviderFactory_VITISAI(target.c_str(), 0,
export_runtime_module.c_str(),
load_runtime_module.c_str())->CreateProvider();
export_runtime_module.c_str(),
load_runtime_module.c_str())
->CreateProvider();
#endif
} else if (type == kAclExecutionProvider) {
#ifdef USE_ACL
return onnxruntime::CreateExecutionProviderFactory_ACL(
session_options.enable_cpu_mem_arena)->CreateProvider();
session_options.enable_cpu_mem_arena)
->CreateProvider();
#endif
} else if (type == kArmNNExecutionProvider) {
#ifdef USE_ARMNN
return onnxruntime::CreateExecutionProviderFactory_ArmNN(
session_options.enable_cpu_mem_arena)->CreateProvider();
session_options.enable_cpu_mem_arena)
->CreateProvider();
#endif
} else if (type == kDmlExecutionProvider) {
#ifdef USE_DML
@@ -666,9 +671,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
} else if (type == kCoreMLExecutionProvider) {
#if defined(USE_COREML)
#if !defined(__APPLE__)
LOGS_DEFAULT(WARNING) << "CoreML execution provider can only be used to generate ORT format model in this build.";
LOGS_DEFAULT(WARNING) << "CoreML execution provider can only be used to generate ORT format model in this build.";
#endif
return onnxruntime::CreateExecutionProviderFactory_CoreML(0)->CreateProvider();
return onnxruntime::CreateExecutionProviderFactory_CoreML(0)->CreateProvider();
#endif
} else {
// check whether it is a dynamic load EP:
@@ -681,10 +686,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
ProviderOptions provider_options;
std::string entry_symbol = kDefaultExecutionProviderEntry;
for (auto option : it->second) {
if (option.first == kExecutionProviderSharedLibraryEntry){
if (option.first == kExecutionProviderSharedLibraryEntry) {
entry_symbol = option.second;
}
else if (option.first != kExecutionProviderSharedLibraryPath){
} else if (option.first != kExecutionProviderSharedLibraryPath) {
provider_options.insert(option);
}
}
@@ -755,7 +759,7 @@ static void RegisterCustomOpDomainsAndLibraries(PyInferenceSession* sess, const
}
#endif

void InitializeSession(InferenceSession* sess,
void InitializeSession(InferenceSession* sess,
ExecutionProviderRegistrationFn ep_registration_fn,
const std::vector<std::string>& provider_types,
const ProviderOptionsVector& provider_options,
@@ -1290,13 +1294,13 @@ including arg name, arg type (contains both type and shape).)pbdoc")
.def(
"initialize_session",
[ep_registration_fn](PyInferenceSession* sess,
const std::vector<std::string>& provider_types = {},
const ProviderOptionsVector& provider_options = {},
const std::unordered_set<std::string>& disabled_optimizer_names = {}) {
const std::vector<std::string>& provider_types = {},
const ProviderOptionsVector& provider_options = {},
const std::unordered_set<std::string>& disabled_optimizer_names = {}) {
InitializeSession(sess->GetSessionHandle(),
ep_registration_fn,
provider_types,
provider_options,
provider_types,
provider_options,
disabled_optimizer_names);
},
R"pbdoc(Load a model saved in ONNX or ORT format.)pbdoc")
@@ -1334,23 +1338,20 @@ including arg name, arg type (contains both type and shape).)pbdoc")
size_t pos = 0;
for (auto fet : fetches) {
if (fet.IsTensor()) {
rfetch.push_back(AddTensorAsPyObj(fet,nullptr, nullptr));
rfetch.push_back(AddTensorAsPyObj(fet, nullptr, nullptr));
} else if (fet.IsSparseTensor()) {
rfetch.push_back(GetPyObjectFromSparseTensor(pos, fet, nullptr));
} else {
} else {
rfetch.push_back(AddNonTensorAsPyObj(fet, nullptr, nullptr));
}
++pos;
}
return rfetch;
})
/// This method accepts a dictionary of feeds (name -> OrtValue) and the list of output_names
/// and returns a list of python objects representing OrtValues. Each name may represent either
/// a Tensor, SparseTensor or a TensorSequence.
.def("run_with_ort_values", [](PyInferenceSession* sess,
const py::dict& feeds,
const std::vector<std::string>& output_names,
RunOptions* run_options = nullptr) -> std::vector<OrtValue>{
/// This method accepts a dictionary of feeds (name -> OrtValue) and the list of output_names
/// and returns a list of python objects representing OrtValues. Each name may represent either
/// a Tensor, SparseTensor or a TensorSequence.
.def("run_with_ort_values", [](PyInferenceSession* sess, const py::dict& feeds, const std::vector<std::string>& output_names, RunOptions* run_options = nullptr) -> std::vector<OrtValue> {
NameMLValMap ort_feeds;
// item is always a copy since dict returns a value and not a ref
// and Apple XToolChain barks
@@ -1522,14 +1523,13 @@ void CreateInferencePybindStateModule(py::module& m) {
addOpSchemaSubmodule(m);
addOpKernelSubmodule(m);
#endif

}

void InitArray(){
void InitArray() {
([]() -> void {
// import_array1() forces a void return value.
import_array1();
})();
// import_array1() forces a void return value.
import_array1();
})();
}

// static variable used to create inference session and training session.
24 changes: 15 additions & 9 deletions setup.py
@@ -118,19 +118,23 @@ def finalize_options(self):
self.root_is_pure = False

def _rewrite_ld_preload(self, to_preload):
with open('onnxruntime/capi/_ld_preload.py', 'rt') as f:
ld_preload = f.read().splitlines()
with open('onnxruntime/capi/_ld_preload.py', 'wt') as f:
for line in ld_preload:
f.write(line)
f.write('\n')
if 'LD_PRELOAD_BEGIN_MARK' in line:
break
with open('onnxruntime/capi/_ld_preload.py', 'a') as f:
if len(to_preload) > 0:
f.write('from ctypes import CDLL, RTLD_GLOBAL\n')
for library in to_preload:
f.write('_{} = CDLL("{}", mode=RTLD_GLOBAL)\n'.format(library.split('.')[0], library))

def _rewrite_ld_preload_cuda(self, to_preload):
with open('onnxruntime/capi/_ld_preload.py', 'a') as f:
if len(to_preload) > 0:
f.write('from ctypes import CDLL, RTLD_GLOBAL\n')
f.write('try:\n')
for library in to_preload:
f.write(' _{} = CDLL("{}", mode=RTLD_GLOBAL)\n'.format(library.split('.')[0], library))
f.write('except OSError:\n')
f.write(' import os\n')
f.write(' os.environ["ORT_CUDA_UNAVAILABLE"] = "1"\n')

def run(self):
if is_manylinux:
source = 'onnxruntime/capi/onnxruntime_pybind11_state.so'
@@ -142,6 +146,7 @@ def run(self):
dependencies = ['librccl.so', 'libamdhip64.so', 'librocblas.so', 'libMIOpen.so',
'libhsa-runtime64.so', 'libhsakmt.so']
to_preload = []
to_preload_cuda = []
args = ['patchelf', '--debug']
for line in result.stdout.split('\n'):
for dependency in dependencies:
@@ -163,13 +168,14 @@
for dependency in cuda_dependencies:
if dependency in line:
if dependency not in to_preload:
to_preload.append(line)
to_preload_cuda.append(line)
args.extend(['--remove-needed', line])
args.append(dest)
if len(args) > 3:
subprocess.run(args, check=True, stdout=subprocess.PIPE)

self._rewrite_ld_preload(to_preload)
self._rewrite_ld_preload_cuda(to_preload_cuda)
_bdist_wheel.run(self)
if is_manylinux:
file = glob(path.join(self.dist_dir, '*linux*.whl'))[0]
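
As a usage note (not part of this commit), here is a small sketch of how the new guard can be observed from Python once a wheel built this way is installed. The model path is a placeholder; sess.get_providers() reports the providers that were actually registered, so a failed CUDA preload results in a CPU-only session rather than an import-time crash.

# Sketch: choose providers based on the guard set by _ld_preload.py ("model.onnx" is a placeholder path).
import os
import onnxruntime as ort

providers = ["CPUExecutionProvider"]
if not os.environ.get("ORT_CUDA_UNAVAILABLE"):
    providers.insert(0, "CUDAExecutionProvider")

sess = ort.InferenceSession("model.onnx", providers=providers)
print("Active providers:", sess.get_providers())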
