# Tested with:
# Python 3.10.12
# tensorrt-cu12==11.1.0.106
# onnx==1.22.0
# onnxruntime==1.23.2
# nvidia-cuda-runtime-cu12==12.9.79
# RTX 4000 Ada (sm89), CUDA driver 556.12
import ctypes, numpy as np, onnx, tensorrt as trt, onnxruntime as ort
model_path = "multi_PP-OCRv3_det_mobile.onnx"
fixed_path = "multi_PP-OCRv3_det_mobile_fixed.onnx"

# Fix: name the input H/W dimensions.
# Load the exported model, give axes 2 and 3 of the first graph input the
# distinct symbolic names "H" and "W", and save the result as a new file so
# the original export stays untouched.
onnx_model = onnx.load(model_path)
input_dims = onnx_model.graph.input[0].type.tensor_type.shape.dim
for axis, symbol in ((2, "H"), (3, "W")):
    input_dims[axis].dim_param = symbol
onnx.save(onnx_model, fixed_path)
# CUDA runtime (from nvidia-cuda-runtime-cu12)
# Load the runtime through ctypes and declare the signatures of the handful
# of entry points used below, so ctypes converts pointer/size arguments
# correctly instead of falling back to its default int conversion.
libcudart = ctypes.CDLL("libcudart.so.12")
_handle_ptr = ctypes.POINTER(ctypes.c_void_p)
libcudart.cudaMalloc.restype = ctypes.c_int
for _fn_name, _arg_types in (
    ("cudaMalloc", [_handle_ptr, ctypes.c_size_t]),
    ("cudaMemcpy", [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]),
    ("cudaStreamCreate", [_handle_ptr]),
    ("cudaStreamSynchronize", [ctypes.c_void_p]),
):
    getattr(libcudart, _fn_name).argtypes = _arg_types
# Build test
def try_build(path, h, w):
    """Try to build a TensorRT engine for the ONNX model at ``path``.

    The optimization profile spans ``(1, 3, 32, 32)`` to ``(1, 3, 960, 960)``
    with ``(1, 3, h, w)`` as the optimum shape.

    Args:
        path: Path to the ONNX model file.
        h: Input height used for the profile's ``opt`` shape.
        w: Input width used for the profile's ``opt`` shape.

    Returns:
        True if ``build_serialized_network`` produced an engine, False if it
        returned None.

    Raises:
        RuntimeError: If the ONNX parser rejects the model, so that a parse
            failure is never confused with an engine-build failure.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1)  # EXPLICIT_BATCH

    # Hold the parser in a local for the duration of the build instead of
    # using a temporary; this also lets us report its errors.
    parser = trt.OnnxParser(network, logger)
    with open(path, "rb") as f:
        model_bytes = f.read()
    if not parser.parse(model_bytes):
        errors = "; ".join(
            str(parser.get_error(i)) for i in range(parser.num_errors)
        )
        raise RuntimeError(f"failed to parse ONNX model {path!r}: {errors}")

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

    profile = builder.create_optimization_profile()
    profile.set_shape(
        network.get_input(0).name,
        min=(1, 3, 32, 32),
        opt=(1, 3, h, w),
        max=(1, 3, 960, 960),
    )
    config.add_optimization_profile(profile)

    engine = builder.build_serialized_network(network, config)
    return engine is not None
# Run the three build experiments in order; the trailing comment on each case
# is the result observed on the tested setup listed at the top of the file.
build_cases = (
    ("original, non-square:", model_path, 480, 640),  # False
    ("original, square: ", model_path, 640, 640),  # True
    ("fixed, non-square: ", fixed_path, 480, 640),  # True
)
for label, onnx_path, height, width in build_cases:
    print(label, try_build(onnx_path, height, width))
# Inference: TRT vs onnxruntime
# Runs the fixed model on one random non-square input with both backends and
# prints how closely the outputs agree.
np.random.seed(42)
x = np.random.randn(1, 3, 480, 640).astype(np.float32)

# ONNX Runtime baseline
ort_session = ort.InferenceSession(fixed_path, providers=["CPUExecutionProvider"])
# Look the input name up from the session rather than hard-coding it.
ort_out = ort_session.run(None, {ort_session.get_inputs()[0].name: x})[0]


def _cuda_check(status, what):
    """Raise RuntimeError if a CUDA runtime call returned a non-zero status."""
    if status != 0:
        raise RuntimeError(f"{what} failed with CUDA error code {status}")


# Signatures for the cleanup calls used in the ``finally`` below.
libcudart.cudaFree.argtypes = [ctypes.c_void_p]
libcudart.cudaStreamDestroy.argtypes = [ctypes.c_void_p]

# TensorRT inference
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1)  # EXPLICIT_BATCH
parser = trt.OnnxParser(network, logger)
with open(fixed_path, "rb") as f:
    model_bytes = f.read()
if not parser.parse(model_bytes):
    parse_errors = "; ".join(
        str(parser.get_error(i)) for i in range(parser.num_errors)
    )
    raise RuntimeError(f"failed to parse ONNX model {fixed_path!r}: {parse_errors}")

config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
profile = builder.create_optimization_profile()
inp_name = network.get_input(0).name
profile.set_shape(inp_name, min=(1, 3, 32, 32), opt=(1, 3, 480, 640), max=(1, 3, 960, 960))
config.add_optimization_profile(profile)

serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    raise RuntimeError("TensorRT failed to build an engine for the fixed model")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    raise RuntimeError("TensorRT failed to deserialize the built engine")

context = engine.create_execution_context()
context.set_input_shape(inp_name, x.shape)
# NOTE(review): assumes I/O tensor 0 is the input and tensor 1 is the only
# output, and that the output is float32 — confirm for other models.
out_name = engine.get_tensor_name(1)
out_shape = tuple(context.get_tensor_shape(out_name))
trt_out = np.empty(out_shape, dtype=np.float32)
xc = np.ascontiguousarray(x)

d_input = ctypes.c_void_p()
d_output = ctypes.c_void_p()
stream = ctypes.c_void_p()
try:
    _cuda_check(libcudart.cudaMalloc(ctypes.byref(d_input), xc.nbytes), "cudaMalloc(input)")
    _cuda_check(libcudart.cudaMalloc(ctypes.byref(d_output), trt_out.nbytes), "cudaMalloc(output)")
    _cuda_check(
        libcudart.cudaMemcpy(d_input, xc.ctypes.data, xc.nbytes, 1),  # HtoD
        "cudaMemcpy(HtoD)",
    )
    context.set_tensor_address(inp_name, d_input.value)
    context.set_tensor_address(out_name, d_output.value)
    _cuda_check(libcudart.cudaStreamCreate(ctypes.byref(stream)), "cudaStreamCreate")
    if not context.execute_async_v3(stream.value):
        raise RuntimeError("TensorRT execute_async_v3 reported failure")
    _cuda_check(libcudart.cudaStreamSynchronize(stream), "cudaStreamSynchronize")
    _cuda_check(
        libcudart.cudaMemcpy(trt_out.ctypes.data, d_output, trt_out.nbytes, 2),  # DtoH
        "cudaMemcpy(DtoH)",
    )
finally:
    # Best-effort cleanup; only release handles that were actually created.
    if stream.value:
        libcudart.cudaStreamDestroy(stream)
    if d_output.value:
        libcudart.cudaFree(d_output)
    if d_input.value:
        libcudart.cudaFree(d_input)

# Compare
print("shape:", ort_out.shape, trt_out.shape)
print("max diff:", np.abs(ort_out - trt_out).max())
print("allclose(1e-3):", np.allclose(ort_out, trt_out, atol=1e-3))
print("cosine:", float(np.dot(ort_out.ravel(), trt_out.ravel()) /
                       (np.linalg.norm(ort_out) * np.linalg.norm(trt_out))))
`multi_PP-OCRv3_det_mobile` requires the TensorRT square-padding workaround because its ONNX export leaves the input H/W dimensions unnamed. TensorRT then infers an `H == W` constraint and fails to build an engine for non-square profiles (`Condition '==' violated: 480 != 640`), while ONNX Runtime still runs because it ignores the incorrect shape metadata at runtime. The fix is to assign distinct names to the two input spatial dimensions (e.g. `H` and `W`) in the ONNX metadata. After that change, TensorRT can build with a non-square profile and its inference output matches ONNX Runtime numerically (`allclose(atol=1e-3)` = True, cosine similarity ≈ 0.9997). So the current `_pad_to_square`/`_crop_output` workaround in `inference_engine/tensorrt/main.py` is covering an ONNX export metadata issue rather than an inherent model limitation.