TensorRT: multi_PP-OCRv3_det_mobile square-pad workaround can be eliminated by naming input H/W dimensions

`multi_PP-OCRv3_det_mobile` requires the TensorRT square-padding workaround because its ONNX export leaves the input H/W dimensions unnamed. TensorRT then infers an `H == W` constraint and fails to build an engine for non-square profiles (`Condition '==' violated: 480 != 640`), while ONNX Runtime still runs because it ignores the incorrect shape metadata at runtime. The fix is to assign distinct names to the two input spatial dimensions (e.g. `H` and `W`) in the ONNX metadata. After that change, TensorRT can build with a non-square profile and its inference output matches ONNX Runtime numerically (`allclose(atol=1e-3)=True`, cosine similarity ≈ `0.9997`). So the current `_pad_to_square` / `_crop_output` workaround in `inference_engine/tensorrt/main.py` is covering an ONNX export metadata issue rather than an inherent model limitation.

```python
# Tested with:
#   Python 3.10.12
#   tensorrt-cu12==11.1.0.106
#   onnx==1.22.0
#   onnxruntime==1.23.2
#   nvidia-cuda-runtime-cu12==12.9.79
#   RTX 4000 Ada (sm89), CUDA driver 556.12
import ctypes, numpy as np, onnx, tensorrt as trt, onnxruntime as ort

model_path = "multi_PP-OCRv3_det_mobile.onnx"
fixed_path = "multi_PP-OCRv3_det_mobile_fixed.onnx"

# Fix: give axes 2 and 3 of the first graph input the distinct symbolic
# names "H" and "W", then write the patched model next to the original.
onnx_model = onnx.load(model_path)
input_shape = onnx_model.graph.input[0].type.tensor_type.shape
for axis, symbol in ((2, "H"), (3, "W")):
    input_shape.dim[axis].dim_param = symbol
onnx.save(onnx_model, fixed_path)

# CUDA runtime (from nvidia-cuda-runtime-cu12)
# Load the shared library once and declare the C signatures of the entry
# points used below so ctypes marshals pointers and sizes correctly.
libcudart = ctypes.CDLL("libcudart.so.12")
libcudart.cudaMalloc.restype = ctypes.c_int
_void_ptr_ptr = ctypes.POINTER(ctypes.c_void_p)
for _fn_name, _signature in (
    ("cudaMalloc", [_void_ptr_ptr, ctypes.c_size_t]),
    ("cudaMemcpy", [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]),
    ("cudaStreamCreate", [_void_ptr_ptr]),
    ("cudaStreamSynchronize", [ctypes.c_void_p]),
):
    getattr(libcudart, _fn_name).argtypes = _signature

# Build test

def try_build(path, h, w):
    """Return whether TensorRT can build an engine for the ONNX model at *path*.

    The model is parsed into a fresh network and built with a single
    optimization profile on the first network input:
    min ``(1, 3, 32, 32)``, opt ``(1, 3, h, w)``, max ``(1, 3, 960, 960)``.

    Args:
        path: Filesystem path of the ONNX model to parse.
        h: Input height used for the profile's ``opt`` shape.
        w: Input width used for the profile's ``opt`` shape.

    Returns:
        ``True`` if ``build_serialized_network`` produced an engine,
        ``False`` if it returned ``None``.

    Raises:
        RuntimeError: If the ONNX parser rejects the model. Reported
            separately so a parse failure is never mistaken for a
            build failure.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1)  # EXPLICIT_BATCH
    parser = trt.OnnxParser(network, logger)
    # Close the file deterministically instead of relying on garbage collection.
    with open(path, "rb") as model_file:
        parsed = parser.parse(model_file.read())
    if not parsed:
        details = "; ".join(
            parser.get_error(i).desc() for i in range(parser.num_errors)
        )
        raise RuntimeError(f"ONNX parse failed for {path}: {details}")
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    profile = builder.create_optimization_profile()
    profile.set_shape(network.get_input(0).name,
                      min=(1, 3, 32, 32), opt=(1, 3, h, w), max=(1, 3, 960, 960))
    config.add_optimization_profile(profile)
    engine = builder.build_serialized_network(network, config)
    return engine is not None

# Expected results, in order: False, True, True.
for _label, _onnx_path, _height, _width in (
    ("original, non-square:", model_path, 480, 640),
    ("original, square:    ", model_path, 640, 640),
    ("fixed, non-square:   ", fixed_path, 480, 640),
):
    print(_label, try_build(_onnx_path, _height, _width))

# Inference: TRT vs onnxruntime
# Runs the patched model on one fixed random non-square input through both
# runtimes and prints how closely the outputs agree.
np.random.seed(42)
x = np.random.randn(1, 3, 480, 640).astype(np.float32)

# ONNX Runtime baseline
# Query the input name from the session rather than hard-coding it, to
# mirror how the TensorRT side obtains it from the network.
ort_session = ort.InferenceSession(fixed_path, providers=["CPUExecutionProvider"])
ort_input_name = ort_session.get_inputs()[0].name
ort_out = ort_session.run(None, {ort_input_name: x})[0]

# TensorRT inference
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1)
parser = trt.OnnxParser(network, logger)
with open(fixed_path, "rb") as model_file:
    parsed = parser.parse(model_file.read())
if not parsed:
    parse_errors = "; ".join(
        parser.get_error(i).desc() for i in range(parser.num_errors)
    )
    raise RuntimeError(f"ONNX parse failed for {fixed_path}: {parse_errors}")
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
profile = builder.create_optimization_profile()
inp_name = network.get_input(0).name
profile.set_shape(inp_name, min=(1, 3, 32, 32), opt=(1, 3, 480, 640), max=(1, 3, 960, 960))
config.add_optimization_profile(profile)
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    # Without this check the failure would surface as an opaque error
    # inside deserialize_cuda_engine.
    raise RuntimeError(f"TensorRT failed to build an engine for {fixed_path}")
runtime = trt.Runtime(logger)  # held in a name for the lifetime of the engine
engine = runtime.deserialize_cuda_engine(serialized_engine)
context = engine.create_execution_context()
context.set_input_shape(inp_name, x.shape)
# Locate the output tensor by its I/O mode instead of assuming index 1.
out_name = next(
    name
    for name in (engine.get_tensor_name(i) for i in range(engine.num_io_tensors))
    if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
)
out_shape = tuple(context.get_tensor_shape(out_name))
if any(dim < 0 for dim in out_shape):
    raise RuntimeError(f"Output shape is not fully resolved: {out_shape}")

CUDA_MEMCPY_HOST_TO_DEVICE = 1
CUDA_MEMCPY_DEVICE_TO_HOST = 2

# Signatures for the cleanup calls used only in this section.
libcudart.cudaFree.argtypes = [ctypes.c_void_p]
libcudart.cudaStreamDestroy.argtypes = [ctypes.c_void_p]


def _cuda_check(status, what):
    """Raise RuntimeError if a CUDA runtime call returned a non-zero status."""
    if status != 0:
        raise RuntimeError(f"{what} failed with CUDA error code {status}")


xc = np.ascontiguousarray(x)
trt_out = np.empty(out_shape, dtype=np.float32)
d_input = ctypes.c_void_p()
d_output = ctypes.c_void_p()
stream = ctypes.c_void_p()
try:
    _cuda_check(libcudart.cudaMalloc(ctypes.byref(d_input), xc.nbytes),
                "cudaMalloc(input)")
    # Size the device buffer from the host array so the two cannot disagree.
    _cuda_check(libcudart.cudaMalloc(ctypes.byref(d_output), trt_out.nbytes),
                "cudaMalloc(output)")
    _cuda_check(
        libcudart.cudaMemcpy(d_input, xc.ctypes.data, xc.nbytes,
                             CUDA_MEMCPY_HOST_TO_DEVICE),
        "cudaMemcpy(HtoD)",
    )
    context.set_tensor_address(inp_name, d_input.value)
    context.set_tensor_address(out_name, d_output.value)
    _cuda_check(libcudart.cudaStreamCreate(ctypes.byref(stream)), "cudaStreamCreate")
    if not context.execute_async_v3(stream.value):
        raise RuntimeError("TensorRT execute_async_v3 reported failure")
    _cuda_check(libcudart.cudaStreamSynchronize(stream), "cudaStreamSynchronize")
    _cuda_check(
        libcudart.cudaMemcpy(trt_out.ctypes.data, d_output, trt_out.nbytes,
                             CUDA_MEMCPY_DEVICE_TO_HOST),
        "cudaMemcpy(DtoH)",
    )
finally:
    # Release only what was actually created; handles stay NULL on early failure.
    if stream.value:
        libcudart.cudaStreamDestroy(stream)
    if d_output.value:
        libcudart.cudaFree(d_output)
    if d_input.value:
        libcudart.cudaFree(d_input)

# Compare
print("shape:", ort_out.shape, trt_out.shape)
print("max diff:", np.abs(ort_out - trt_out).max())
print("allclose(1e-3):", np.allclose(ort_out, trt_out, atol=1e-3))
print("cosine:", float(np.dot(ort_out.ravel(), trt_out.ravel()) /
                        (np.linalg.norm(ort_out) * np.linalg.norm(trt_out))))
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

TensorRT: multi_PP-OCRv3_det_mobile square-pad workaround can be eliminated by naming input H/W dimensions #695

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Uh oh!

Uh oh!

TensorRT: multi_PP-OCRv3_det_mobile square-pad workaround can be eliminated by naming input H/W dimensions #695

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions