Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 28 additions & 12 deletions core/foundation_stereo.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,23 +372,20 @@ def forward(self, features_left_04, features_left_08, features_left_16, features
return disp_up


class TrtRunner(nn.Module):
def __init__(self, args, feature_runner_engine_path, post_runner_engine_path):
class _BaseTrtRunner(nn.Module):
def __init__(self, args):
super().__init__()
import tensorrt as trt
self.args = args
with open(feature_runner_engine_path, 'rb') as file:
engine_data = file.read()
self.trt_logger = trt.Logger(trt.Logger.WARNING)
self.feature_engine = trt.Runtime(self.trt_logger).deserialize_cuda_engine(engine_data)
self.feature_context = self.feature_engine.create_execution_context()

with open(post_runner_engine_path, 'rb') as file:
def load_engine(self, engine_path):
import tensorrt as trt
with open(engine_path, 'rb') as file:
engine_data = file.read()
self.post_engine = trt.Runtime(self.trt_logger).deserialize_cuda_engine(engine_data)
self.post_context = self.post_engine.create_execution_context()
self.max_disp = args.max_disp
self.cv_group = args.get('cv_group', 8)
engine = trt.Runtime(self.trt_logger).deserialize_cuda_engine(engine_data)
context = engine.create_execution_context()
return engine, context

def trt_dtype_to_torch(self, dt):
import tensorrt as trt
Expand Down Expand Up @@ -429,6 +426,15 @@ def run_trt(self, engine, context, inputs_by_name:dict):
assert ok
return outputs


class TrtRunner(_BaseTrtRunner):
def __init__(self, args, feature_runner_engine_path, post_runner_engine_path):
super().__init__(args)
self.feature_engine, self.feature_context = self.load_engine(feature_runner_engine_path)
self.post_engine, self.post_context = self.load_engine(post_runner_engine_path)
self.max_disp = args.max_disp
self.cv_group = args.get('cv_group', 8)

def forward(self, image1, image2):
import tensorrt as trt
feat_out = self.run_trt(self.feature_engine, self.feature_context, {'left': image1, 'right': image2})
Expand All @@ -442,4 +448,14 @@ def forward(self, image1, image2):
del post_inputs[k]
out = self.run_trt(self.post_engine, self.post_context, post_inputs)
disp = out['disp']
return disp
return disp


class SingleTrtRunner(_BaseTrtRunner):
    """Inference wrapper around one monolithic TensorRT engine.

    Counterpart to TrtRunner: instead of separate feature/post engines,
    the whole stereo network lives in a single serialized engine that maps
    'left'/'right' image inputs to a 'disp' output.
    """

    def __init__(self, args, engine_path):
        super().__init__(args)
        # Deserialize once up front and keep the execution context for reuse
        # across forward calls.
        self.engine, self.context = self.load_engine(engine_path)

    def forward(self, image1, image2):
        """Return the disparity map predicted for a stereo image pair."""
        feed = {'left': image1, 'right': image2}
        results = self.run_trt(self.engine, self.context, feed)
        return results['disp']
8 changes: 2 additions & 6 deletions core/submodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,17 +609,14 @@ def __init__(self, in_planes, ratio=16):
"""From selective-IGEV
"""
super(ChannelAttentionEnhancement, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.max_pool = nn.AdaptiveMaxPool2d(1)

self.fc = nn.Sequential(nn.Conv2d(in_planes, in_planes // 16, 1, bias=False),
nn.ReLU(),
nn.Conv2d(in_planes // 16, in_planes, 1, bias=False))
self.sigmoid = nn.Sigmoid()

def forward(self, x):
avg_out = self.fc(self.avg_pool(x))
max_out = self.fc(self.max_pool(x))
avg_out = self.fc(torch.mean(x, dim=(2, 3), keepdim=True))
max_out = self.fc(torch.amax(x, dim=(2, 3), keepdim=True))
out = avg_out + max_out
return self.sigmoid(out)

Expand Down Expand Up @@ -672,4 +669,3 @@ def forward(self, x):

x = input + x
return x

5 changes: 3 additions & 2 deletions docker/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ RUN conda init bash &&\
echo "conda activate my" >> ~/.bashrc &&\
conda activate my &&\
pip install uv &&\
uv pip install torch==2.6.0 torchvision==0.21.0 xformers --index-url https://download.pytorch.org/whl/cu124
uv pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128

COPY requirements.txt /tmp/requirements.txt
RUN conda activate my &&\
uv pip install -r /tmp/requirements.txt &&\
uv pip install onnxruntime-gpu onnx pycuda cuda-python tensorrt-cu12 tensorrt-lean-cu12 tensorrt-dispatch-cu12 nvidia-modelopt[torch] &&\
uv pip install onnxruntime-gpu onnx onnxscript pycuda cuda-python tensorrt-cu12 tensorrt-lean-cu12 tensorrt-dispatch-cu12 nvidia-modelopt[torch] &&\
uv pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128 &&\
conda install -y -c anaconda h5py &&\
conda install -y -c conda-forge libstdcxx-ng

Expand Down
15 changes: 13 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ bash docker/run_container.sh
- Option 2: pip
```bash
conda create -n ffs python=3.12 && conda activate ffs
pip install torch==2.6.0 torchvision==0.21.0 xformers --index-url https://download.pytorch.org/whl/cu124
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128
pip install -r requirements.txt
```

Expand Down Expand Up @@ -118,6 +118,16 @@ python scripts/make_onnx.py --model_dir weights/23-36-37/model_best_bp2_serializ

Refer to `scripts/make_onnx.py` for a comprehensive list of available flags. Since some intermediate operations are not supported by TRT conversion, we split around them into 2 ONNX files.

Experimental single-file export is also available:
```
python scripts/make_onnx.py --model_dir weights/23-36-37/model_best_bp2_serialize.pth --save_path output/ --height 448 --width 640 --valid_iters 8 --max_disp 192 --single_onnx
```
This path replaces the Triton GWC volume builder with the pure PyTorch implementation during export so the whole model can be serialized into `output/foundation_stereo.onnx`. It is intended for ONNX export experiments; TensorRT compatibility still needs to be validated on the resulting graph.
You can build a single TensorRT engine from it as below:
```
trtexec --onnx=output/foundation_stereo.onnx --saveEngine=output/foundation_stereo.engine --fp16 --useCudaGraph
```

For the default split export, convert each ONNX file to a TRT engine as below.
```
trtexec --onnx=output/feature_runner.onnx --saveEngine=output/feature_runner.engine --fp16 --useCudaGraph
Expand All @@ -139,6 +149,7 @@ To use TRT for inference:
```
python scripts/run_demo_tensorrt.py --onnx_dir output/ --left_file assets/left.png --right_file assets/right.png --intrinsic_file assets/K.txt --out_dir output/ --remove_invisible 0 --denoise_cloud 1 --get_pc 1 --zfar 100
```
If `output/` contains `foundation_stereo.engine`, the demo will use the single-engine path automatically. You can also pass `--engine_path output/foundation_stereo.engine` explicitly.

# Internet-Scale Pseudo-Labeling
Real-world data offers greater diversity and realism than synthetic data. However, obtaining real stereo images with ground-truth metric depth annotation is notoriously difficult. To address this challenge, we propose an automatic data curation pipeline to generate pseudo-labels on internet-scale stereo images from [Stereo4D](https://stereo4d.github.io/) dataset. **Top:** Pseudo-labeling pipeline on in-the-wild internet stereo data. **Bottom:** Visualization of our generated pseudo-labels.
Expand Down Expand Up @@ -170,4 +181,4 @@ The dataset is available at HuggingFace: https://huggingface.co/datasets/nvidia/
Please contact [Bowen Wen](https://wenbowen123.github.io/) (bowenw@nvidia.com) for questions and commercial inquiries.

# Acknowledgement
We would like to thank Xutong Ren, Karsten Patzwaldt, Yonggan Fu, Saurav Muralidharan, Han Cai, Pavlo Molchanov, Yu Wang, Varun Praveen, Joseph Aribido and Jun Gao for their insightful early discussions for this project. We would also like to thank NVIDIA Isaac and TAO teams for their engineering support and valuable discussions. Thanks to the authors of [FoundationStereo](https://github.com/NVlabs/FoundationStereo), [Selective-IGEV](https://github.com/Windsrain/Selective-Stereo), [Stereo4D](https://github.com/Stereo4d/stereo4d-code) and [RAFT-Stereo](https://github.com/princeton-vl/RAFT-Stereo) for their code release. Finally, thanks to CVPR reviewers and AC for their appreciation of this work and constructive feedback.
We would like to thank Xutong Ren, Karsten Patzwaldt, Yonggan Fu, Saurav Muralidharan, Han Cai, Pavlo Molchanov, Yu Wang, Varun Praveen, Joseph Aribido and Jun Gao for their insightful early discussions for this project. We would also like to thank NVIDIA Isaac and TAO teams for their engineering support and valuable discussions. Thanks to the authors of [FoundationStereo](https://github.com/NVlabs/FoundationStereo), [Selective-IGEV](https://github.com/Windsrain/Selective-Stereo), [Stereo4D](https://github.com/Stereo4d/stereo4d-code) and [RAFT-Stereo](https://github.com/princeton-vl/RAFT-Stereo) for their code release. Finally, thanks to CVPR reviewers and AC for their appreciation of this work and constructive feedback.
81 changes: 57 additions & 24 deletions scripts/make_onnx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import warnings, argparse, logging, os, sys,zipfile
import torch.nn as nn
os.environ['TORCH_COMPILE_DISABLE'] = '1'
os.environ['TORCHDYNAMO_DISABLE'] = '1'
code_dir = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -21,6 +22,23 @@ def forward(self, left, right):
return disp


class SingleOnnxRunner(nn.Module):
    """Exportable wrapper that runs the entire stereo model as one graph.

    Used by the --single_onnx export path: the whole network is traced in a
    single forward pass so it can be serialized into one ONNX file instead
    of the split feature/post pair.
    """

    def __init__(self, model):
        super().__init__()
        # Wrapped stereo model; its `args` supply the iteration count below.
        self.model = model

    @torch.no_grad()
    def forward(self, left, right):
        """Run the full model on a stereo pair and return the disparity."""
        # Autocast matches the runtime inference precision (U.AMP_DTYPE).
        with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
            # 'pytorch1' selects the pure-PyTorch GWC volume builder so the
            # Triton kernel is bypassed during ONNX tracing.
            disp = self.model(
                left,
                right,
                iters=self.model.args.valid_iters,
                test_mode=True,
                optimize_build_volume='pytorch1',
            )
        return disp



if __name__ == '__main__':
parser = argparse.ArgumentParser()
Expand All @@ -37,14 +55,17 @@ def forward(self, left, right):
parser.add_argument('--n_gru_layers', type=int, default=1, help="number of hidden GRU levels")
parser.add_argument('--max_disp', type=int, default=192, help="max disp of geometry encoding volume")
parser.add_argument('--low_memory', type=int, default=1, help='reduce memory usage')
parser.add_argument('--single_onnx', action='store_true', help='Export the full model to a single ONNX file using the pure PyTorch volume builder')
parser.add_argument('--single_onnx_name', type=str, default='foundation_stereo.onnx', help='Filename for the single-model ONNX export')
args = parser.parse_args()
os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
os.makedirs(args.save_path, exist_ok=True)

torch.autograd.set_grad_enabled(False)

model = torch.load(args.model_dir, map_location='cpu', weights_only=False)
model.args.max_disp = args.max_disp
model.args.valid_iters = args.valid_iters
model.args.image_size = [args.height, args.width]
model.cuda().eval()

feature_runner = TrtFeatureRunner(model)
Expand All @@ -56,29 +77,41 @@ def forward(self, left, right):
left_img = torch.randn(1, 3, args.height, args.width).cuda().float()*255
right_img = torch.randn(1, 3, args.height, args.width).cuda().float()*255

torch.onnx.export(
feature_runner,
(left_img, right_img),
args.save_path+'/feature_runner.onnx',
opset_version=17,
input_names = ['left', 'right'],
output_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x'],
do_constant_folding=True
)

features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x = feature_runner(left_img, right_img)
gwc_volume = build_gwc_volume_triton(features_left_04.half(), features_right_04.half(), args.max_disp//4, model.cv_group)
disp = post_runner(features_left_04.float(), features_left_08.float(), features_left_16.float(), features_left_32.float(), features_right_04.float(), stem_2x.float(), gwc_volume.float())

torch.onnx.export(
post_runner,
(features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x, gwc_volume),
args.save_path+'/post_runner.onnx',
opset_version=17,
input_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x', 'gwc_volume'],
output_names = ['disp'],
do_constant_folding=True
)
if args.single_onnx:
single_runner = SingleOnnxRunner(model).cuda().eval()
torch.onnx.export(
single_runner,
(left_img, right_img),
os.path.join(args.save_path, args.single_onnx_name),
opset_version=17,
input_names=['left', 'right'],
output_names=['disp'],
do_constant_folding=True,
)
else:
torch.onnx.export(
feature_runner,
(left_img, right_img),
args.save_path+'/feature_runner.onnx',
opset_version=17,
input_names = ['left', 'right'],
output_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x'],
do_constant_folding=True
)

features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x = feature_runner(left_img, right_img)
gwc_volume = build_gwc_volume_triton(features_left_04.half(), features_right_04.half(), args.max_disp//4, model.cv_group)
disp = post_runner(features_left_04.float(), features_left_08.float(), features_left_16.float(), features_left_32.float(), features_right_04.float(), stem_2x.float(), gwc_volume.float())

torch.onnx.export(
post_runner,
(features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x, gwc_volume),
args.save_path+'/post_runner.onnx',
opset_version=17,
input_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x', 'gwc_volume'],
output_names = ['disp'],
do_constant_folding=True
)

with open(f'{args.save_path}/onnx.yaml', 'w') as f:
yaml.safe_dump(OmegaConf.to_container(model.args), f)
25 changes: 23 additions & 2 deletions scripts/run_demo_tensorrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
set_logging_format, set_seed, vis_disparity,
depth2xyzmap, toOpen3dCloud, o3d,
)
from core.foundation_stereo import TrtRunner
from core.foundation_stereo import SingleTrtRunner, TrtRunner
import cv2


Expand All @@ -27,6 +27,7 @@
parser.add_argument('--denoise_radius', type=float, default=0.03, help='radius to use for outlier removal')
parser.add_argument('--get_pc', type=int, default=1, help='save point cloud output')
parser.add_argument('--zfar', type=float, default=100, help="max depth to include in point cloud")
parser.add_argument('--engine_path', type=str, default=None, help='Path to a single TensorRT engine. If omitted, auto-detect from onnx_dir.')
args = parser.parse_args()

set_logging_format()
Expand All @@ -42,7 +43,27 @@
cfg[k] = args.__dict__[k]
args = OmegaConf.create(cfg)
logging.info(f"args:\n{args}")
model = TrtRunner(args, args.onnx_dir+'/feature_runner.engine', args.onnx_dir+'/post_runner.engine')
engine_path = args.engine_path
if engine_path is None:
single_engine = os.path.join(args.onnx_dir, 'foundation_stereo.engine')
feature_engine = os.path.join(args.onnx_dir, 'feature_runner.engine')
post_engine = os.path.join(args.onnx_dir, 'post_runner.engine')
if os.path.exists(single_engine):
engine_path = single_engine
elif os.path.exists(feature_engine) and os.path.exists(post_engine):
engine_path = None
else:
raise FileNotFoundError(
f"Could not find TensorRT engine(s) in {args.onnx_dir}. "
"Expected either foundation_stereo.engine or feature_runner.engine + post_runner.engine."
)

if engine_path is not None:
logging.info(f"Using single TensorRT engine: {engine_path}")
model = SingleTrtRunner(args, engine_path)
else:
logging.info("Using split TensorRT engines: feature_runner.engine + post_runner.engine")
model = TrtRunner(args, feature_engine, post_engine)

img0 = imageio.imread(args.left_file)
img1 = imageio.imread(args.right_file)
Expand Down