diff --git a/core/foundation_stereo.py b/core/foundation_stereo.py
index 05b0aa6..c1368ef 100755
--- a/core/foundation_stereo.py
+++ b/core/foundation_stereo.py
@@ -372,23 +372,20 @@ def forward(self, features_left_04, features_left_08, features_left_16, features
     return disp_up
 
 
-class TrtRunner(nn.Module):
-  def __init__(self, args, feature_runner_engine_path, post_runner_engine_path):
+class _BaseTrtRunner(nn.Module):
+  def __init__(self, args):  # keeps args and creates the TensorRT logger; subclasses load their engines via load_engine()
     super().__init__()
     import tensorrt as trt
     self.args = args
-    with open(feature_runner_engine_path, 'rb') as file:
-      engine_data = file.read()
     self.trt_logger = trt.Logger(trt.Logger.WARNING)
-    self.feature_engine = trt.Runtime(self.trt_logger).deserialize_cuda_engine(engine_data)
-    self.feature_context = self.feature_engine.create_execution_context()
 
-    with open(post_runner_engine_path, 'rb') as file:
+  def load_engine(self, engine_path):  # reads the serialized engine file at engine_path; returns (engine, execution_context)
+    import tensorrt as trt  # imported inside the method, like the other methods of this class
+    with open(engine_path, 'rb') as file:
       engine_data = file.read()
-    self.post_engine = trt.Runtime(self.trt_logger).deserialize_cuda_engine(engine_data)
-    self.post_context = self.post_engine.create_execution_context()
-    self.max_disp = args.max_disp
-    self.cv_group = args.get('cv_group', 8)
+    engine = trt.Runtime(self.trt_logger).deserialize_cuda_engine(engine_data)
+    context = engine.create_execution_context()  # NOTE(review): if deserialization fails and engine is None this raises AttributeError - confirm whether an explicit check is wanted
+    return engine, context
 
   def trt_dtype_to_torch(self, dt):
     import tensorrt as trt
@@ -429,6 +426,15 @@ def run_trt(self, engine, context, inputs_by_name:dict):
     assert ok
     return outputs
 
+
+class TrtRunner(_BaseTrtRunner):
+  def __init__(self, args, feature_runner_engine_path, post_runner_engine_path):  # split runner: one engine file for the feature stage, one for the post stage
+    super().__init__(args)
+    self.feature_engine, self.feature_context = self.load_engine(feature_runner_engine_path)
+    self.post_engine, self.post_context = self.load_engine(post_runner_engine_path)
+    self.max_disp = args.max_disp
+    self.cv_group = args.get('cv_group', 8)  # 8 is used when args has no 'cv_group' entry
+
   def forward(self, image1, image2):
     import tensorrt as trt
     feat_out = self.run_trt(self.feature_engine, self.feature_context, {'left': image1, 'right': image2})
@@ -442,4 +448,14 @@ def forward(self, image1, image2):
         del post_inputs[k]
     out = self.run_trt(self.post_engine, self.post_context, post_inputs)
     disp = out['disp']
-    return disp
\ No newline at end of file
+    return disp
+
+
+class SingleTrtRunner(_BaseTrtRunner):  # runs inference through a single TensorRT engine with 'left'/'right' inputs and a 'disp' output
+  def __init__(self, args, engine_path):  # engine_path: serialized TensorRT engine file passed to load_engine()
+    super().__init__(args)
+    self.engine, self.context = self.load_engine(engine_path)
+
+  def forward(self, image1, image2):  # image1 is fed as 'left', image2 as 'right'; returns the engine's 'disp' output
+    out = self.run_trt(self.engine, self.context, {'left': image1, 'right': image2})
+    return out['disp']
diff --git a/core/submodule.py b/core/submodule.py
index 6764d64..33293ac 100755
--- a/core/submodule.py
+++ b/core/submodule.py
@@ -609,17 +609,14 @@ def __init__(self, in_planes, ratio=16):
         """From selective-IGEV
         """
         super(ChannelAttentionEnhancement, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.max_pool = nn.AdaptiveMaxPool2d(1)
-
         self.fc = nn.Sequential(nn.Conv2d(in_planes, in_planes // 16, 1, bias=False),
                                nn.ReLU(),
                                nn.Conv2d(in_planes // 16, in_planes, 1, bias=False))
         self.sigmoid = nn.Sigmoid()
 
     def forward(self, x):
-        avg_out = self.fc(self.avg_pool(x))
-        max_out = self.fc(self.max_pool(x))
+        avg_out = self.fc(torch.mean(x, dim=(2, 3), keepdim=True))  # mean over dims 2 and 3, kept as size-1 dims; takes over from nn.AdaptiveAvgPool2d(1)
+        max_out = self.fc(torch.amax(x, dim=(2, 3), keepdim=True))  # max over dims 2 and 3, kept as size-1 dims; takes over from nn.AdaptiveMaxPool2d(1)
         out = avg_out + max_out
         return self.sigmoid(out)
 
@@ -672,4 +669,3 @@ def forward(self, x):
 
         x = input + x
         return x
-
diff --git a/docker/dockerfile b/docker/dockerfile
index 900291d..6a76c72 100755
--- a/docker/dockerfile
+++ b/docker/dockerfile
@@ -26,12 +26,13 @@ RUN conda init bash &&\
     echo "conda activate my" >> ~/.bashrc &&\
     conda activate my &&\
     pip install uv &&\
-    uv pip install torch==2.6.0 torchvision==0.21.0 xformers --index-url https://download.pytorch.org/whl/cu124
+    uv pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128
 
 COPY requirements.txt /tmp/requirements.txt
 RUN conda activate my &&\
     uv pip install -r /tmp/requirements.txt &&\
-    uv pip install onnxruntime-gpu onnx pycuda cuda-python tensorrt-cu12 tensorrt-lean-cu12 tensorrt-dispatch-cu12 nvidia-modelopt[torch] &&\
+    uv pip install onnxruntime-gpu onnx onnxscript pycuda cuda-python tensorrt-cu12 tensorrt-lean-cu12 tensorrt-dispatch-cu12 nvidia-modelopt[torch] &&\
+    uv pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128 &&\
     conda install -y -c anaconda h5py &&\
     conda install -y -c conda-forge libstdcxx-ng
 
diff --git a/readme.md b/readme.md
index e5acbd4..8889357 100644
--- a/readme.md
+++ b/readme.md
@@ -36,7 +36,7 @@ bash docker/run_container.sh
 - Option 2: pip
 ```bash
 conda create -n ffs python=3.12 && conda activate ffs
-pip install torch==2.6.0 torchvision==0.21.0 xformers --index-url https://download.pytorch.org/whl/cu124
+pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128
 pip install -r requirements.txt
 ```
 
@@ -118,6 +118,16 @@ python scripts/make_onnx.py --model_dir weights/23-36-37/model_best_bp2_serializ
 
 Refer to `scripts/make_onnx.py` for a comprehensive list of available flags.  Since some intermediate operation is not supported by TRT conversion. We split around it into 2 onnx files.
 
+Experimental single-file export is also available:
+```
+python scripts/make_onnx.py --model_dir weights/23-36-37/model_best_bp2_serialize.pth --save_path output/ --height 448 --width 640 --valid_iters 8 --max_disp 192 --single_onnx
+```
+This mode swaps the Triton GWC volume builder for the pure PyTorch implementation (`optimize_build_volume='pytorch1'`) during export, so the whole model is serialized into a single file, `output/foundation_stereo.onnx` (the filename can be changed with `--single_onnx_name`). It is intended for ONNX export experiments; TensorRT compatibility of the resulting graph still needs to be validated.
+You can build a single TensorRT engine from it as below:
+```
+trtexec --onnx=output/foundation_stereo.onnx --saveEngine=output/foundation_stereo.engine --fp16 --useCudaGraph
+```
+
 Then convert from ONNX to TRT as below.
 ```
 trtexec --onnx=output/feature_runner.onnx --saveEngine=output/feature_runner.engine --fp16  --useCudaGraph
@@ -139,6 +149,7 @@ To use TRT for inference:
 ```
 python scripts/run_demo_tensorrt.py --onnx_dir output/ --left_file assets/left.png --right_file assets/right.png --intrinsic_file assets/K.txt --out_dir output/ --remove_invisible 0 --denoise_cloud 1  --get_pc 1 --zfar 100
 ```
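+If the `--onnx_dir` directory (`output/` above) contains `foundation_stereo.engine`, the demo uses the single-engine path automatically, and it takes precedence over `feature_runner.engine` + `post_runner.engine`. You can also pass `--engine_path output/foundation_stereo.engine` explicitly.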
 
 # Internet-Scale Pseudo-Labeling
 Real-world data offers greater diversity and realism than synthetic data. However, obtaining real stereo images with ground-truth metric depth annotation is notoriously difficult. To address this challenge, we propose an automatic data curation pipeline to generate pseudo-labels on internet-scale stereo images from [Stereo4D](https://stereo4d.github.io/) dataset. **Top:** Pseudo-labeling pipeline on in-the-wild internet stereo data. **Bottom:** Visualization of our generated pseudo-labels.
@@ -170,4 +181,4 @@ The dataset is available at HuggingFace: https://huggingface.co/datasets/nvidia/
 Please contact [Bowen Wen](https://wenbowen123.github.io/) (bowenw@nvidia.com) for questions and commercial inquiries.
 
 # Acknowledgement
-We would like to thank Xutong Ren, Karsten Patzwaldt, Yonggan Fu, Saurav Muralidharan, Han Cai, Pavlo Molchanov, Yu Wang, Varun Praveen, Joseph Aribido and Jun Gao for their insightful early discussions for this project. We would also like to thank NVIDIA Isaac and TAO teams for their engineering support and valuable discussions. Thanks to the authors of [FoundationStereo](https://github.com/NVlabs/FoundationStereo), [Selective-IGEV](https://github.com/Windsrain/Selective-Stereo), [Stereo4D](https://github.com/Stereo4d/stereo4d-code) and [RAFT-Stereo](https://github.com/princeton-vl/RAFT-Stereo) for their code release. Finally, thanks to CVPR reviewers and AC for their appreciation of this work and constructive feedback.
\ No newline at end of file
+We would like to thank Xutong Ren, Karsten Patzwaldt, Yonggan Fu, Saurav Muralidharan, Han Cai, Pavlo Molchanov, Yu Wang, Varun Praveen, Joseph Aribido and Jun Gao for their insightful early discussions for this project. We would also like to thank NVIDIA Isaac and TAO teams for their engineering support and valuable discussions. Thanks to the authors of [FoundationStereo](https://github.com/NVlabs/FoundationStereo), [Selective-IGEV](https://github.com/Windsrain/Selective-Stereo), [Stereo4D](https://github.com/Stereo4d/stereo4d-code) and [RAFT-Stereo](https://github.com/princeton-vl/RAFT-Stereo) for their code release. Finally, thanks to CVPR reviewers and AC for their appreciation of this work and constructive feedback.
diff --git a/scripts/make_onnx.py b/scripts/make_onnx.py
index aaa9915..30bdf39 100755
--- a/scripts/make_onnx.py
+++ b/scripts/make_onnx.py
@@ -1,4 +1,5 @@
 import warnings, argparse, logging, os, sys,zipfile
+import torch.nn as nn
 os.environ['TORCH_COMPILE_DISABLE'] = '1'
 os.environ['TORCHDYNAMO_DISABLE'] = '1'
 code_dir = os.path.dirname(os.path.abspath(__file__))
@@ -21,6 +22,23 @@ def forward(self, left, right):
         return disp
 
 
+class SingleOnnxRunner(nn.Module):  # wraps the full model so one forward(left, right) call can be exported as a single ONNX graph
+    def __init__(self, model):  # model: the loaded network; forward reads model.args.valid_iters
+        super().__init__()
+        self.model = model
+
+    @torch.no_grad()
+    def forward(self, left, right):  # returns the model's test_mode output (exported under the output name 'disp')
+        with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
+            return self.model(
+                left,
+                right,
+                iters=self.model.args.valid_iters,
+                test_mode=True,
+                optimize_build_volume='pytorch1',  # pure PyTorch volume builder (see the --single_onnx help text), not the Triton one
+            )
+
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -37,14 +55,17 @@ def forward(self, left, right):
     parser.add_argument('--n_gru_layers', type=int, default=1, help="number of hidden GRU levels")
     parser.add_argument('--max_disp', type=int, default=192, help="max disp of geometry encoding volume")
     parser.add_argument('--low_memory', type=int, default=1, help='reduce memory usage')
+    parser.add_argument('--single_onnx', action='store_true', help='Export the full model to a single ONNX file using the pure PyTorch volume builder')
+    parser.add_argument('--single_onnx_name', type=str, default='foundation_stereo.onnx', help='Filename for the single-model ONNX export')
     args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
+    os.makedirs(args.save_path, exist_ok=True)
 
     torch.autograd.set_grad_enabled(False)
 
     model = torch.load(args.model_dir, map_location='cpu', weights_only=False)
     model.args.max_disp = args.max_disp
     model.args.valid_iters = args.valid_iters
+    model.args.image_size = [args.height, args.width]
     model.cuda().eval()
 
     feature_runner = TrtFeatureRunner(model)
@@ -56,29 +77,41 @@ def forward(self, left, right):
     left_img = torch.randn(1, 3, args.height, args.width).cuda().float()*255
     right_img = torch.randn(1, 3, args.height, args.width).cuda().float()*255
 
-    torch.onnx.export(
-        feature_runner,
-        (left_img, right_img),
-        args.save_path+'/feature_runner.onnx',
-        opset_version=17,
-        input_names = ['left', 'right'],
-        output_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x'],
-        do_constant_folding=True
-    )
-
-    features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x = feature_runner(left_img, right_img)
-    gwc_volume = build_gwc_volume_triton(features_left_04.half(), features_right_04.half(), args.max_disp//4, model.cv_group)
-    disp = post_runner(features_left_04.float(), features_left_08.float(), features_left_16.float(), features_left_32.float(), features_right_04.float(), stem_2x.float(), gwc_volume.float())
-
-    torch.onnx.export(
-        post_runner,
-        (features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x, gwc_volume),
-        args.save_path+'/post_runner.onnx',
-        opset_version=17,
-        input_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x', 'gwc_volume'],
-        output_names = ['disp'],
-        do_constant_folding=True
-    )
+    if args.single_onnx:  # export the whole model as one ONNX file
+      single_runner = SingleOnnxRunner(model).cuda().eval()
+      torch.onnx.export(
+          single_runner,
+          (left_img, right_img),
+          os.path.join(args.save_path, args.single_onnx_name),
+          opset_version=17,
+          input_names=['left', 'right'],
+          output_names=['disp'],
+          do_constant_folding=True,
+      )
+    else:  # default: export two ONNX files, feature_runner.onnx and post_runner.onnx
+      torch.onnx.export(
+          feature_runner,
+          (left_img, right_img),
+          os.path.join(args.save_path, 'feature_runner.onnx'),
+          opset_version=17,
+          input_names = ['left', 'right'],
+          output_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x'],
+          do_constant_folding=True
+      )
+
+      features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x = feature_runner(left_img, right_img)
+      gwc_volume = build_gwc_volume_triton(features_left_04.half(), features_right_04.half(), args.max_disp//4, model.cv_group)  # built outside both exported graphs and fed to post_runner as an input
+      disp = post_runner(features_left_04.float(), features_left_08.float(), features_left_16.float(), features_left_32.float(), features_right_04.float(), stem_2x.float(), gwc_volume.float())
+
+      torch.onnx.export(
+          post_runner,
+          (features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x, gwc_volume),
+          os.path.join(args.save_path, 'post_runner.onnx'),
+          opset_version=17,
+          input_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x', 'gwc_volume'],
+          output_names = ['disp'],
+          do_constant_folding=True
+      )
 
     with open(f'{args.save_path}/onnx.yaml', 'w') as f:
       yaml.safe_dump(OmegaConf.to_container(model.args), f)
diff --git a/scripts/run_demo_tensorrt.py b/scripts/run_demo_tensorrt.py
index d45bee8..102f7f3 100755
--- a/scripts/run_demo_tensorrt.py
+++ b/scripts/run_demo_tensorrt.py
@@ -9,7 +9,7 @@
     set_logging_format, set_seed, vis_disparity,
     depth2xyzmap, toOpen3dCloud, o3d,
 )
-from core.foundation_stereo import TrtRunner
+from core.foundation_stereo import SingleTrtRunner, TrtRunner
 import cv2
 
 
@@ -27,6 +27,7 @@
   parser.add_argument('--denoise_radius', type=float, default=0.03, help='radius to use for outlier removal')
   parser.add_argument('--get_pc', type=int, default=1, help='save point cloud output')
   parser.add_argument('--zfar', type=float, default=100, help="max depth to include in point cloud")
+  parser.add_argument('--engine_path', type=str, default=None, help='Path to a single TensorRT engine. If omitted, auto-detect from onnx_dir.')
   args = parser.parse_args()
 
   set_logging_format()
@@ -42,7 +43,27 @@
       cfg[k] = args.__dict__[k]
   args = OmegaConf.create(cfg)
   logging.info(f"args:\n{args}")
-  model = TrtRunner(args, args.onnx_dir+'/feature_runner.engine', args.onnx_dir+'/post_runner.engine')
+  engine_path = args.engine_path  # an explicit --engine_path always selects the single-engine runner
+  if engine_path is None:
+    single_engine = os.path.join(args.onnx_dir, 'foundation_stereo.engine')
+    feature_engine = os.path.join(args.onnx_dir, 'feature_runner.engine')
+    post_engine = os.path.join(args.onnx_dir, 'post_runner.engine')
+    if os.path.exists(single_engine):  # the single engine takes precedence over the split pair
+      engine_path = single_engine
+    elif os.path.exists(feature_engine) and os.path.exists(post_engine):
+      engine_path = None  # stays None so the split-engine branch below is taken
+    else:
+      raise FileNotFoundError(
+        f"Could not find TensorRT engine(s) in {args.onnx_dir}. "
+        "Expected either foundation_stereo.engine or feature_runner.engine + post_runner.engine."
+      )
+
+  if engine_path is not None:
+    logging.info(f"Using single TensorRT engine: {engine_path}")
+    model = SingleTrtRunner(args, engine_path)
+  else:
+    logging.info("Using split TensorRT engines: feature_runner.engine + post_runner.engine")
+    model = TrtRunner(args, feature_engine, post_engine)  # both paths are bound here: this branch is only reached through the auto-detect block above
 
   img0 = imageio.imread(args.left_file)
   img1 = imageio.imread(args.right_file)