diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index d1d5f267..00000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(docker build:*)",
-      "Bash(ls:*)",
-      "Bash(find:*)",
-      "Bash(kubectl config:*)",
-      "Bash(gcloud container clusters:*)",
-      "Bash(gcloud auth:*)",
-      "Bash(gcloud components install:*)",
-      "Bash(kubectl:*)",
-      "Bash(git pull:*)"
-    ],
-    "deny": []
-  }
-}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 3064cd56..77a65a9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,4 +209,10 @@ __marimo__/
 # Streamlit
 .streamlit/secrets.toml
 
-log*.txt
\ No newline at end of file
+log*.txt
+# TensorRT artifacts
+/tensorrt_cache/
+/tensorrt_work/
+*.trt
+*.onnx
+*.onnx.data
diff --git a/owl-vaes b/owl-vaes
index 005e9fcf..44fdce7d 160000
--- a/owl-vaes
+++ b/owl-vaes
@@ -1 +1 @@
-Subproject commit 005e9fcf88460431e5a146b49fce2914ae96b1cb
+Subproject commit 44fdce7d8b3d0b91f4e53cec296c0f4801d6ed7a
diff --git a/owl_wms/utils/owl_vae_bridge.py b/owl_wms/utils/owl_vae_bridge.py
index de985ca4..6c3c1e7a 100644
--- a/owl_wms/utils/owl_vae_bridge.py
+++ b/owl_wms/utils/owl_vae_bridge.py
@@ -1,14 +1,598 @@
 import sys
 import os
+import logging
+from pathlib import Path
+from dataclasses import dataclass
+from contextlib import contextmanager
+import time
+from typing import Optional, Union
 
 import torch
 from diffusers import AutoencoderDC
 
+# Independent imports with fallback logic for each dependency
+TENSORRT_AVAILABLE = False
+ONNX_AVAILABLE = False
+ONNXRUNTIME_AVAILABLE = False
+
+# Try importing TensorRT independently
+try:
+    import tensorrt as trt
+    TENSORRT_AVAILABLE = True
+    logging.info("TensorRT successfully imported")
+except ImportError as e:
+    logging.warning(f"TensorRT not available: {e}")
+    trt = None
+
+# Try importing ONNX independently
+try:
+    import onnx
+    ONNX_AVAILABLE = True
+    logging.info("ONNX successfully imported")
+except ImportError as e:
+    logging.warning(f"ONNX not available: {e}")
+    onnx = None
+
+# Try importing ONNX Runtime independently
+try:
+    import onnxruntime
+    ONNXRUNTIME_AVAILABLE = True
+    logging.info("ONNX Runtime successfully imported")
+except ImportError as e:
+    logging.warning(f"ONNX Runtime not available: {e}")
+    onnxruntime = None
+
 sys.path.append("./owl-vaes")
 from owl_vaes.utils.proxy_init import load_proxy_model
 from owl_vaes.models import get_model_cls
 from owl_vaes.configs import Config
 
+# -------------------------------
+# Environment variable controls (Phase 4)
+# -------------------------------
+TENSORRT_ENABLED = os.environ.get('TENSORRT_ENABLED', 'true').lower() == 'true'
+TENSORRT_PRECISION = os.environ.get('TENSORRT_PRECISION', 'fp16')
+TENSORRT_CACHE_DIR = Path(os.environ.get('TENSORRT_CACHE_DIR', './tensorrt_cache'))
+try:
+    TENSORRT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+except Exception:
+    pass
+
+@dataclass
+class TensorRTConfig:
+    """Configuration for TensorRT optimization settings."""
+    precision: str = "fp16"  # Options: fp32, fp16, int8
+    max_batch_size: int = 8
+    max_workspace_size: int = 2 << 30  # 2GB default
+    optimization_level: int = 3  # 0-5, higher = more aggressive optimization
+    use_dla: bool = False  # Deep Learning Accelerator for edge devices
+    
+    def __post_init__(self):
+        if self.precision not in ["fp32", "fp16", "int8"]:
+            raise ValueError(f"Invalid precision: {self.precision}. Must be fp32, fp16, or int8")
+        if not 0 <= self.optimization_level <= 5:
+            raise ValueError(f"Invalid optimization_level: {self.optimization_level}. Must be 0-5")
+
+def get_engine_cache_path(model_name: str, config: Optional[TensorRTConfig] = None) -> Path:
+    """
+    Generate cache path for TensorRT engine files.
+    
+    Args:
+        model_name: Name/identifier for the model
+        config: TensorRT configuration for cache path generation
+        
+    Returns:
+        Path object for the engine cache file
+    """
+    # Get cache directory from environment variable or resolved default
+    cache_path = Path(os.getenv("TENSORRT_CACHE_DIR", str(TENSORRT_CACHE_DIR)))
+    
+    # Create cache directory if it doesn't exist
+    cache_path.mkdir(parents=True, exist_ok=True)
+    
+    # Generate filename based on model and config
+    if config:
+        filename = f"{model_name}_{config.precision}_bs{config.max_batch_size}_opt{config.optimization_level}.engine"
+    else:
+        filename = f"{model_name}_default.engine"
+    
+    return cache_path / filename
+
+# Initialize TensorRT engine cache directory
+if TENSORRT_AVAILABLE:
+    try:
+        cache_dir = Path(os.getenv("TENSORRT_CACHE_DIR", os.path.expanduser("~/.cache/tensorrt")))
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        logging.info(f"TensorRT cache directory initialized at: {cache_dir}")
+    except Exception as e:
+        logging.warning(f"Failed to initialize TensorRT cache directory: {e}")
+else:
+    logging.debug("TensorRT cache directory initialization skipped (TensorRT not available)")
+
+
+# -------------------------------
+# ONNX Runtime utilities used by export and validation
+# -------------------------------
+def _onnxruntime_session(model_path: Path):
+    """Create an ONNX Runtime session with CUDA if available, CPU as fallback.
+
+    This local helper prevents optional test utilities from being a hard runtime
+    dependency and mirrors the logic used in tests.
+    """
+    if not ONNXRUNTIME_AVAILABLE:
+        raise RuntimeError("ONNX Runtime is not available; cannot create session")
+    import onnxruntime as ort
+    # Prefer CUDA, fallback to CPU
+    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+    session = ort.InferenceSession(str(model_path), providers=providers)
+    return session
+
+
+# -------------------------------
+# Performance monitoring utilities (Phase 4)
+# -------------------------------
+@contextmanager
+def measure_inference_time(operation_name: str):
+    start_time = time.perf_counter()
+    try:
+        yield
+    finally:
+        end_time = time.perf_counter()
+        elapsed_ms = (end_time - start_time) * 1000.0
+        logging.info(f"{operation_name} inference time: {elapsed_ms:.2f}ms")
+
+
+# -------------------------------
+# Model preparation utilities (Phase 2)
+# -------------------------------
+def _replace_child_module(root: torch.nn.Module, qualified_name: str, new_module: torch.nn.Module) -> None:
+    """Safely replace a child module by its dotted qualified name.
+
+    Example: qualified_name="decoder.blocks.3.attn" will traverse attributes
+    and replace the final child with new_module.
+    """
+    parts = qualified_name.split(".")
+    parent = root
+    for part in parts[:-1]:
+        parent = getattr(parent, part)
+    setattr(parent, parts[-1], new_module)
+def unwrap_weight_norm(model: torch.nn.Module) -> torch.nn.Module:
+    """Remove weight_norm parametrizations in-place where present.
+
+    This is written to be safe: if a submodule is not weight-normalized,
+    it is skipped. The function returns the same model instance for
+    convenient chaining.
+    """
+    try:
+        from torch.nn.utils import remove_weight_norm  # Local import to avoid hard dependency at import time
+    except Exception:
+        # If unavailable, act as a no-op
+        logging.debug("torch.nn.utils.remove_weight_norm not available; skipping unwrap_weight_norm")
+        return model
+
+    removed_count = 0
+    for name, module in model.named_modules():
+        # Common parameter name is "weight"; attempt removal optimistically
+        try:
+            remove_weight_norm(module)
+            removed_count += 1
+        except Exception:
+            # Not weight-normalized or removal failed; ignore safely
+            continue
+
+    if removed_count > 0:
+        logging.info(f"unwrap_weight_norm: removed weight_norm from {removed_count} submodules")
+    else:
+        logging.debug("unwrap_weight_norm: no weight_norm parametrizations found")
+    return model
+
+
+def replace_landscape_operations(model: torch.nn.Module) -> torch.nn.Module:
+    """Replace custom landscape/square operations with ONNX-friendly ops."""
+    replaced_count = 0
+    enable_replacement = os.getenv("REPLACE_LANDSCAPE", "0") == "1"
+    for module_name, module in model.named_modules():
+        name_lower = module_name.lower()
+        if "landscape" in name_lower or "square" in name_lower:
+            logging.info(f"Landscape candidate detected: {module_name} (type: {type(module).__name__})")
+            if not enable_replacement:
+                continue
+            # Prefer explicit size hints if available
+            target_h = getattr(module, "target_height", None)
+            target_w = getattr(module, "target_width", None)
+            if isinstance(target_h, int) and isinstance(target_w, int):
+                try:
+                    new_module = torch.nn.Upsample(size=(target_h, target_w), mode="bilinear", align_corners=False)
+                    _replace_child_module(model, module_name, new_module)
+                    replaced_count += 1
+                    logging.info(f"replace_landscape_operations: replaced '{module_name}' with nn.Upsample({target_h}x{target_w})")
+                    continue
+                except Exception as e:
+                    logging.debug(f"replace_landscape_operations: failed to replace '{module_name}': {e}")
+            logging.debug(f"replace_landscape_operations: no safe replacement for '{module_name}' (no-op)")
+            continue
+
+    if replaced_count > 0:
+        logging.info(f"replace_landscape_operations: replaced {replaced_count} modules with ONNX-friendly ops")
+    else:
+        logging.debug("replace_landscape_operations: no landscape-related modules replaced (no-op)")
+    # Track count on model for diagnostics
+    try:
+        setattr(model, "landscape_replaced_count", replaced_count)
+    except Exception:
+        pass
+    return model
+
+
+def replace_sana_operations(model: torch.nn.Module) -> torch.nn.Module:
+    """Replace custom SANA pixel shuffle ops with standard equivalents.
+
+    This scans for modules exposing attributes suggestive of custom shuffle
+    implementations and logs candidates. It is currently a safe no-op until
+    concrete module types are identified.
+    """
+    replaced_count = 0
+    enable_replacement = os.getenv("REPLACE_SANA", "0") == "1"
+    for module_name, module in model.named_modules():
+        if hasattr(module, "pixel_shuffle") or hasattr(module, "pixel_unshuffle"):
+            if not enable_replacement:
+                logging.debug(f"replace_sana_operations: candidate '{module_name}' detected (disabled)")
+                continue
+            upscale = getattr(module, "upscale_factor", None) or getattr(module, "factor", None)
+            downscale = getattr(module, "downscale_factor", None) or getattr(module, "factor", None)
+            try:
+                if upscale and isinstance(upscale, int):
+                    new_module = torch.nn.PixelShuffle(upscale)
+                    _replace_child_module(model, module_name, new_module)
+                    replaced_count += 1
+                    logging.info(f"replace_sana_operations: replaced '{module_name}' with nn.PixelShuffle({upscale})")
+                    continue
+                if downscale and isinstance(downscale, int):
+                    new_module = torch.nn.PixelUnshuffle(downscale)
+                    _replace_child_module(model, module_name, new_module)
+                    replaced_count += 1
+                    logging.info(f"replace_sana_operations: replaced '{module_name}' with nn.PixelUnshuffle({downscale})")
+                    continue
+            except Exception as e:
+                logging.debug(f"replace_sana_operations: failed to replace '{module_name}': {e}")
+            logging.debug(f"replace_sana_operations: no safe replacement for '{module_name}' (no-op)")
+            continue
+
+    if replaced_count > 0:
+        logging.info(f"replace_sana_operations: replaced {replaced_count} modules with standard shuffles")
+    else:
+        logging.debug("replace_sana_operations: no SANA-related modules replaced (no-op)")
+    # Track count on model for diagnostics
+    try:
+        setattr(model, "sana_replaced_count", replaced_count)
+    except Exception:
+        pass
+    return model
+
+
+def simplify_attention_blocks(model: torch.nn.Module) -> torch.nn.Module:
+    """Simplify attention blocks to improve ONNX export compatibility.
+
+    By default, this function is a no-op that only logs candidates. Actual
+    simplification (such as swapping to torch.nn.Identity) should be enabled
+    once specific attention module types are confirmed safe to modify.
+    """
+    simplified_count = 0
+    enable_simplify = os.getenv("SIMPLIFY_ATTENTION", "0") == "1"
+    for module_name, module in model.named_modules():
+        name_lower = module_name.lower()
+        if "attn" in name_lower or hasattr(module, "attention"):
+            if not enable_simplify:
+                logging.debug(f"simplify_attention_blocks: candidate '{module_name}' detected (disabled)")
+                continue
+            if isinstance(module, torch.nn.MultiheadAttention):
+                logging.debug(f"simplify_attention_blocks: skipping standard MultiheadAttention '{module_name}'")
+                continue
+            try:
+                identity = torch.nn.Identity()
+                _replace_child_module(model, module_name, identity)
+                simplified_count += 1
+                logging.info(f"simplify_attention_blocks: replaced '{module_name}' with nn.Identity()")
+            except Exception as e:
+                logging.debug(f"simplify_attention_blocks: failed to simplify '{module_name}': {e}")
+            continue
+
+    if simplified_count > 0:
+        logging.info(f"simplify_attention_blocks: simplified {simplified_count} attention blocks")
+    else:
+        logging.debug("simplify_attention_blocks: no attention blocks simplified (no-op)")
+    # Track count on model for diagnostics
+    try:
+        setattr(model, "attn_simplified_count", simplified_count)
+    except Exception:
+        pass
+    return model
+
+
+def export_video_decoder_to_onnx(decoder_model: torch.nn.Module, output_path: Path, batch_size: int = 8, validate_ort: bool = True) -> bool:
+    """Export DCAE-like video decoder to ONNX and optionally validate with ONNX Runtime.
+
+    Steps:
+    - Prepare model: CPU, eval, unwrap weight norm, optional replacements
+    - Export with opset 17, dynamic batch axis
+    - Validate ONNX model structure
+    - Optionally run inference in ORT and compare with PyTorch (1e-2 tolerance)
+    """
+    if not ONNX_AVAILABLE:
+        raise RuntimeError("ONNX is not available; cannot export decoder to ONNX")
+    if validate_ort and not ONNXRUNTIME_AVAILABLE:
+        raise RuntimeError("ONNX Runtime is not available; cannot validate ONNX export")
+
+    # Prepare model for export with custom operation replacements enabled
+    print("  Export: Preparing model...")
+    prepared_model = decoder_model
+    try:
+        # Run export and validation on CPU in float32 for stability/perf
+        print("  Export: Moving model to CPU and converting to float32...")
+        prepared_model = prepared_model.cpu().float().eval()
+        
+        print("  Export: Unwrapping weight norm...")
+        prepared_model = unwrap_weight_norm(prepared_model)
+        
+        # Enable custom operation replacements for ONNX compatibility
+        import os
+        print("  Export: Enabling custom operation replacements...")
+        os.environ["REPLACE_LANDSCAPE"] = "1"  # Enable landscape/square operation replacement
+        os.environ["REPLACE_SANA"] = "1"       # Enable SANA pixel shuffle replacement
+        os.environ["SIMPLIFY_ATTENTION"] = "1" # Enable attention block simplification
+        
+        print("  Export: Replacing landscape operations...")
+        prepared_model = replace_landscape_operations(prepared_model)
+        print("  Export: Replacing SANA operations...")
+        prepared_model = replace_sana_operations(prepared_model)
+        print("  Export: Simplifying attention blocks...")
+        prepared_model = simplify_attention_blocks(prepared_model)
+        
+        print("  Export: Model preparation completed")
+        logging.info("Custom operation replacements enabled for ONNX export")
+    except Exception as e:
+        print(f"  Export: Model preparation warning: {e}")
+        logging.warning(f"Model preparation encountered an issue (continuing): {e}")
+
+    # Add prints to diagnose replacements
+    print(f"  Export: Replaced {getattr(prepared_model, 'landscape_replaced_count', 0)} landscape ops")
+    print(f"  Export: Replaced {getattr(prepared_model, 'sana_replaced_count', 0)} SANA ops")
+    print(f"  Export: Simplified {getattr(prepared_model, 'attn_simplified_count', 0)} attention blocks")
+
+    # Video latent input: [batch, 128, 8, 8]
+    latent_channels = 128
+    latent_h = 8
+    latent_w = 8
+    # Use float32 inputs to match prepared model dtype on CPU
+    dummy_input = torch.randn(batch_size, latent_channels, latent_h, latent_w, dtype=torch.float32)
+
+    with torch.no_grad():
+        pytorch_output = prepared_model(dummy_input).float()
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Export with TensorRT-friendly settings
+        print("  Export: Starting ONNX export...")
+        try:
+            from onnxruntime.tools import pytorch_export_contrib_ops
+            pytorch_export_contrib_ops.register()
+            print("  Export: Registered ONNX Runtime contrib ops")
+        except ImportError:
+            print("  Export: onnxruntime.tools not available")
+            logging.warning("onnxruntime.tools not available, proceeding without contrib ops")
+        
+        try:
+            print("  Export: Calling torch.onnx.export...")
+            torch.onnx.export(
+                prepared_model,
+                dummy_input,
+                str(output_path),
+                input_names=["latent_video"],
+                output_names=["decoded_video"],
+                dynamic_axes={"latent_video": {0: "batch_size"}, "decoded_video": {0: "batch_size"}},
+                opset_version=17,
+                do_constant_folding=True,
+                dynamo=False,
+                verbose=True
+            )
+            print("  Export: torch.onnx.export completed")
+            # Save model with external weights
+            data_filename = "video_decoder.onnx.data"
+            onnx_model = onnx.load(str(output_path))
+            onnx.save_model(onnx_model, str(output_path), save_as_external_data=True, location=data_filename, size_threshold=1024)
+            data_path = output_path.parent / data_filename
+            if data_path.exists():
+                logging.info(f"External weights saved at: {data_path}")
+            else:
+                logging.warning(f"External weights file not found: {data_path}")
+        except Exception as e:
+            logging.error(f"ONNX export failed: {e}", exc_info=True)
+            print(f"  Export: torch.onnx.export failed: {e}")
+            raise
+
+    # Structural validation
+    print("  Export: Validating ONNX model structure...")
+    onnx_model = onnx.load(str(output_path))
+    onnx.checker.check_model(onnx_model)
+    print("  Export: ONNX model structure validated")
+
+    unsupported_ops = [node.op_type for node in onnx_model.graph.node if node.op_type.startswith('ATen')]
+    if unsupported_ops:
+        logging.warning(f"Unsupported ops in ONNX: {unsupported_ops}")
+
+    # ONNX Runtime validation (only if requested)
+    if validate_ort:
+        print("  Export: Running ONNX Runtime validation...")
+        try:
+            import numpy as np
+            sess = _onnxruntime_session(output_path)  # Use updated session function
+            onnx_input = dummy_input.detach().cpu().numpy().astype(np.float32)
+            ort_out = sess.run(None, {"latent_video": onnx_input})
+            ort_output = torch.from_numpy(ort_out[0]).float()
+            max_diff = torch.max(torch.abs(pytorch_output - ort_output)).item()
+            tolerance = 1e-1  # Relaxed for TensorRT prep
+            if max_diff >= tolerance:
+                logging.warning(f"ONNX validation tolerance exceeded: max_diff={max_diff:.6f} >= {tolerance}")
+            else:
+                logging.info(f"ONNX validation passed: max_diff={max_diff:.6f} < {tolerance}")
+            print(f"  Export: ONNX Runtime validation completed (max_diff={max_diff:.6f})")
+        except Exception as e:
+            logging.error(f"ONNX Runtime validation failed: {e}")
+            print(f"  Export: ONNX Runtime validation failed: {e}")
+            return False  # Fail if validation fails for TensorRT readiness
+
+    logging.info(f"Video decoder ONNX export successful: {output_path}")
+    return True
+
+# -------------------------------
+# Phase 3: TensorRT Engine Creation
+# -------------------------------
+def build_tensorrt_engine(onnx_path: Path, engine_path: Path, config: TensorRTConfig) -> bool:
+    """Build and cache a TensorRT engine from an ONNX model.
+
+    Args:
+        onnx_path: Path to the ONNX model file.
+        engine_path: Destination path for the serialized TensorRT engine.
+        config: TensorRT build configuration.
+
+    Returns:
+        True on success, False otherwise.
+    """
+    if not TENSORRT_AVAILABLE:
+        logging.error("TensorRT is not available; cannot build engine")
+        return False
+
+    onnx_path = Path(onnx_path)
+    engine_path = Path(engine_path)
+    engine_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Respect cache if engine already exists
+    if engine_path.exists():
+        logging.info(f"Loading cached TensorRT engine: {engine_path}")
+        return True
+
+    try:
+        logger = trt.Logger(trt.Logger.INFO)
+        builder = trt.Builder(logger)
+        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+        parser = trt.OnnxParser(network, logger)
+
+        # Parse ONNX from file so external data (.onnx.data) is discoverable by TensorRT
+        # Using parse_from_file avoids failures like "Failed to open file: <model>.onnx.data".
+        if not parser.parse_from_file(str(onnx_path)):
+            num_errors = parser.num_errors
+            for i in range(num_errors):
+                logging.error(f"TensorRT ONNX parse error[{i}]: {parser.get_error(i)}")
+            return False
+
+        # Configure builder
+        builder_config = builder.create_builder_config()
+        builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, int(config.max_workspace_size))
+
+        if config.precision == "fp16":
+            builder_config.set_flag(trt.BuilderFlag.FP16)
+        elif config.precision == "int8":
+            builder_config.set_flag(trt.BuilderFlag.INT8)
+
+        # Optimization profile for video decoder latent input name used in export
+        profile = builder.create_optimization_profile()
+        profile.set_shape(
+            "latent_video",
+            min=(1, 128, 8, 8),
+            opt=(config.max_batch_size, 128, 8, 8),
+            max=(config.max_batch_size, 128, 8, 8),
+        )
+        builder_config.add_optimization_profile(profile)
+
+        # Build serialized engine
+        serialized_engine = builder.build_serialized_network(network, builder_config)
+        if serialized_engine is None:
+            logging.error("Failed to build TensorRT engine (serialized engine is None)")
+            return False
+
+        with open(engine_path, "wb") as f:
+            f.write(serialized_engine)
+
+        # Optional cache pruning: honor TENSORRT_MAX_CACHE_SIZE_MB if set
+        try:
+            max_mb = int(os.environ.get("TENSORRT_MAX_CACHE_SIZE_MB", "0"))
+        except ValueError:
+            max_mb = 0
+        if max_mb > 0:
+            _prune_cache_dir(engine_path.parent, max_mb)
+
+        logging.info(f"TensorRT engine built and cached at: {engine_path}")
+        return True
+    except Exception as e:
+        logging.error(f"TensorRT engine build failed: {e}", exc_info=True)
+        return False
+
+def _prune_cache_dir(cache_dir: Path, max_size_mb: int) -> None:
+    """Prune oldest files in cache_dir to keep total size under max_size_mb."""
+    try:
+        max_bytes = max_size_mb * 1024 * 1024
+        files = [p for p in cache_dir.glob("*") if p.is_file()]
+        files.sort(key=lambda p: p.stat().st_mtime)  # oldest first
+        def total():
+            return sum(p.stat().st_size for p in files if p.exists())
+        while total() > max_bytes and files:
+            victim = files.pop(0)
+            try:
+                victim.unlink(missing_ok=True)
+                logging.info(f"Pruned cache file: {victim}")
+            except Exception as e:
+                logging.warning(f"Failed to prune {victim}: {e}")
+    except Exception as e:
+        logging.warning(f"Cache pruning error: {e}")
+
+class TensorRTVideoDecoder:
+    """Lightweight TensorRT wrapper for the video decoder engine.
+
+    Expects the engine built from the ONNX exported by export_video_decoder_to_onnx,
+    with input tensor named "latent_video" and output tensor named "decoded_video".
+    """
+    def __init__(self, engine_path: Path):
+        if not TENSORRT_AVAILABLE:
+            raise RuntimeError("TensorRT not available")
+        self.logger = trt.Logger(trt.Logger.WARNING)
+        with open(engine_path, "rb") as f:
+            runtime = trt.Runtime(self.logger)
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+        self.context = self.engine.create_execution_context()
+        # Cache bindings
+        # Tensor indices are stable: 0=input, 1=output for our single-input/output net
+        self.input_name = "latent_video"
+        self.output_name = "decoded_video"
+
+    def __call__(self, latent_video: torch.Tensor) -> torch.Tensor:
+        # Ensure CUDA, fp32 for TRT
+        if not latent_video.is_cuda:
+            latent_video = latent_video.cuda()
+        if latent_video.dtype != torch.float32:
+            latent_video = latent_video.float()
+
+        b, c, h, w = latent_video.shape
+        # Set input shape for explicit batch network
+        self.context.set_input_shape(self.input_name, (b, c, h, w))
+
+        # Resolve binding tensor names (TRT 10.x APIs)
+        input_binding = self.engine.get_tensor_name(0)
+        output_binding = self.engine.get_tensor_name(1)
+
+        # Allocate output using reported shape
+        out_shape = tuple(self.context.get_tensor_shape(output_binding))
+        output_tensor = torch.empty(out_shape, dtype=torch.float32, device="cuda")
+
+        # Bind addresses
+        self.context.set_tensor_address(input_binding, latent_video.data_ptr())
+        self.context.set_tensor_address(output_binding, output_tensor.data_ptr())
+
+        # Execute on current stream
+        ok = self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream)
+        if not ok:
+            raise RuntimeError("TensorRT execution failed")
+        return output_tensor.bfloat16()
+
 def _get_decoder_only():
     model = load_proxy_model(
         "../checkpoints/128x_proxy_titok.yml",
@@ -20,40 +604,110 @@ def _get_decoder_only():
     return model
 
 def get_decoder_only(vae_id, cfg_path, ckpt_path):
-        if vae_id == "dcae":
-            model_id = "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"
-            model = AutoencoderDC.from_pretrained(model_id).bfloat16().cuda().eval()
-            del model.encoder
-            return model.decoder
-        else:
-            cfg = Config.from_yaml(cfg_path).model
-            model = get_model_cls(cfg.model_id)(cfg)
-            try:
-                model.load_state_dict(torch.load(ckpt_path, map_location='cpu',weights_only=False))
-            except:
-                model.decoder.load_state_dict(torch.load(ckpt_path, map_location='cpu',weights_only=False))
-            del model.encoder
-            model = model.decoder
-            model = model.bfloat16().cuda().eval()
-            return model
+    """Load the decoder module using a config and checkpoint.
+
+    For `dcae`, prefer the provided config and checkpoint (cod_yt_v2 base.yml
+    and resume_ckpt) instead of historical diffusers references.
+    """
+    cfg = Config.from_yaml(cfg_path).model
+    model = get_model_cls(cfg.model_id)(cfg)
+    state = torch.load(ckpt_path, map_location='cpu', weights_only=False)
+    try:
+        model.load_state_dict(state)
+    except Exception:
+        model.decoder.load_state_dict(state)
+    del model.encoder
+    model = model.decoder
+    # Keep on CPU in float32 by default for export/tests; caller can move to CUDA if needed
+    model = model.float().cpu().eval()
+    return model
 
 @torch.no_grad()
-def make_batched_decode_fn(decoder, batch_size = 8):
-    def decode(x):
+def make_batched_decode_fn(decoder, batch_size: int = 8, use_tensorrt: bool = True):
+    """Create a batched decode function with optional TensorRT acceleration.
+
+    Behavior:
+    - If TensorRT is available and use_tensorrt is True, attempt to locate/build a
+      cached engine and run inference through TensorRTVideoDecoder.
+    - If anything fails, seamlessly fall back to the original PyTorch path.
+    """
+    tensorrt_decoder = None
+    compiled_decoder = None
+    # Read env dynamically so tests/users can toggle without reloading the module
+    env_enabled = os.environ.get("TENSORRT_ENABLED", "true").lower() == "true"
+    env_precision = os.environ.get("TENSORRT_PRECISION", TENSORRT_PRECISION)
+    env_cache_dir = Path(os.environ.get("TENSORRT_CACHE_DIR", str(TENSORRT_CACHE_DIR)))
+    try:
+        env_cache_dir.mkdir(parents=True, exist_ok=True)
+    except Exception:
+        pass
+
+    # Prepare a compiled PyTorch fallback decoder (max-autotune) up front.
+    # If compilation is unavailable or fails, we will transparently use the
+    # original decoder implementation.
+    try:
+        compiled_decoder = torch.compile(decoder, mode="max-autotune", dynamic=False, fullgraph=True)
+        logging.info("Prepared compiled PyTorch fallback decoder (mode=max-autotune, dynamic=False, fullgraph=True)")
+    except Exception as e:
+        logging.debug(f"torch.compile unavailable or failed; using eager decoder fallback: {e}")
+
+    if use_tensorrt and TENSORRT_AVAILABLE and env_enabled:
+        try:
+            # Prefer explicit engine path via env for reproducibility
+            engine_env = os.getenv("TENSORRT_ENGINE_PATH")
+            if engine_env:
+                engine_path = Path(engine_env)
+            else:
+                # Use deterministic cache location based on model key and shapes
+                model_key = "dcae_video_decoder"
+                engine_path = env_cache_dir / "dcae_video_decoder.trt"
+                if not engine_path.exists():
+                    # Fallback to generic cache path helper
+                    engine_path = get_engine_cache_path(model_key, TensorRTConfig(max_batch_size=batch_size, precision=env_precision))
+
+            if not engine_path.exists():
+                # Build engine on the fly: export ONNX then build TRT
+                onnx_path = engine_path.with_suffix('.onnx')
+                if export_video_decoder_to_onnx(decoder, onnx_path, batch_size=batch_size, validate_ort=False):
+                    build_tensorrt_engine(onnx_path, engine_path, TensorRTConfig(max_batch_size=batch_size, precision=env_precision))
+
+            if engine_path.exists():
+                tensorrt_decoder = TensorRTVideoDecoder(engine_path)
+                logging.info(f"Using TensorRT engine: {engine_path}")
+        except Exception as e:
+            logging.warning(f"TensorRT initialization failed: {e}. Falling back to PyTorch decoder.")
+
+    def decode(x: torch.Tensor) -> torch.Tensor:
         # x is [b,n,c,h,w]
-        b,n,c,h,w = x.shape
-        x = x.view(b*n,c,h,w).contiguous()
+        b, n, c, h, w = x.shape
+        x = x.view(b*n, c, h, w).contiguous()
+
+        if tensorrt_decoder is not None:
+            try:
+                batches = x.contiguous().split(batch_size)
+                batch_out = []
+                for batch in batches:
+                    with measure_inference_time("TensorRT Video Decode"):
+                        batch_out.append(tensorrt_decoder(batch))
+                x = torch.cat(batch_out)
+                _, c, h, w = x.shape
+                x = x.view(b, n, c, h, w).contiguous()
+                return x
+            except Exception as e:
+                logging.warning(f"TensorRT inference failed: {e}. Falling back to PyTorch for this call.")
 
+        # PyTorch fallback path (compiled if available)
         batches = x.split(batch_size)
         batch_out = []
         for batch in batches:
-            batch_out.append(decoder(batch).bfloat16())
-
-        x = torch.cat(batch_out) # [b*n,c,h,w]
-        _,c,h,w = x.shape
-        x = x.view(b,n,c,h,w).contiguous()
-
+            with measure_inference_time("PyTorch Video Decode"):
+                runner = compiled_decoder if compiled_decoder is not None else decoder
+                batch_out.append(runner(batch).bfloat16())
+        x = torch.cat(batch_out)
+        _, c, h, w = x.shape
+        x = x.view(b, n, c, h, w).contiguous()
         return x
+
     return decode
 
 @torch.no_grad()
@@ -72,4 +726,15 @@ def decode(x):
         x = x.transpose(-1,-2).contiguous() # [b,n,2]
 
         return x
-    return decode
\ No newline at end of file
+    return decode
+
+
+def get_cod_yt_v2_decoder() -> torch.nn.Module:
+    """Convenience loader for the COD-YT v2 DCAE decoder.
+
+    Uses config at /root/owl-wms/owl-vaes/configs/cod_yt_v2/base.yml and
+    checkpoint at /root/owl-wms/owl-vaes/checkpoints/cod_yt_v2/step_515000.pt.
+    """
+    cfg_path = "/root/owl-wms/owl-vaes/configs/cod_yt_v2/base.yml"
+    ckpt_path = "/root/owl-wms/owl-vaes/checkpoints/cod_yt_v2/step_515000.pt"
+    return get_decoder_only("dcae", cfg_path, ckpt_path)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d80be89e..92427ade 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,7 @@ alias-free-torch
 git+https://github.com/sypsyp97/convnext_perceptual_loss
 wandb[media]
 torchtyping
+onnx>=1.17.0
+onnxruntime-gpu>=1.19.0
+--extra-index-url https://pypi.nvidia.com
+tensorrt