diff --git a/tests/test_foundation_stereo_trt.py b/tests/test_foundation_stereo_trt.py new file mode 100644 index 00000000..699c753c --- /dev/null +++ b/tests/test_foundation_stereo_trt.py @@ -0,0 +1,114 @@ +import asyncio +import os +import sys + +import cv2 +import numpy as np + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + +from tool.foundation_stereo_trt import FoundationStereoTRT + + +def _load_calib(calib_path: str): + with open(calib_path, "r") as f: + lines = [ln.strip() for ln in f.readlines() if ln.strip()] + + if len(lines) < 5: + raise RuntimeError(f"Unexpected calib file format in {calib_path}") + + k_rows = [] + for i in range(1, 4): + k_rows.append([float(v) for v in lines[i].split()]) + k = np.array(k_rows, dtype=np.float32) + + baseline_line = lines[-1] + _, val_str = baseline_line.split(":", 1) + baseline = float(val_str.strip()) + return k, baseline + + +def _save_visuals(data_dir: str, disp: np.ndarray, depth: np.ndarray): + np.save(os.path.join(data_dir, "foundation_disp.npy"), disp) + np.save(os.path.join(data_dir, "foundation_depth.npy"), depth) + + if np.isfinite(disp).any(): + dmin = np.nanmin(disp[np.isfinite(disp)]) + dmax = np.nanmax(disp[np.isfinite(disp)]) + if dmax > dmin: + disp_norm = (disp - dmin) / (dmax - dmin) + else: + disp_norm = np.zeros_like(disp, dtype=np.float32) + else: + disp_norm = np.zeros_like(disp, dtype=np.float32) + disp_u8 = np.clip(disp_norm * 255.0, 0, 255).astype(np.uint8) + cv2.imwrite( + os.path.join(data_dir, "foundation_disp_vis.png"), + cv2.applyColorMap(disp_u8, cv2.COLORMAP_TURBO), + ) + + valid = np.isfinite(depth) & (depth > 0) + if valid.any(): + zmin = np.nanmin(depth[valid]) + zmax = np.nanmax(depth[valid]) + if zmax > zmin: + depth_norm = (np.clip(depth, zmin, zmax) - zmin) / (zmax - zmin) + else: + depth_norm = np.zeros_like(depth, dtype=np.float32) + else: + depth_norm = np.zeros_like(depth, dtype=np.float32) + depth_u8 = np.clip(depth_norm * 255.0, 0, 255).astype(np.uint8) + cv2.imwrite( + os.path.join(data_dir, "foundation_depth_vis.png"), + cv2.applyColorMap(depth_u8, cv2.COLORMAP_VIRIDIS), + ) + + +def _run_foundation_case(data_dir: str): + left_path = os.path.join(data_dir, "left.png") + right_path = os.path.join(data_dir, "right.png") + calib_path = os.path.join(data_dir, "calib.txt") + + assert os.path.exists(left_path), f"Missing left image: {left_path}" + assert os.path.exists(right_path), f"Missing right image: {right_path}" + assert os.path.exists(calib_path), f"Missing calib file: {calib_path}" + + left = cv2.imread(left_path, cv2.IMREAD_GRAYSCALE) + right = cv2.imread(right_path, cv2.IMREAD_GRAYSCALE) + assert left is not None and right is not None, "Failed to read stereo images" + assert left.shape == right.shape, f"Left/right shapes mismatch: {left.shape} vs {right.shape}" + + k, baseline = _load_calib(calib_path) + fx = float(k[0, 0]) + + stereo_engine = FoundationStereoTRT() + disp, depth = asyncio.run( + stereo_engine.infer( + left, + right, + np.array([[baseline]], dtype=np.float32), + np.array([[fx]], dtype=np.float32), + ) + ) + + assert disp.shape == left.shape, f"Disparity shape mismatch: {disp.shape} vs {left.shape}" + assert depth.shape == left.shape, f"Depth shape mismatch: {depth.shape} vs {left.shape}" + assert np.isfinite(disp).any(), "No finite disparity values" + assert np.isfinite(depth).any(), "No finite depth values" + + _save_visuals(data_dir, disp, depth) + + +def test_foundation_stereo_trt_with_looper_data(): + _run_foundation_case("/tinynav/tests/data/looper") + + +def test_foundation_stereo_trt_with_realsense_data(): + _run_foundation_case("/tinynav/tests/data/realsense") + + +if __name__ == "__main__": + test_foundation_stereo_trt_with_looper_data() + print("FoundationStereo TRT looper test passed.") + test_foundation_stereo_trt_with_realsense_data() + print("FoundationStereo TRT realsense test passed.") diff --git a/tinynav/models/Makefile b/tinynav/models/Makefile index 97788a35..7d2c2011 100644 --- a/tinynav/models/Makefile +++ b/tinynav/models/Makefile @@ -13,6 +13,7 @@ DINO_ONNX := dinov2_base_224x224_fp16.onnx SUPERPOINT_ONNX := superpoint_fp16_dynamic.onnx LIGHTGLUE_ONNX := lightglue_fp16.onnx RETINIFY_ONNX := retinify_0_1_5_dynamic.onnx +FOUNDATION_STEREO_ONNX := foundation_stereo.onnx DINO_ENGINE := $(basename $(DINO_ONNX))_$(ARCH).plan @@ -23,10 +24,11 @@ LIGHTGLUE_ENGINE := $(basename $(LIGHTGLUE_ONNX))_$(ARCH).plan # - Retinify: one profile, min/max shapes for RealSense (480x848) and Looper (640x544). SUPERPOINT_ENGINE := $(basename $(SUPERPOINT_ONNX))_$(ARCH).plan RETINIFY_ENGINE := $(basename $(RETINIFY_ONNX))_$(ARCH).plan +FOUNDATION_STEREO_ENGINE := $(basename $(FOUNDATION_STEREO_ONNX))_$(ARCH).plan ENGINES := $(DINO_ENGINE) $(LIGHTGLUE_ENGINE) $(SUPERPOINT_ENGINE) $(RETINIFY_ENGINE) -.PHONY: all dinov2 superpoint lightglue retinify clean help +.PHONY: all dinov2 superpoint lightglue retinify foundation_stereo clean help all: $(ENGINES) @@ -34,6 +36,7 @@ dinov2: $(DINO_ENGINE) superpoint: $(SUPERPOINT_ENGINE) lightglue: $(LIGHTGLUE_ENGINE) retinify: $(RETINIFY_ENGINE) +foundation_stereo: $(FOUNDATION_STEREO_ENGINE) help: @echo "Targets:" @@ -42,6 +45,7 @@ help: @echo " make superpoint # Build SuperPoint" @echo " make lightglue # Build LightGlue" @echo " make retinify # Build Retinify (dynamic, RealSense+Looper)" + @echo " make foundation_stereo # Build foundation stereo" @echo " make clean # Remove *_$(ARCH).plan" $(DINO_ENGINE): $(DINO_ONNX) @@ -73,6 +77,12 @@ $(RETINIFY_ENGINE): $(RETINIFY_ONNX) --layerPrecisions='/cost_volume/Unsqueeze:fp16,/cost_volume/post_process/conv_block3/conv_block3.0/BatchNormalization:fp32,/cost_volume/post_process/norm/norm.0/BatchNormalization:fp32,/disparity_refinement/disp_block3/disp_block3.2/block/block.0/BatchNormalization:fp32,/disparity_refinement/disp_block3/disp_block3.2/block/block.1/Relu:fp32,/disparity_refinement/disp_block5/disp_block5.9/bn/block/block.0/BatchNormalization:fp32,/disparity_refinement/disp_block5/disp_block5.9/bn/block/block.1/Relu:fp32,/disparity_refinement/disp_block5/disp_block5.10/block/block.0/BatchNormalization:fp32,/disparity_refinement/disp_block5/disp_block5.10/block/block.1/Relu:fp32,/disparity_refinement/upsample2_conv/upsample2_conv.0/BatchNormalization:fp32' \ --fp16 +# foundation_stereo.onnx export settings: +# --valid_iters 8 +# --max_disp 192 +$(FOUNDATION_STEREO_ENGINE): $(FOUNDATION_STEREO_ONNX) + $(TRTEXEC) --onnx=$< --saveEngine=$@ --fp16 + clean: rm -f *_$(ARCH).plan diff --git a/tinynav/models/foundation_stereo.onnx b/tinynav/models/foundation_stereo.onnx new file mode 100644 index 00000000..73fe77e3 --- /dev/null +++ b/tinynav/models/foundation_stereo.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e88c3b2119968e983c80c344cd25592d25c92645ecc9645df35e95de8bb70c +size 120982927 diff --git a/tool/foundation_stereo_trt.py b/tool/foundation_stereo_trt.py new file mode 100644 index 00000000..cfc85001 --- /dev/null +++ b/tool/foundation_stereo_trt.py @@ -0,0 +1,459 @@ +import argparse +import asyncio +import ctypes +import logging +import math +import platform +import sys + +import cv2 +import numpy as np +import tensorrt as trt +from codetiming import Timer +from cuda import cudart + +try: + import rclpy + from cv_bridge import CvBridge + from geometry_msgs.msg import TransformStamped + from message_filters import ApproximateTimeSynchronizer, Subscriber + from rclpy.node import Node + from sensor_msgs.msg import CameraInfo, Image, PointCloud2, PointField + from tf2_ros import TransformBroadcaster + + ROS_AVAILABLE = True +except ImportError: + ROS_AVAILABLE = False + Node = object + +numpy_to_ctypes = { + np.dtype(np.float32): ctypes.c_float, + np.dtype(np.float16): ctypes.c_uint16, + np.dtype(np.int8): ctypes.c_int8, + np.dtype(np.uint8): ctypes.c_uint8, + np.dtype(np.int32): ctypes.c_int32, + np.dtype(np.int64): ctypes.c_int64, + np.dtype(np.bool_): ctypes.c_bool, +} + + +def disparity_to_depth(disparity: np.ndarray, baseline: float, focal_length: float) -> np.ndarray: + disparity = np.asarray(disparity, dtype=np.float32) + baseline = float(np.asarray(baseline).reshape(-1)[0]) + focal_length = float(np.asarray(focal_length).reshape(-1)[0]) + + if baseline <= 0.0: + raise ValueError(f"baseline must be positive, got {baseline}") + if focal_length <= 0.0: + raise ValueError(f"focal_length must be positive, got {focal_length}") + + depth = np.zeros_like(disparity, dtype=np.float32) + valid = np.isfinite(disparity) & (disparity > 0.0) + depth[valid] = (baseline * focal_length) / disparity[valid] + return depth + + +class TRTBase: + def __init__(self, engine_path: str): + trt_logger = trt.Logger(trt.Logger.WARNING) + with open(engine_path, "rb") as f, trt.Runtime(trt_logger) as runtime: + self.engine = runtime.deserialize_cuda_engine(f.read()) + self.context = self.engine.create_execution_context() + self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers() + + def allocate_buffers(self): + inputs = [] + outputs = [] + bindings = [] + _, stream = cudart.cudaStreamCreate() + + for i in range(self.engine.num_io_tensors): + name = self.engine.get_tensor_name(i) + shape = tuple(self.engine.get_tensor_shape(name)) + dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(name))) + ctype_dtype = numpy_to_ctypes[dtype] + is_input = self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT + + size = trt.volume(shape) + nbytes = size * dtype.itemsize + + if "aarch64" in platform.machine(): + ptr = cudart.cudaHostAlloc(nbytes, cudart.cudaHostAllocMapped)[1] + host_mem = np.ctypeslib.as_array((ctype_dtype * size).from_address(ptr)) + host_mem = host_mem.view(dtype).reshape(shape) + device_ptr = cudart.cudaHostGetDevicePointer(ptr, 0)[1] + else: + ptr = cudart.cudaMallocHost(nbytes)[1] + host_mem = np.ctypeslib.as_array((ctype_dtype * size).from_address(ptr)) + host_mem = host_mem.view(dtype).reshape(shape) + device_ptr = cudart.cudaMalloc(nbytes)[1] + + bindings.append(int(device_ptr)) + io = { + "name": name, + "host": host_mem, + "device": device_ptr, + "shape": shape, + "nbytes": nbytes, + } + if is_input: + inputs.append(io) + else: + outputs.append(io) + + return inputs, outputs, bindings, stream + + async def run_graph(self): + if "aarch64" not in platform.machine(): + for inp in self.inputs: + cudart.cudaMemcpyAsync( + inp["device"], + inp["host"].ctypes.data, + inp["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, + self.stream, + ) + + for i in range(self.engine.num_io_tensors): + self.context.set_tensor_address(self.engine.get_tensor_name(i), self.bindings[i]) + self.context.execute_async_v3(stream_handle=self.stream) + + if "aarch64" not in platform.machine(): + for out in self.outputs: + cudart.cudaMemcpyAsync( + out["host"].ctypes.data, + out["device"], + out["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.stream, + ) + + _, event = cudart.cudaEventCreate() + cudart.cudaEventRecord(event, self.stream) + while cudart.cudaEventQuery(event)[0] == cudart.cudaError_t.cudaErrorNotReady: + await asyncio.sleep(0) + + return {out["name"]: out["host"].copy() for out in self.outputs} + + +class FoundationStereoTRT(TRTBase): + def __init__(self, engine_path=f"/tinynav/tinynav/models/foundation_stereo_{platform.machine()}.plan"): + super().__init__(engine_path) + if len(self.inputs) != 2: + raise RuntimeError(f"Expected 2 inputs (left/right), got {len(self.inputs)}") + if len(self.outputs) != 1: + raise RuntimeError(f"Expected 1 output (disp), got {len(self.outputs)}") + + self.left_idx = 0 if self.inputs[0]["name"] == "left" else 1 + self.right_idx = 1 - self.left_idx + self.output_name = self.outputs[0]["name"] + + self.net_h = int(self.inputs[self.left_idx]["shape"][2]) + self.net_w = int(self.inputs[self.left_idx]["shape"][3]) + + def _to_three_channel_float(self, image: np.ndarray) -> np.ndarray: + if image.ndim == 2: + image = np.repeat(image[:, :, None], 3, axis=2) + elif image.ndim == 3 and image.shape[2] == 1: + image = np.repeat(image, 3, axis=2) + elif image.ndim == 3 and image.shape[2] >= 3: + image = image[:, :, :3] + else: + raise ValueError(f"Unsupported image shape: {image.shape}") + image = image.astype(np.float32, copy=False) + return np.transpose(image, (2, 0, 1))[None, ...] + + async def infer(self, left_img, right_img, baseline, focal_length): + if left_img.shape[:2] != right_img.shape[:2]: + raise ValueError(f"Left/right shape mismatch: {left_img.shape} vs {right_img.shape}") + + h_in, w_in = left_img.shape[:2] + if (h_in, w_in) != (self.net_h, self.net_w): + raise ValueError( + f"Input image size must match engine size ({self.net_h}, {self.net_w}), " + f"got ({h_in}, {w_in}). Rebuild engine for this resolution." + ) + + left_tensor = self._to_three_channel_float(left_img) + right_tensor = self._to_three_channel_float(right_img) + + np.copyto(self.inputs[self.left_idx]["host"], left_tensor) + np.copyto(self.inputs[self.right_idx]["host"], right_tensor) + + results = await self.run_graph() + + disp_net = np.asarray(results[self.output_name], dtype=np.float32).reshape(self.net_h, self.net_w) + disp_net = np.clip(disp_net, 0.0, None) + disp = disp_net + + depth = disparity_to_depth(disp, baseline, focal_length) + return disp, depth + + +class FoundationStereoDepthNode(Node): + def __init__( + self, + engine_path=f"/tinynav/tinynav/models/foundation_stereo_{platform.machine()}.plan", + left_topic="/camera/camera/infra1/image_rect_raw", + right_topic="/camera/camera/infra2/image_rect_raw", + camera_info_topic="/camera/camera/infra2/camera_info", + keyframe_depth_topic="/keyframe/depth", + pointcloud_topic="/keyframe/pointcloud", + frame_id="camera_foundation_stereo", + world_frame_id="world", + publish_pointcloud=False, + min_depth=0.05, + max_depth=20.0, + queue_size=10, + slop=0.02, + ): + super().__init__("foundation_stereo_depth_node") + self.logger = logging.getLogger(__name__) + self.bridge = CvBridge() + self.stereo_engine = FoundationStereoTRT(engine_path=engine_path) + + self.baseline = None + self.focal_length = None + self.fx = None + self.fy = None + self.cx = None + self.cy = None + self.camera_info_msg = None + self.min_depth = float(min_depth) + self.max_depth = float(max_depth) + self.publish_pointcloud = bool(publish_pointcloud) + self.frame_id = frame_id + self.world_frame_id = world_frame_id + self.tf_broadcaster = TransformBroadcaster(self) + + self.camerainfo_sub = self.create_subscription(CameraInfo, camera_info_topic, self.info_callback, 10) + self.left_sub = Subscriber(self, Image, left_topic) + self.right_sub = Subscriber(self, Image, right_topic) + self.ts = ApproximateTimeSynchronizer([self.left_sub, self.right_sub], queue_size=queue_size, slop=slop) + self.ts.registerCallback(self.images_callback) + + self.keyframe_depth_pub = self.create_publisher(Image, keyframe_depth_topic, 10) + self.pointcloud_pub = None + if self.publish_pointcloud: + self.pointcloud_pub = self.create_publisher(PointCloud2, pointcloud_topic, 10) + self.logger.info( + "FoundationStereoDepthNode initialized. left=%s right=%s info=%s depth=%s pointcloud=%s enabled=%s", + left_topic, + right_topic, + camera_info_topic, + keyframe_depth_topic, + pointcloud_topic, + self.publish_pointcloud, + ) + + def _publish_world_to_foundation_tf(self, stamp): + # Static orientation offset around X so world Z points upward in visualization. + qx = math.sin(math.radians(-90.0) * 0.5) + qw = math.cos(math.radians(-90.0) * 0.5) + + tf_msg = TransformStamped() + tf_msg.header.stamp = stamp + tf_msg.header.frame_id = self.world_frame_id + tf_msg.child_frame_id = self.frame_id + tf_msg.transform.translation.x = 0.0 + tf_msg.transform.translation.y = 0.0 + tf_msg.transform.translation.z = 0.0 + tf_msg.transform.rotation.x = qx + tf_msg.transform.rotation.y = 0.0 + tf_msg.transform.rotation.z = 0.0 + tf_msg.transform.rotation.w = qw + self.tf_broadcaster.sendTransform(tf_msg) + + def info_callback(self, msg: CameraInfo): + if self.focal_length is not None: + return + k = np.array(msg.k, dtype=np.float32).reshape(3, 3) + fx = float(k[0, 0]) + fy = float(k[1, 1]) + cx = float(k[0, 2]) + cy = float(k[1, 2]) + tx = float(msg.p[3]) + baseline = -tx / fx + if baseline <= 0.0: + self.get_logger().warning(f"Baseline from CameraInfo is non-positive: {baseline:.6f}") + self.focal_length = fx + self.fx = fx + self.fy = fy + self.cx = cx + self.cy = cy + self.baseline = baseline + self.camera_info_msg = msg + self.get_logger().info( + f"Camera intrinsics received. fx={self.fx:.3f} fy={self.fy:.3f} " + f"cx={self.cx:.3f} cy={self.cy:.3f} baseline={self.baseline:.5f} m" + ) + self.destroy_subscription(self.camerainfo_sub) + + def _build_colorized_pointcloud(self, depth: np.ndarray, left_img: np.ndarray, stamp) -> PointCloud2: + h, w = depth.shape + valid = np.isfinite(depth) & (depth > self.min_depth) & (depth < self.max_depth) + ys, xs = np.where(valid) + z = depth[ys, xs].astype(np.float32) + + x = ((xs.astype(np.float32) - self.cx) * z) / self.fx + y = ((ys.astype(np.float32) - self.cy) * z) / self.fy + + color_rgb = cv2.cvtColor(left_img, cv2.COLOR_GRAY2RGB) + r = color_rgb[ys, xs, 0].astype(np.uint32) + g = color_rgb[ys, xs, 1].astype(np.uint32) + b = color_rgb[ys, xs, 2].astype(np.uint32) + rgb_u32 = (r << 16) | (g << 8) | b + rgb_f32 = rgb_u32.view(np.float32) + + cloud = np.empty( + z.shape[0], + dtype=[("x", np.float32), ("y", np.float32), ("z", np.float32), ("rgb", np.float32)], + ) + cloud["x"] = x + cloud["y"] = y + cloud["z"] = z + cloud["rgb"] = rgb_f32 + + msg = PointCloud2() + msg.header.stamp = stamp + msg.header.frame_id = self.frame_id + msg.height = 1 + msg.width = int(cloud.shape[0]) + msg.fields = [ + PointField(name="x", offset=0, datatype=PointField.FLOAT32, count=1), + PointField(name="y", offset=4, datatype=PointField.FLOAT32, count=1), + PointField(name="z", offset=8, datatype=PointField.FLOAT32, count=1), + PointField(name="rgb", offset=12, datatype=PointField.FLOAT32, count=1), + ] + msg.is_bigendian = False + msg.point_step = 16 + msg.row_step = msg.point_step * msg.width + msg.is_dense = False + msg.data = cloud.tobytes() + return msg + + def images_callback(self, left_msg: Image, right_msg: Image): + if self.focal_length is None or self.baseline is None: + self.get_logger().warning("Skipping stereo frame: waiting for camera_info.") + return + + left_img = self.bridge.imgmsg_to_cv2(left_msg, "mono8") + right_img = self.bridge.imgmsg_to_cv2(right_msg, "mono8") + + with Timer( + name="FoundationStereo Inference", + text="[{name}] Elapsed time: {milliseconds:.0f} ms", + logger=self.logger.info, + ): + _, depth = asyncio.run( + self.stereo_engine.infer( + left_img, + right_img, + np.array([[self.baseline]], dtype=np.float32), + np.array([[self.focal_length]], dtype=np.float32), + ) + ) + + with Timer( + name="Publish Keyframe Depth", + text="[{name}] Elapsed time: {milliseconds:.0f} ms", + logger=self.logger.info, + ): + depth_msg = self.bridge.cv2_to_imgmsg(depth.astype(np.float32), encoding="32FC1") + depth_msg.header.stamp = left_msg.header.stamp + depth_msg.header.frame_id = self.frame_id + self.keyframe_depth_pub.publish(depth_msg) + self._publish_world_to_foundation_tf(left_msg.header.stamp) + + if self.publish_pointcloud: + with Timer( + name="Build+Publish Keyframe PointCloud", + text="[{name}] Elapsed time: {milliseconds:.0f} ms", + logger=self.logger.info, + ): + cloud_msg = self._build_colorized_pointcloud(depth, left_img, left_msg.header.stamp) + self.pointcloud_pub.publish(cloud_msg) + + +def main(args=None): + if not ROS_AVAILABLE: + raise RuntimeError("ROS dependencies are unavailable. Please run in the TinyNav ROS2 environment.") + + parser = argparse.ArgumentParser() + parser.add_argument( + "--engine_path", + type=str, + default=f"/tinynav/tinynav/models/foundation_stereo_{platform.machine()}.plan", + ) + parser.add_argument("--left_topic", type=str, default="/camera/camera/infra1/image_rect_raw") + parser.add_argument("--right_topic", type=str, default="/camera/camera/infra2/image_rect_raw") + parser.add_argument("--camera_info_topic", type=str, default="/camera/camera/infra2/camera_info") + parser.add_argument("--keyframe_depth_topic", type=str, default="/keyframe/depth") + parser.add_argument("--pointcloud_topic", type=str, default="/keyframe/pointcloud") + parser.add_argument("--frame_id", type=str, default="camera_foundation_stereo") + parser.add_argument("--world_frame_id", type=str, default="world") + parser.add_argument("--publish_pointcloud", action="store_true") + parser.add_argument("--min_depth", type=float, default=0.05) + parser.add_argument("--max_depth", type=float, default=20.0) + parser.add_argument("--sync_queue_size", type=int, default=10) + parser.add_argument("--sync_slop", type=float, default=0.02) + parser.add_argument("--log_level", type=str, default="INFO") + parsed_args, _ = parser.parse_known_args(args if args is not None else sys.argv[1:]) + + logging.basicConfig( + level=getattr(logging, parsed_args.log_level.upper(), logging.INFO), + format="%(asctime)s - %(filename)s:%(lineno)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + rclpy.init(args=args) + node = FoundationStereoDepthNode( + engine_path=parsed_args.engine_path, + left_topic=parsed_args.left_topic, + right_topic=parsed_args.right_topic, + camera_info_topic=parsed_args.camera_info_topic, + keyframe_depth_topic=parsed_args.keyframe_depth_topic, + pointcloud_topic=parsed_args.pointcloud_topic, + frame_id=parsed_args.frame_id, + world_frame_id=parsed_args.world_frame_id, + publish_pointcloud=parsed_args.publish_pointcloud, + min_depth=parsed_args.min_depth, + max_depth=parsed_args.max_depth, + queue_size=parsed_args.sync_queue_size, + slop=parsed_args.sync_slop, + ) + executor = rclpy.executors.SingleThreadedExecutor() + executor.add_node(node) + try: + executor.spin() + finally: + node.destroy_node() + executor.shutdown() + rclpy.shutdown() + + +if __name__ == "__main__": + import os + + left_path = "/tinynav/tests/data/looper/left.png" + right_path = "/tinynav/tests/data/looper/right.png" + run_local_infer = ( + len(sys.argv) > 1 and sys.argv[1] == "--local_infer" + ) or (not ROS_AVAILABLE and os.path.exists(left_path) and os.path.exists(right_path)) + if run_local_infer: + left = cv2.imread(left_path, cv2.IMREAD_GRAYSCALE) + right = cv2.imread(right_path, cv2.IMREAD_GRAYSCALE) + model = FoundationStereoTRT() + disp, depth = asyncio.run( + model.infer( + left, + right, + np.array([[0.056]], dtype=np.float32), + np.array([[323.0]], dtype=np.float32), + ) + ) + print("disp", disp.shape, disp.dtype, float(np.nanmean(disp))) + print("depth", depth.shape, depth.dtype, float(np.nanmean(depth[np.isfinite(depth)]))) + else: + main() diff --git a/tool/global_pointcloud_publisher.py b/tool/global_pointcloud_publisher.py index 8128c7b1..f846df37 100644 --- a/tool/global_pointcloud_publisher.py +++ b/tool/global_pointcloud_publisher.py @@ -235,18 +235,21 @@ def __init__(self, args): self.cloud_pub = self.create_publisher(PointCloud2, "/slam/global_cloud", 10) self.path_pub = self.create_publisher(Path, "/slam/global_cloud_path", 10) - self.depth_log_sub = self.create_subscription(Image, "/camera/camera/depth/image_rect_raw", self.depth_log_callback, self.sensor_qos) + self.depth_log_sub = self.create_subscription(Image, args.depth_topic, self.depth_log_callback, self.sensor_qos) self.pose_log_sub = self.create_subscription(PoseStamped, args.pose_topic, self.pose_log_callback, 10) image_msg_type = CompressedImage if args.image_mode == "color" else Image self.image_log_sub = self.create_subscription(image_msg_type, self.image_topic, self.image_log_callback, self.sensor_qos) - self.depth_sub = message_filters.Subscriber(self, Image, "/camera/camera/depth/image_rect_raw", qos_profile=self.sensor_qos) + self.depth_sub = message_filters.Subscriber(self, Image, args.depth_topic, qos_profile=self.sensor_qos) self.pose_sub = message_filters.Subscriber(self, PoseStamped, args.pose_topic) self.image_sub = message_filters.Subscriber(self, image_msg_type, self.image_topic, qos_profile=self.sensor_qos) self.sync = message_filters.ApproximateTimeSynchronizer([self.depth_sub, self.pose_sub, self.image_sub], queue_size=20, slop=0.08) self.sync.registerCallback(self.sync_callback) - self.get_logger().info(f"Publishing global cloud on /slam/global_cloud from /camera/camera/depth/image_rect_raw + {args.pose_topic} + {self.image_topic} ({args.image_mode})") + self.get_logger().info( + f"Publishing global cloud on /slam/global_cloud from {args.depth_topic} + " + f"{args.pose_topic} + {self.image_topic} ({args.image_mode})" + ) def camera_info_callback(self, msg: CameraInfo): self.depth_frame_id = msg.header.frame_id or self.depth_frame_id @@ -263,7 +266,7 @@ def color_camera_info_callback(self, msg: CameraInfo): self.try_update_frame_transforms() def depth_log_callback(self, msg: Image): - self.get_logger().info("Received first depth frame on /camera/camera/depth/image_rect_raw.", once=True) + self.get_logger().info(f"Received first depth frame on {self.args.depth_topic}.", once=True) def pose_log_callback(self, msg: PoseStamped): self.get_logger().info(f"Received first pose frame on {self.args.pose_topic}.", once=True) @@ -456,7 +459,7 @@ def sync_callback(self, depth_msg: Image, pose_msg: PoseStamped, image_msg: Imag return sample_u_grid, sample_v_grid = self.get_sample_grid(depth.shape) - # `/insight/vio_pose` provides T_w_i, where the world frame is z-up. + # `/insight/vio_20hz` provides T_w_i, where the world frame is z-up. # Depth backprojection yields points in the camera optical frame, so we # first map them into the IMU/depth rig frame before applying T_w_i. T_w_i = pose_msg2np(pose_msg) @@ -487,6 +490,13 @@ def sync_callback(self, depth_msg: Image, pose_msg: PoseStamped, image_msg: Imag ] ) self.get_logger().info(f"{self.args.image_mode.capitalize()} sample points: {sample_text}", throttle_duration_sec=1.0) + + # Filter out camera-frame points below the configured floor in Y. + if cloud_camera.size != 0: + keep_cam_mask = cloud_camera[:, 1] >= -1.0 + cloud_camera = cloud_camera[keep_cam_mask] + cloud_colors = cloud_colors[keep_cam_mask] + cloud_world = transform_points(cloud_camera, T_world_camera) keep_mask = crop_mask(cloud_world, current_position, 100.0) cloud_world = cloud_world[keep_mask] @@ -527,7 +537,8 @@ def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description="Project SLAM depth into a global world-frame point cloud." ) - parser.add_argument("--pose-topic", default="/insight/vio_pose") + parser.add_argument("--pose-topic", default="/insight/vio_20hz") + parser.add_argument("--depth-topic", default="/camera/camera/depth/image_rect_raw") parser.add_argument("--image-mode", choices=("grayscale", "color"), default="color") return parser