diff --git a/.gitmodules b/.gitmodules index 2277161..c1a2c88 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,15 @@ [submodule "third_party/ObjectClear"] # for object clearing [reconstruction] path = third_party/ObjectClear url = git@github.com:PointsCoder/ObjectClear.git +[submodule "third_party/graspness_unofficial"] # for grasp generation [motion] + path = third_party/graspness_unofficial + url = git@github.com:PointsCoder/graspness_unofficial.git +[submodule "third_party/graspnetAPI"] # for grasp evaluation and utilities [motion] + path = third_party/graspnetAPI + url = git@github.com:PointsCoder/graspnetAPI.git +[submodule "third_party/MinkowskiEngine"] # for sparse convolution operations [motion] + path = third_party/MinkowskiEngine + url = https://github.com/NVIDIA/MinkowskiEngine.git +[submodule "third_party/WiLoR"] # for hand extraction [motion] + path = third_party/WiLoR + url = git@github.com:DynamoDuan/WiLoR.git \ No newline at end of file diff --git a/docker/compose.yml b/docker/compose.yml index 3703950..d5cb736 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -1,6 +1,6 @@ services: # Real To Sim [Master Container] - openreal2sim: + openreal2sim-new: build: context: .. dockerfile: docker/real2sim/Dockerfile diff --git a/docker/real2sim/Dockerfile b/docker/real2sim/Dockerfile index f36a807..caa5dad 100644 --- a/docker/real2sim/Dockerfile +++ b/docker/real2sim/Dockerfile @@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive # ------------------------------------------------------------------------- RUN apt-get update && apt-get install -y --no-install-recommends \ python3 python3-dev python3-pip python3-venv python-is-python3 \ - build-essential cmake ninja-build pkg-config \ + build-essential cmake ninja-build pkg-config libopenblas-dev \ git curl wget ffmpeg bash libspatialindex-dev \ && python3 -m pip install --upgrade --no-cache-dir pip wheel setuptools packaging \ && rm -rf /var/lib/apt/lists/* @@ -46,6 +46,9 @@ RUN pip install --no-cache-dir torch-scatter \ COPY docker/real2sim/requirements.docker.txt /tmp/requirements.docker.txt RUN pip install --no-cache-dir -r /tmp/requirements.docker.txt +# Install chumpy separately (setup.py needs pip module, ensure build deps are available) +RUN pip install --no-build-isolation --no-cache-dir "git+https://github.com/mattloper/chumpy" + # ------------------------------------------------------------------------- # Third-Party compiled Python deps @@ -82,7 +85,18 @@ RUN cd / && wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.t cmake .. &&\ make install +RUN cd / +COPY third_party/graspnetAPI /app/third_party/graspnetAPI +RUN cd /app/third_party/graspnetAPI && pip install -e . + +COPY third_party/graspness_unofficial /app/third_party/graspness_unofficial +RUN cd /app/third_party/graspness_unofficial/pointnet2 && python -m pip install --no-cache-dir -v --no-build-isolation . +RUN cd /app/third_party/graspness_unofficial/knn && python -m pip install --no-cache-dir -v --no-build-isolation . 
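+# (Both extensions above are built with --no-build-isolation so they compile against
+#  the torch/CUDA stack already installed in this image rather than an isolated build env.)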
+ +COPY third_party/MinkowskiEngine /app/third_party/MinkowskiEngine +RUN cd /app/third_party/MinkowskiEngine && python setup.py install --blas=openblas --force_cuda + # ------------------------------------------------------------------------- # Runtime env # ------------------------------------------------------------------------- -ENV PYTHONPATH=/app +ENV PYTHONPATH=/app:/app/third_party:/app/third_party/graspness_unofficial/pointnet2: diff --git a/docker/real2sim/requirements.docker.txt b/docker/real2sim/requirements.docker.txt index 75533f2..c1d4ab5 100644 --- a/docker/real2sim/requirements.docker.txt +++ b/docker/real2sim/requirements.docker.txt @@ -20,6 +20,7 @@ scipy==1.15.3 wandb==0.21.3 gdown + #__________________________________________________________________________________ # 2) Related deps (torch-scatter is separate) ninja==1.11.1 @@ -61,4 +62,27 @@ Rtree # 8) Foundation Pose transformations==2024.6.1 ruamel.yaml==0.18.6 -h5py==3.10.0 \ No newline at end of file +h5py==3.10.0 + +# 9) PromptDA +pyliblzfse +lightning==2.1.3 +termcolor +tyro==0.9.2 +spaces +trimesh + +# 10) WiLoR +pyrender +pytorch-lightning +scikit-image +smplx==0.1.28 +yacs +xtcocotools +pandas +hydra-submitit-launcher +hydra-colorlog +pyrootutils +webdataset +ultralytics==8.1.34 +# Note: chumpy installed separately in Dockerfile due to build isolation issues diff --git a/openreal2sim/motion/README.md b/openreal2sim/motion/README.md new file mode 100644 index 0000000..c892977 --- /dev/null +++ b/openreal2sim/motion/README.md @@ -0,0 +1,136 @@ +544## Motion & Grasp Preparation + +This directory processes human demonstrations into robot-friendly motion goals and grasp priors. +It consumes `outputs//scene/scene.pkl` and `simulation/scene.json` from the reconstruction pipeline and augments them with: + +- frame-aligned hand/keypoint observations +- cleaned trajectories and task metadata +- contact-based grasp points & approach directions +- dense grasp proposals for every object + +This folder operates on `motion` in `scene.pkl`, and `scene.json`. + +The resulting assets are consumed by the simulation agents (`openreal2sim/simulation/*`) for replay, teleoperation, or policy training. + +--- + +## Code Structure + +``` +openreal2sim/motion +├── motion_manager.py +├── modules +│ ├── hand_extraction.py +│ ├── demo_motion_process.py +│ ├── grasp_point_extraction.py +│ └── grasp_generation.py +├── utils +│ ├── grasp_utils.py +│ └── visualize_scene.py +└── README.md +``` + +Each module can be run independently, but `motion_manager.py` provides a stage-aware wrapper similar to `recon_agent.py`. + +--- + +## Workflow + +### 1. Hand Extraction +`modules/hand_extraction.py` + +- Detects hands with YOLO + SAM2, then lifts 3D hand pose using **WiLoR**. +- Records per-frame keypoints, MANO global orientation, and binary masks under `scene_dict["motion"]`. +- These signals define contact windows and guide later grasp localization. + +### 2. Demonstration Motion Processing +`modules/demo_motion_process.py` + +- Reprojects FD-POSE / simple / hybrid trajectories into the world frame. +- Picks the manipulated object by total displacement, classifies remaining objects as static, and decides which trajectory family (`traj_key`) best matches the demonstration. +- Downsample the trajectory, exports a final GLB at the end pose, and writes helper metadata (`manipulated_oid`, `start_frame_idx`, `task_type`, `start/end_related`, `gripper_closed`, etc.) back into `simulation/scene.json`. 
+- Whether a static object is start-related or end-related is determined by whether its x-y bounding box overlaps that of the manipulated object at the corresponding timestep. Start/end-related objects are treated as semantically related to the manipulated object for the task.
+- We currently support three task types: a task is `simple_pick` if the gripper ends closed and there is no end-related object; `simple_pick_place` if the manipulated object is finally placed on the ground with the gripper open and there is no end-related object; and `targetted_pick_place` if end-related objects exist.
+
+
+### 3. Grasp Point & Direction Extraction
+`modules/grasp_point_extraction.py`
+
+- Aligns the manipulated mesh to the selected keyframe, renders it with PyTorch3D, and fuses depth/hand keypoints to locate the contact patch.
+- Projects the contact into object coordinates to obtain `grasp_point` and estimates a world-frame approach direction from MANO orientation.
+- Stores diagnostics (render overlays) under `outputs//simulation/debug/`.
+
+### 4. Grasp Proposal Generation
+`modules/grasp_generation.py`
+
+- Samples dense surface points per object, runs **GraspNet** to produce grasp candidates, and optionally applies NMS + score truncation.
+- Saves raw `.npy` proposals & `.ply` visualizations under `outputs//grasps/`.
+- For the manipulated object, rescores grasps using the contact point/direction hints and writes both `grasps` and `rescored_grasps` paths into `scene.json`.
+- There are three strategies for rescoring grasps:
+  - `default`: use the z-align strategy to rescore grasps.
+  - `point_only`: use only the contact point to rescore grasps.
+  - `point_and_direction`: use both the contact point and the approach direction to rescore grasps.
+
+---
+
+## How to Use
+
+1. **Prerequisites**
+   - Run preprocess + reconstruction so that `outputs//scene/scene.pkl` and `simulation/scene.json` exist.
+   - Ensure third-party weights are in place:
+     - `third_party/WiLoR/pretrained_models` and `third_party/WiLoR/mano_data/MANO_RIGHT.pkl`
+     - `third_party/Grounded-SAM-2/checkpoints`
+     - GraspNet checkpoints
+
+2. **All-in-one pipeline**
+
+```bash
+python openreal2sim/motion/motion_manager.py
+```
+
+Optional flags mirror reconstruction:
+
+```bash
+python openreal2sim/motion/motion_manager.py \
+  --stage grasp_generation \
+  --key demo_video
+```
+
+3. **Stage-specific scripts**
+
+```bash
+python openreal2sim/motion/modules/hand_extraction.py
+
+python openreal2sim/motion/modules/demo_motion_process.py
+
+python openreal2sim/motion/modules/grasp_point_extraction.py
+
+python openreal2sim/motion/modules/grasp_generation.py \
+  --n_points 120000 --keep 200 --nms True
+```
+
+All scripts read keys from `config/config.yaml`; set `OPENREAL2SIM_KEYS` or pass `--key` through `motion_manager.py` to limit processing.
+
+---
+
+## Outputs
+
+Per key, after running the pipeline you should find:
+
+- `outputs//scene/scene.pkl`
+  - `motion.hand_kpts`: `[N,21,2]` MANO keypoints (image plane)
+  - `motion.hand_global_orient`: rotation matrices per frame
+  - `motion.hand_masks`: binary hand silhouettes
+- `outputs//simulation/scene.json`
+  - `manipulated_oid`, `traj_key`, `start_frame_idx`, `task_type`, `start_related`, `end_related`, `gripper_closed`
+  - `objects[oid].final_trajs`
+  - `objects[manipulated_oid].grasp_point`, `grasp_direction`, `grasps`, `rescored_grasps`
+- `outputs//motion/scene_end.glb`: fused background + final manipulated mesh.
+- `outputs//grasps/*.npy`: raw GraspNet candidates per object. +- `outputs//simulation/debug/grasp_point_visualization_*.png`: sanity-check renders. + +These artifacts are consumed by the Isaac Lab / ManiSkill / MuJoCo simulation runners during trajectory replay and policy datasets export. + +--- + diff --git a/openreal2sim/motion/modules/demo_motion_process.py b/openreal2sim/motion/modules/demo_motion_process.py new file mode 100755 index 0000000..96a880a --- /dev/null +++ b/openreal2sim/motion/modules/demo_motion_process.py @@ -0,0 +1,373 @@ + + +#### + +### assign object types, determine start and end frame, downsample traj, compute stacking sequence and assign mask. +### TODO: 加一个ending的mesh进来 就这样。 +### A visualizer left. +import yaml +import numpy as np +from pyquaternion import Quaternion +from pathlib import Path +import pickle +import cv2 +import trimesh +import sys +import json +import os +base_dir = Path.cwd() +sys.path.append(str(base_dir / "openreal2sim" / "motion" / "modules")) + +def recompute_traj(traj, c2w, extrinsics): + for i in range(len(traj)): + traj[i] = c2w @ np.linalg.inv(extrinsics[0]) @ extrinsics[i] @ np.linalg.inv(c2w) @ traj[i] + return traj + +def recompute_all_traj(scene_json_dict, scene_dict): + for obj_id, obj in scene_json_dict["objects"].items(): + for key in 'fdpose_trajs', 'simple_trajs', 'hybrid_trajs': + traj_path = obj[key] + traj = np.load(traj_path) + traj = traj.reshape(-1, 4, 4) + c2w = np.array(scene_json_dict["camera"]["camera_opencv_to_world"]).astype(np.float64).reshape(4,4) + extrinsics = np.array(scene_dict["extrinsics"]).astype(np.float64).reshape(-1,4,4) + traj = recompute_traj(traj, c2w, extrinsics) + new_traj_path = traj_path.replace(".npy", "_recomputed.npy").replace("reconstruction", "motion") + scene_json_dict["objects"][obj_id][f"{key}_recomputed"] = str(new_traj_path) + np.save(new_traj_path, traj) + + +def get_mask_from_frame(scene_dict, frame_index, oid): + mask = scene_dict["mask"][frame_index][oid]["mask"] + return mask + + +def get_object_bbox(mesh_in_path): + mesh = trimesh.load(mesh_in_path) + bbox = mesh.bounds + return bbox + + + +def lift_traj(traj, height=0.02): + new_traj = [] + for i in range(len(traj)): + new_traj.append(traj[i]) + for i in range(1, len(new_traj) - 1): + new_traj[i][2][3] += height + return new_traj + +def determine_stacking_bbox(bbox_0, bbox_1): + """ + Determine if two bboxes have intersection on x and y axes. + + bbox_0, bbox_1: np.ndarray, (2,3) or (2,2) or (2,?) where axis 0 is (min,max) and axis 1 is x,y,(z) + + Returns: True if x and y intervals overlap (i.e. stacking is possible), False otherwise + + Generally this is enough for start frame. + """ + # Extract x/y intervals + x0_min, x0_max = bbox_0[0][0], bbox_0[1][0] + y0_min, y0_max = bbox_0[0][1], bbox_0[1][1] + x1_min, x1_max = bbox_1[0][0], bbox_1[1][0] + y1_min, y1_max = bbox_1[0][1], bbox_1[1][1] + z0_min, z0_max = bbox_0[0][2], bbox_0[1][2] + z1_min, z1_max = bbox_1[0][2], bbox_1[1][2] + + x_overlap = not (x0_max < x1_min or x1_max < x0_min) + y_overlap = not (y0_max < y1_min or y1_max < y0_min) + if x_overlap and y_overlap: + return True + + return False + + + +def load_obj_masks(data: dict): + """ + Return object list for frame-0: + [{'mask': bool array, 'name': name, 'bbox': (x1,y1,x2,y2)}, ...] 
+ Filter out names: 'ground' / 'hand' / 'robot' + """ + frame_objs = data.get(0, {}) # only frame 0 + objs = [] + for oid, item in frame_objs.items(): + name = item["name"] + if name in ("ground", "hand", "robot"): + continue + objs.append({ + "oid": oid, + "mask": item["mask"].astype(bool), + "name": name, + "bbox": item["bbox"] # used for cropping + }) + # Keep original behavior: sort by mask area (desc) + objs.sort(key=lambda x: int(x["oid"])) + return objs + + + +def pose_distance(T1, T2): + # Translation distance + t1 = T1[:3, 3] + t2 = T2[:3, 3] + trans_dist = np.linalg.norm(t2 - t1) + # Rotation difference (angle) + R1 = T1[:3, :3] + R2 = T2[:3, :3] + dR = R2 @ R1.T + # numerical errors can make the trace slightly out of bounds + cos_angle = (np.trace(dR) - 1) / 2 + cos_angle = np.clip(cos_angle, -1.0, 1.0) + angle = np.arccos(cos_angle) + return trans_dist, angle + +def build_mask_array( + oid: int, + scene_dict: dict[str, any] +) -> np.ndarray: + + H, W, N = scene_dict["height"], scene_dict["width"], scene_dict["n_frames"] + out = np.zeros((N, H, W), dtype=np.uint8) + for i in range(N): + m = scene_dict["mask"][i][oid]["mask"] + out[i] = m.astype(np.uint8) + return out + + + +def downsample_traj(trajectory, trans_threshold=0.05, rot_threshold=np.radians(20)): + """ + Downsample pose trajectory where each element is a 4x4 transformation matrix (SE3). + Retains keyframes with sufficient translation or rotation from the previous kept frame. + Returns: list of indices for retained frames. + """ + if len(trajectory) <= 2: + return list(range(len(trajectory))) + + + downsampled_indices = [0] + prev_idx = 0 + for i in range(1, len(trajectory) - 1): + trans_dist, angle = pose_distance(trajectory[prev_idx], trajectory[i]) + if trans_dist >= trans_threshold or angle >= rot_threshold: + downsampled_indices.append(i) + prev_idx = i + if len(downsampled_indices) > 0: + last_kept_idx = downsampled_indices[-1] + trans_dist, angle = pose_distance(trajectory[last_kept_idx], trajectory[-1]) + if trans_dist > 0.6 * trans_threshold or angle > 0.6 * rot_threshold: + downsampled_indices.append(len(trajectory) - 1) + elif trans_dist > 0.1 * trans_threshold or angle > 0.1 * rot_threshold: + downsampled_indices[-1] = len(trajectory) - 1 + + return downsampled_indices + +def refine_end_frame(hand_masks, object_masks): + N = min(len(hand_masks), len(object_masks)) + kernel = np.ones((10, 10), np.uint8) + for i in range(N-1, -1, -1): + if hand_masks[i] is None: + continue + hand_mask = hand_masks[i].astype(np.uint8) + obj_mask = object_masks[i].astype(np.uint8) + obj_mask_dilated = cv2.dilate(obj_mask, kernel, iterations=1) + overlap = (hand_mask & obj_mask_dilated).sum() + if overlap > 0: + return i + 1 + return 0 + + +def demo_motion_process(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + for key in keys: + scene_dict = key_scene_dicts[key] + scene_json_dict_path = base_dir / f"outputs/{key}/simulation/scene.json" + with open(scene_json_dict_path, "r") as f: + scene_json_dict = json.load(f) + recompute_all_traj(scene_json_dict, scene_dict) + key_cfg = key_cfgs[key] + max_placement_oid = None + max_placement_distance = 0.0 + placement_distances = {} + fd_traj_dict = {} + for obj_id, obj in scene_json_dict["objects"].items(): + obj_traj = obj["simple_trajs_recomputed"] + obj_traj = np.load(obj_traj) + obj_traj = obj_traj.reshape(-1, 4, 4) + abs_distance, _ = pose_distance(obj_traj[0], obj_traj[-1]) + fd_traj = obj["fdpose_trajs_recomputed"] + fd_traj = np.load(fd_traj) + fd_traj = 
fd_traj.reshape(-1, 4, 4) + fd_dist, fd_angle = pose_distance(fd_traj[0], fd_traj[-1]) + + fd_traj_dict[obj_id] = { + "fd_distance": fd_dist, + "fd_angle": fd_angle + } + print(f"Object: {obj_id}, distance: {abs_distance}") + if abs_distance > max_placement_distance: + max_placement_distance = abs_distance + max_placement_oid = obj_id + scene_json_dict["objects"][obj_id]["type"] = "static" + placement_distances[obj_id] = abs_distance + scene_json_dict["manipulated_oid"] = max_placement_oid + scene_dict["info"]["manipulated_oid"] = max_placement_oid + name = scene_json_dict["objects"][max_placement_oid]["name"] + scene_json_dict["objects"][max_placement_oid]["type"] = "manipulated" + print(f"Manipulated object: {max_placement_oid}, distance: {abs_distance}") + manipulated_fd_trajs_path = scene_json_dict["objects"][max_placement_oid]["fdpose_trajs_recomputed"] + manipulated_fd_trajs = np.load(manipulated_fd_trajs_path) + angle_displacement = [pose_distance(manipulated_fd_trajs[i], manipulated_fd_trajs[i+1])[1] for i in range(len(manipulated_fd_trajs)-1)] + angle_displacement_sum = sum(angle_displacement) + # manipulated_simple_trajs_path = scene_json_dict["objects"][max_placement_oid]["simple_trajs_recomputed"] + # manipulated_simple_trajs = np.load(manipulated_simple_trajs_path) + # pos_displacement = [pose_distance(manipulated_simple_trajs[i], manipulated_fd_trajs[i])[0] for i in range(len(manipulated_fd_trajs))] + fd_traj_dict.pop(max_placement_oid) + traj_key = 'fdpose_trajs_recomputed' + if len(fd_traj_dict.keys()) > 0: + max_fd_distance = max(fd_traj_dict.values(), key=lambda x: x["fd_distance"]) + max_fd_angle = max(fd_traj_dict.values(), key=lambda x: x["fd_angle"]) + else: + max_fd_distance = 0.0 + max_fd_angle = 0.0 + if angle_displacement_sum > 180 or max_fd_angle > np.radians(20): + traj_key = 'simple_trajs_recomputed' + else: + if max_fd_distance > 0.05: + traj_key = 'hybrid_trajs_recomputed' + print(f"Traj key: {traj_key}") + scene_json_dict["traj_key"] = traj_key + scene_dict["info"]["traj_key"] = traj_key + manipulated_trajs_path = scene_json_dict["objects"][max_placement_oid][traj_key] + manipulated_trajs = np.load(manipulated_trajs_path) + object_masks = build_mask_array(int(max_placement_oid), scene_dict) + hand_masks = scene_dict["motion"]["hand_masks"] + end_frame = refine_end_frame(hand_masks, object_masks) + print(f"End frame: {end_frame}") + trajs = manipulated_trajs + if end_frame == len(manipulated_trajs) - 1: + scene_json_dict["gripper_closed"] = True + elif end_frame == 0: + if np.abs(manipulated_trajs[0][2][3]) < 0.03: + scene_json_dict["gripper_closed"] = False + else: + scene_json_dict["gripper_closed"] = True + else: + scene_json_dict["gripper_closed"] = False + #trajs = manipulated_trajs[:end_frame] + + + downsampled_indices = downsample_traj(trajs, trans_threshold=0.03, rot_threshold=np.radians(20)) + scene_json_dict["chosen_indices"] = downsampled_indices + trajs = trajs[downsampled_indices] + + + trajs = lift_traj(trajs, height=0.02) + print(f"Downsampled indices: {[downsampled_indices]}") + downsampled_traj_path = base_dir / f"outputs/{key}/simulation" / f"{max_placement_oid}_final_traj.npy" + + + print(f"Trajs: {trajs[-1]}") + np.save(downsampled_traj_path, trajs) + scene_json_dict["objects"][max_placement_oid]["final_trajs"] = str(downsampled_traj_path) + #scene_json_dict["start_frame_idx"] = downsampled_indices[1] + scene_dict["info"]["start_frame_idx"] = downsampled_indices[1] + end_pose = trajs[-1] + mesh_in_path = 
scene_json_dict["objects"][max_placement_oid]["optimized"] + mesh = trimesh.load(mesh_in_path) + mesh.apply_transform(end_pose) + path = base_dir / f"outputs/{key}/simulation/{max_placement_oid}_{name}_end.glb" + mesh.export(path) + scene_json_dict["objects"][max_placement_oid]["end_mesh"] = str(path) + bbox_end = get_object_bbox(path) + + start_related = [] + end_related = [] + mesh_in_path = scene_json_dict["objects"][max_placement_oid]["optimized"] + bbox_start = get_object_bbox(mesh_in_path) + for oid, obj in scene_json_dict["objects"].items(): + if not isinstance(oid, int): + continue + if oid == max_placement_oid: + continue + mesh_in_path = obj["optimized"] + bbox = get_object_bbox(mesh_in_path) + if determine_stacking_bbox(bbox_start, bbox): + start_related.append(oid) + if determine_stacking_bbox(bbox_end, bbox): + end_related.append(oid) + + print(f"Start related: {start_related}") + print(f"End related: {end_related}") + + scene_json_dict["start_related"] = list(start_related) + scene_json_dict["end_related"] = list(end_related) + + scene_path = scene_json_dict["scene_mesh"]["optimized"] + scene_mesh = trimesh.load(scene_path) + scene_mesh = scene_mesh + mesh + if not os.path.exists(base_dir / f"outputs/{key}/motion"): + os.makedirs(base_dir / f"outputs/{key}/motion") + path = base_dir / f"outputs/{key}/motion/scene_end.glb" + scene_mesh.export(path) + + + + if len(end_related) > 0 : + target_oid = end_related[0] + scene_json_dict["target_oid"] = target_oid + scene_json_dict["task_type"] = "targetted_pick_place" + if len(scene_json_dict["task_desc"]) > 0: + task_desc = "Pick up the " + name + " and place it on the " + target_oid + "." + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + else: + task_desc = "Pick up the " + name + " and place it on the " + target_oid + "." + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"] = [task_desc] + else: + if not scene_json_dict["gripper_closed"]: + scene_json_dict["task_type"] = "simple_pick_place" + task_desc = "Pick up the " + name + " and place it on the ground." + if len(scene_json_dict.get("task_desc", [])) == 0: + scene_json_dict["task_desc"] = [task_desc] + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + else: + scene_json_dict["task_type"] = "simple_pick" + task_desc = "Pick up the " + name + "." 
+ if len(scene_json_dict.get("task_desc", [])) == 0: + scene_json_dict["task_desc"] = [task_desc] + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + + print(f"Task description: {scene_json_dict['task_desc']}") + print(f"Task type: {scene_json_dict['task_type']}") + + with open(base_dir / f"outputs/{key}/simulation/scene.json", "w") as f: + json.dump(scene_json_dict, f, indent=2) + with open(base_dir / f"outputs/{key}/scene/scene.pkl", "wb") as f: + pickle.dump(scene_dict, f) + return key_scene_dicts + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + demo_motion_process(keys, key_scene_dicts, key_cfgs) + + + diff --git a/openreal2sim/motion/modules/grasp_generation.py b/openreal2sim/motion/modules/grasp_generation.py new file mode 100755 index 0000000..f0ec412 --- /dev/null +++ b/openreal2sim/motion/modules/grasp_generation.py @@ -0,0 +1,313 @@ +import os +import sys +import numpy as np +import time +import torch +from typing import List +import open3d as o3d +from graspnetAPI.grasp import GraspGroup +from pathlib import Path +base_dir = Path.cwd() +sys.path.append(str(base_dir / "third_party")) +from graspness_unofficial.models.graspnet import GraspNet, pred_decode +from graspness_unofficial.dataset.graspnet_dataset import minkowski_collate_fn +from graspness_unofficial.utils.collision_detector import ModelFreeCollisionDetector +from graspness_unofficial.utils.data_utils import CameraInfo, create_point_cloud_from_depth_image, get_workspace_mask +from scipy.spatial.transform import Rotation as R +from utils.grasp_utils import * +from utils.compose_config import compose_configs +import argparse +import json +import yaml + + +def sample_surface(glb_path: str, n_points: int = 5000) -> np.ndarray: + """Load mesh and sample surface points.""" + mesh = trimesh.load_mesh(glb_path) + if isinstance(mesh, trimesh.Scene): + mesh = list(mesh.geometry.values())[0] + pts, _ = trimesh.sample.sample_surface(mesh, n_points) + return np.array(pts, dtype=np.float32) + + +def crop_pointcloud_quadrant( + points, + quadrant=1, + axes="xy", +): + # Map axis letters to indices + axis_index = {"x": 0, "y": 1, "z": 2} + a1, a2 = axes[0], axes[1] + idx1, idx2 = axis_index[a1], axis_index[a2] + + center_full = points.mean(axis=0) + + c1, c2 = center_full[idx1], center_full[idx2] + + ge = np.greater # > + + # Build boolean masks for the four quadrants + # Right/Left split along axis1, Top/Bottom split along axis2 + right = ge(points[:, idx1], c1) # axis1 >= c1 (or > c1) + top = ge(points[:, idx2], c2) # axis2 >= c2 (or > c2) + + # Quadrant mapping per convention documented above + if quadrant == 1: + mask = right & top + elif quadrant == 2: + mask = (~right) & top + elif quadrant == 3: + mask = (~right) & (~top) + elif quadrant == 4: + mask = right & (~top) + else: + raise ValueError("`quadrant` must be an integer in {1,2,3,4}.") + + cropped_points = points[mask] + return cropped_points, mask + + +def read_score_safe(gg, i: int, fallback: float) -> float: + """Try to read candidate score; fall back to a 
synthetic rank-based value.""" + try: + return float(getattr(gg[i], "score")) + except Exception: + pass + try: + return float(gg.grasp_group_array[i][0]) + except Exception: + return fallback + + +def grasps_to_pointcloud( + gg, pts_per_gripper: int = 400, color=(1.0, 0.0, 0.0) +) -> o3d.geometry.PointCloud: + """Sample each gripper mesh to points and tint with color.""" + geoms = gg.to_open3d_geometry_list() # list of TriangleMesh + out = o3d.geometry.PointCloud() + for g in geoms: + out += g.sample_points_uniformly(pts_per_gripper) + if len(out.points) > 0: + out.colors = o3d.utility.Vector3dVector( + np.tile(np.array(color, dtype=np.float32), (len(out.points), 1)) + ) + return out + + +def save_vis_ply( + points_xyz: np.ndarray, + gg, + save_path: Path, + pts_per_gripper: int = 400, + cloud_color=(0.0, 1.0, 0.0), +): + """Write a PLY: green object cloud + red ALL grasp candidates.""" + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points_xyz.astype(np.float32)) + cloud.colors = o3d.utility.Vector3dVector( + np.tile(np.array(cloud_color, dtype=np.float32), (len(cloud.points), 1)) + ) + grasp_pcd = grasps_to_pointcloud( + gg, pts_per_gripper=pts_per_gripper, color=(1.0, 0.0, 0.0) + ) + merged = cloud + grasp_pcd + save_path.parent.mkdir(parents=True, exist_ok=True) + o3d.io.write_point_cloud(str(save_path), merged) + print(f"[VIS] saved {save_path}") + + +def process_one_object( + gg_net: MyGraspNet, + obj_idx: int, + obj_dict: dict, + proposals_dir: Path, + keep: int | None, + nms: bool, + n_points: int, + overwrite: bool, + vis_pts_per_gripper: int = 400, +) -> str | None: + """ + Build full point cloud for one object, run GraspNet, export all candidates to NPZ, + save a PLY visualization, and return the NPZ path (string). 
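+
+    Assumed array layout (per graspnetAPI's GraspGroup): each saved row has 17 floats,
+    [score, width, height, depth, rotation_matrix (9, row-major), translation (3),
+    object_id]; the sign-flip loop below rewrites columns 4:13 and 13:16 in place.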
+ """ + glb_path = obj_dict["optimized"] + if glb_path is None or (not os.path.exists(glb_path)): + print( + f"[WARN][{obj_dict['oid']}_{obj_dict['name']}] mesh not found -> {glb_path}" + ) + return None + + npy_path = proposals_dir / f"{obj_dict['oid']}_{obj_dict['name']}_grasp.npy" + vis_path = proposals_dir / f"{obj_dict['oid']}_{obj_dict['name']}_grasp_viz.ply" + + # Build cloud & inference + pcs = sample_surface(glb_path, n_points=n_points) + # pcs = crop_pointcloud_quadrant(pcs, quadrant=3, axes="xy")[0] + pcs[..., 2] *= -1.0 # keep identical convention + gg = gg_net.inference(pcs) + if nms: + gg = gg.nms() + gg = gg.sort_by_score() + if isinstance(keep, int) and keep is not None and len(gg) > keep: + gg = gg[:keep] + if len(gg) == 0: + print(f"[WARN][{obj_dict['oid']}_{obj_dict['name']}] zero proposals") + return None + + # Save visualization + save_vis_ply(pcs, gg, vis_path, pts_per_gripper=vis_pts_per_gripper) + + for g_i in range(len(gg)): + translation = gg[g_i].translation + rotation = gg[g_i].rotation_matrix + translation = np.array([translation[0], translation[1], -translation[2]]) + rotation[2, :] = -rotation[2, :] + rotation[:, 2] = -rotation[:, 2] + gg.grasp_group_array[g_i][13:16] = translation + gg.grasp_group_array[g_i][4:13] = rotation.reshape(-1) + + gg.save_npy(npy_path) + print( + f"[OK][{obj_dict['oid']}_{obj_dict['name']}] saved {len(gg.grasp_group_array)} -> {npy_path.name}" + ) + + return str(npy_path.resolve()), gg + + +def run_for_key( + key: str, + n_points: int, + keep: int | None, + nms: bool, + overwrite: bool, + vis_pts_per_gripper: int, + strategy: str, +): + base_dir = Path.cwd() + out_dir = base_dir / "outputs" + scene_json = out_dir / key / "simulation" / "scene.json" + if not scene_json.exists(): + raise FileNotFoundError(scene_json) + + # Load scene.json once + scene_dict = json.load(open(scene_json)) + objects = scene_dict.get("objects", {}) + if not isinstance(objects, dict) or len(objects) == 0: + print(f"[WARN][{key}] scene_dict['objects'] is empty.") + return + + # Prepare save dir + key_dir = out_dir / key + proposals_dir = key_dir / "grasps" + proposals_dir.mkdir(parents=True, exist_ok=True) + + # Single GraspNet instance reused for all objects + net = MyGraspNet() + + manipulated_oid = scene_dict["manipulated_oid"] + # Loop all objects + for i, obj in objects.items(): + npy_path, gg = process_one_object( + gg_net=net, + obj_idx=i, + obj_dict=obj, + proposals_dir=proposals_dir, + keep=keep, + nms=nms, + n_points=n_points, + overwrite=overwrite, + vis_pts_per_gripper=vis_pts_per_gripper, + ) + if npy_path is not None: + scene_dict["objects"][i]["grasps"] = npy_path + with open(scene_json, "w") as f: + json.dump(scene_dict, f, indent=2) + print( + f"[OK][{key}] scene.json updated with 'grasps' for {obj['oid']}_{obj['name']}." 
+ ) + if int(obj["oid"]) == int(manipulated_oid): + if scene_dict["objects"][i]["grasp_point"] is not None and strategy == "point_and_direction": + grasp_point = scene_dict["objects"][i]["grasp_point"] + grasp_direction = np.asarray(scene_dict["objects"][i]["grasp_direction"], dtype=np.float32) + if grasp_direction.shape == (3, 3): + grasp_direction = grasp_direction[:, 0] + grasp_direction = np.asarray(grasp_direction, dtype=np.float32).reshape(-1) + if grasp_direction.shape[0] != 3 or np.linalg.norm(grasp_direction) < 1e-6: + print(f"[WARN][{key}] Invalid grasp_direction for {obj['oid']}_{obj['name']} – falling back to default Z.") + grasp_dir_hint = None + else: + grasp_dir_hint = (grasp_direction / np.linalg.norm(grasp_direction)).tolist() + gg = get_best_grasp_with_hints(gg, grasp_point, grasp_dir_hint) + elif scene_dict["objects"][i]["grasp_point"] is not None and strategy == "point_only": + grasp_point = scene_dict["objects"][i]["grasp_point"] + gg = get_best_grasp_with_hints(gg, grasp_point, [0,0,1]) + else: + z = [0,0,1] # default direction + gg = get_best_grasp_with_hints(gg, None, z) + npy_path = npy_path.replace(".npy", "_rescored.npy") + gg.save_npy(npy_path) + if npy_path is not None: + scene_dict["objects"][i]["rescored_grasps"] = npy_path + with open(scene_json, "w") as f: + json.dump(scene_dict, f, indent=2) + + +def grasp_generation(keys): + cfg_path = Path.cwd() / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + for key in keys: + cfg = key_cfgs[key] + print(f"\n========== [GraspDet] Processing key: {key} ==========") + run_for_key( + key=key, + n_points=cfg["n_points"], + keep=cfg["keep"], + nms=cfg["nms"], + overwrite=cfg["overwrite"], + vis_pts_per_gripper=cfg["vis_pts_per_gripper"], + strategy=cfg["rescore_strategy"], + ) + + print('[Info] Grasp generation completed.') + + +def main(): + parser = argparse.ArgumentParser( + "Export grasp proposals for ALL objects (batch over keys) and write paths into scene.json" + ) + parser.add_argument("--n_points", type=int, default=100000) + parser.add_argument("--keep", type=int, default=None) # set None to keep all + parser.add_argument("--nms", type=bool, default=True) + parser.add_argument("--overwrite", action="store_true") + parser.add_argument( + "--vis_pts_per_gripper", + type=int, + default=400, + help="points sampled per gripper mesh for PLY", + ) + args = parser.parse_args() + + # load keys from YAML + cfg_path = Path.cwd() / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + + for key in keys: + print(f"\n========== [GraspDet] Processing key: {key} ==========") + key_cfgs = compose_configs(key, cfg) + run_for_key( + key=key, + n_points=args.n_points, + keep=args.keep, + nms=args.nms, + overwrite=args.overwrite, + vis_pts_per_gripper=args.vis_pts_per_gripper, + key_cfgs=key_cfgs, + ) + + +if __name__ == "__main__": + main() diff --git a/openreal2sim/motion/modules/grasp_point_extraction.py b/openreal2sim/motion/modules/grasp_point_extraction.py new file mode 100755 index 0000000..d0f0eb0 --- /dev/null +++ b/openreal2sim/motion/modules/grasp_point_extraction.py @@ -0,0 +1,434 @@ +### This part is used to generate the affordance of the object. +### Include: +### 1. Grasp point generation. +### 2. Affordance map generation. Mainly for articulated object. This might be further combined with a PartSLIP network. +### TODO: Human annotation. 
+### The logic we use for grasp point generation here is: we select the frist frame of object-hand contact, compute the contact point, overlay the object and extract the 3D coordinate of the point. +### Be careful that we transfer this point to the first frame using pose matrices and extrinsics. +### TODO: This might be helpful to object selection in the first step. +import pickle +import pathlib +from pathlib import Path +import yaml +import os +import sys +import torch +import numpy as np +import cv2 +import imageio +import json +import pytorch3d +from pytorch3d.structures import Meshes +from pytorch3d.renderer import ( + look_at_view_transform, + PerspectiveCameras, + RasterizationSettings, + MeshRenderer, + MeshRasterizer, + SoftPhongShader, + PointLights, + TexturesVertex, + BlendParams, +) +from pytorch3d.utils.camera_conversions import cameras_from_opencv_projection +import trimesh + + +def create_pytorch3d_camera(K, T_c2w, img_size, device="cuda:0"): + """ + Create a PyTorch3D camera from OpenCV-style camera parameters. + + Args: + K: (3, 3) camera intrinsic matrix + T_c2w: (4, 4) camera-to-world transformation matrix + img_size: (H, W) or int, image size + device: device string + + Returns: + PerspectiveCameras object + """ + device = torch.device(device if torch.cuda.is_available() else "cpu") + + # Parse image size + if isinstance(img_size, int): + H, W = img_size, img_size + else: + H, W = img_size + image_size = torch.tensor([[H, W]], dtype=torch.float32, device=device) + + # Convert camera-to-world to world-to-camera for OpenCV convention + # OpenCV projection: x_screen = K @ (R_w2c @ x_world + tvec_w2c) + T_w2c = np.linalg.inv(T_c2w) + R_w2c = T_w2c[:3, :3] # world-to-camera rotation + tvec_w2c = T_w2c[:3, 3] # world-to-camera translation + + # Convert to torch tensors with batch dimension + R_w2c_torch = torch.tensor(R_w2c, dtype=torch.float32, device=device).unsqueeze(0) # (1, 3, 3) + tvec_w2c_torch = torch.tensor(tvec_w2c, dtype=torch.float32, device=device).unsqueeze(0) # (1, 3) + K_torch = torch.tensor(K, dtype=torch.float32, device=device).unsqueeze(0) # (1, 3, 3) + + # Create PyTorch3D camera using OpenCV convention + camera = cameras_from_opencv_projection( + R=R_w2c_torch, + tvec=tvec_w2c_torch, + camera_matrix=K_torch, + image_size=image_size, + ) + R = camera.R.cpu().numpy() + R = R.reshape(3, 3) + T = camera.T.cpu().numpy() + T = T.reshape(3,) + fx = K[0,0] + fy = K[1,1] + cx = K[0,2] + cy = K[1,2] + height, width = image_size[0].cpu().numpy() + camera = PerspectiveCameras( + device=device, + focal_length=torch.tensor([[fx, fy]], dtype=torch.float32, device=device), + principal_point=torch.tensor([[cx, cy]], dtype=torch.float32, device=device), + R=[R], + T=[T], + image_size=torch.tensor([[height, width]], dtype=torch.float32, device=device), + in_ndc=False, + ) + return camera + + +def render_pytorch3d_rgbd(mesh, K, T_c2w, img_size, device="cuda:0"): + """ + Given a trimesh mesh (in world coordinate system), render using pytorch3d. + Args: + mesh: trimesh.Trimesh, should be in world coordinate system. + K: (3,3) camera intrinsic matrix, fx, fy, cx, cy. + T_c2w: (4,4) camera-to-world transformation matrix. + img_size: (H, W), output image (and depth) size. 
+ device: device string + Returns: + mask_img: (H,W) uint8 binary mask (1 for foreground, 0 for background) + depth_img: (H,W) float32 (Z buffer, 0 for background) + """ + if not isinstance(mesh, trimesh.Trimesh): + if hasattr(mesh, 'geometry') and len(mesh.geometry) > 0: + mesh = list(mesh.geometry.values())[0] + else: + raise ValueError('mesh is not a valid trimesh.Trimesh!') + device = torch.device(device if torch.cuda.is_available() else "cpu") + + verts = torch.tensor(np.asarray(mesh.vertices), dtype=torch.float32, device=device) + faces = torch.tensor(np.asarray(mesh.faces), dtype=torch.int64, device=device) + verts_rgb = torch.ones_like(verts)[None] * torch.tensor([[0.7, 0.7, 0.7]], dtype=torch.float32, device=device) + from pytorch3d.renderer import TexturesVertex + textures = TexturesVertex(verts_features=verts_rgb) + + mesh_p3d = Meshes(verts=[verts], faces=[faces], textures=textures) + + # Create camera using unified helper function + cameras = create_pytorch3d_camera(K, T_c2w, img_size, device=device) + + # Get image dimensions for rasterization + if isinstance(img_size, int): + H, W = img_size, img_size + else: + H, W = img_size + + blend_params = BlendParams(sigma=1e-4, gamma=1e-4) + raster_settings = RasterizationSettings( + image_size=(H, W), + blur_radius=0.0, + faces_per_pixel=8, + bin_size=1024, + max_faces_per_bin=1000000, + ) + lights = PointLights(device=device, location=[[0.0, 0.0, 5.0]]) + + renderer = MeshRenderer( + rasterizer=MeshRasterizer( + cameras=cameras, + raster_settings=raster_settings + ), + shader=SoftPhongShader( + device=device, + cameras=cameras, + lights=lights, + blend_params=blend_params + ), + ) + # Render RGB (used only for mask) + img = renderer(mesh_p3d).detach().cpu().numpy()[0][:,:,:3].clip(0, 1) * 255 + img = img.astype(np.uint8) + # Depth renderer + class DepthShader(torch.nn.Module): + def __init__(self, device="cpu"): + super().__init__() + self.device = device + def forward(self, fragments, meshes, **kwargs): + return fragments.zbuf + depth_renderer = MeshRenderer( + rasterizer=MeshRasterizer( + cameras=cameras, + raster_settings=raster_settings + ), + shader=DepthShader(device=device) + ) + depth = depth_renderer(mesh_p3d)[0, ..., 0].detach().cpu().numpy() # (H, W) + mask = (depth > 0).astype(np.uint8) + return img, mask, depth + + +def find_nearest_point(point, object_mask, erosion_size=8): + """ + Given a point and an object binary mask (H,W), return the nearest point in the mask, + preferring points deeper inside the mask (away from edges). 
+ + Args: + point: (x, y) point coordinates + object_mask: (H, W) binary mask + erosion_size: size of erosion kernel to remove edges (default: 5) + + Returns: + np.array([x, y]) or None if no valid point found + """ + H, W = object_mask.shape + # Convert mask to uint8 for cv2 operations + mask_uint8 = (object_mask * 255).astype(np.uint8) + + # Erode the mask to remove edges and get inner region + kernel = np.ones((erosion_size, erosion_size), np.uint8) + eroded_mask = cv2.erode(mask_uint8, kernel, iterations=1) + eroded_mask = (eroded_mask > 0).astype(bool) + + # Try to find point in eroded mask first (deeper inside) + ys, xs = np.where(eroded_mask) + if len(xs) > 0: + dists = np.sqrt((xs - point[0]) ** 2 + (ys - point[1]) ** 2) + min_idx = np.argmin(dists) + return np.array([xs[min_idx], ys[min_idx]]) + + # If eroded mask is empty, fall back to original mask + ys, xs = np.where(object_mask) + if len(xs) == 0: + return None + dists = np.sqrt((xs - point[0]) ** 2 + (ys - point[1]) ** 2) + min_idx = np.argmin(dists) + return np.array([xs[min_idx], ys[min_idx]]) + +def get_bbox_mask_from_mask(mask): + ys, xs = np.where(mask) + if len(xs) == 0: + return None + x_min = np.min(xs) + x_max = np.max(xs) + y_min = np.min(ys) + y_max = np.max(ys) + return x_min, x_max, y_min, y_max + +def compute_contact_point(kpts_2d, object_mask): + """ + Given 2D keypoints (N,2) and an object binary mask (H,W), return the mean keypoint location [x, y] + of all keypoints that fall inside the mask. + """ + kpts_2d = np.asarray(kpts_2d) + H, W = object_mask.shape + inside = [] + for kp in kpts_2d: + x, y = int(round(kp[0])), int(round(kp[1])) + if 0 <= y < H and 0 <= x < W: + if object_mask[y, x]: + inside.append(kp) + if len(inside) > 0: + point = np.mean(np.stack(inside, axis=0), axis=0) + px, py = int(round(point[0])), int(round(point[1])) + if 0 <= py < H and 0 <= px < W and object_mask[py, px]: + return point, True, True + else: + return point, True, False + else: + return np.mean(kpts_2d, axis=0), False, False + +def bbox_to_mask(bbox): + x, y, w, h = bbox + mask = np.zeros((h, w)) + mask[y:y+h, x:x+w] = True + return mask + + +def visualize_grasp_points(image, kpts_2d, contact_point, output_path): + """ + Simple visualization of fingertip keypoints and contact point on image. 
+ + Args: + image: Input image (H, W, 3) uint8 or float + kpts_2d: Fingertip keypoints array (N, 2) + contact_point: Contact point (2,) + output_path: Path to save visualization + """ + # Prepare image + vis_image = image.copy() + if vis_image.dtype != np.uint8: + if vis_image.max() <= 1.0: + vis_image = (vis_image * 255).astype(np.uint8) + else: + vis_image = vis_image.astype(np.uint8) + + # Convert RGB to BGR for OpenCV + if vis_image.shape[-1] == 3: + vis_image = cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR) + + H, W = vis_image.shape[:2] + + # Draw fingertip keypoints (blue) + kpts_2d = np.asarray(kpts_2d) + for kp in kpts_2d: + kp_x, kp_y = int(round(kp[0])), int(round(kp[1])) + if 0 <= kp_y < H and 0 <= kp_x < W: + cv2.circle(vis_image, (kp_x, kp_y), 1, (255, 0, 0), -1) # Blue filled circle + + # Draw contact point (red) + cp_x, cp_y = int(round(contact_point[0])), int(round(contact_point[1])) + if 0 <= cp_y < H and 0 <= cp_x < W: + cv2.circle(vis_image, (cp_x, cp_y), 1, (0, 0, 255), -1) # Red filled circle + + # Convert back to RGB and save + vis_image_rgb = cv2.cvtColor(vis_image, cv2.COLOR_BGR2RGB) + imageio.imwrite(str(output_path), vis_image_rgb) + print(f"[Info] Saved grasp point visualization to: {output_path}") + + +def grasp_point_extraction(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + for key in keys: + scene_dict = key_scene_dicts[key] + scene_dict["key"] = key # Store key in scene_dict for visualization + key_cfg = key_cfgs[key] + single_grasp_point_generation(scene_dict, key_cfg, key, base_dir) + key_scene_dicts[key] = scene_dict + with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + return key_scene_dicts + +def single_grasp_point_generation(scene_dict, key_cfg, key, base_dir): + """Generate grasp point for a single scene.""" + # Setup + gpu_id = key_cfg["gpu"] + device = f"cuda:{gpu_id}" + start_frame_idx = scene_dict["info"]["start_frame_idx"] + i = start_frame_idx + manipulated_oid = int(scene_dict["info"]["manipulated_oid"]) + + # Load camera parameters + T_c2w = np.array(scene_dict["info"]["camera"]["camera_opencv_to_world"]).astype(np.float64).reshape(4, 4) + K = np.array(scene_dict["intrinsics"]).astype(np.float32).reshape(3, 3) + img_size = scene_dict["height"], scene_dict["width"] + + # Load object model and trajectory + model_path = scene_dict["info"]["objects"][manipulated_oid]["optimized"] + model = trimesh.load(model_path) + + traj_key = scene_dict["info"]["traj_key"].replace("_recomputed", "") + print(f"[Info] Trajectory key: {traj_key}") + traj_path = scene_dict["info"]["objects"][manipulated_oid][traj_key] + traj = np.load(traj_path).reshape(-1, 4, 4) + start_pose = traj[start_frame_idx] + + # Transform model to world coordinate system + model = model.apply_transform(start_pose) + + # Render object to get mask and depth + rendered_image, mask_img, depth = render_pytorch3d_rgbd(model, K, T_c2w, img_size, device=device) + + # Find contact point from hand keypoints + kpts_2d = scene_dict["motion"]["hand_kpts"][i][[4, 8, 12, 16, 20]] + direction = scene_dict["motion"]["hand_global_orient"][i] + direction = direction.reshape(3, 3) + direction_world = T_c2w[:3, :3] @ direction + point_2d, is_close, is_inside = compute_contact_point(kpts_2d, mask_img) + if not is_close: + point_2d = find_nearest_point(point_2d, mask_img, erosion_size=key_cfg["affordance_erosion_pixel"]) + if point_2d is None: + print(f"[Error] Failed to find valid contact point in mask") + return None + + x, y = int(round(point_2d[0])), 
int(round(point_2d[1])) + + # Visualize keypoints and contact point + vis_dir = base_dir / f"outputs/{key}/motion/debug" + vis_dir.mkdir(parents=True, exist_ok=True) + vis_path = vis_dir / f"grasp_point_visualization_frame_{i:06d}.png" + visualize_grasp_points(rendered_image, kpts_2d, point_2d, vis_path) + + # Get depth value at the contact point + z = depth[y, x] if 0 <= y < depth.shape[0] and 0 <= x < depth.shape[1] and depth[y, x] > 0 else 0.0 + if z <= 0: + print(f"[Warning] Invalid depth at contact point ({x}, {y}), using nearest valid depth") + valid_mask = depth > 0 + if np.any(valid_mask): + y_coords, x_coords = np.where(valid_mask) + dists = np.sqrt((x_coords - x)**2 + (y_coords - y)**2) + nearest_idx = np.argmin(dists) + z = depth[y_coords[nearest_idx], x_coords[nearest_idx]] + else: + print(f"[Error] No valid depth found, cannot unproject point") + return None + + # Create camera object for unprojection (reuse the same helper function) + camera = create_pytorch3d_camera(K, T_c2w, img_size, device=device) + + # Unproject 2D point to 3D using PyTorch3D. The rasterized depth is an NDC z-buffer + # in [0, 1]. Convert pixel coordinates to NDC [-1, 1] before calling unproject. + if isinstance(img_size, int): + H = W = img_size + else: + H, W = img_size + xy_coords = torch.tensor([[x, y]], dtype=torch.float32, device=device) + depth_values = torch.tensor([[z]], dtype=torch.float32, device=device) + points_camera = camera.unproject_points( + xy_depth=torch.cat([xy_coords, depth_values], dim=1), + world_coordinates=False, + scaled_depth_input=False, + from_ndc=False, + ) + winner_point_3d = points_camera[0].cpu().numpy() # (3,) in camera coordinates + print(f"[Info] Winner point 3D: {winner_point_3d}") + # Convert from camera coordinates to object coordinates + # winner_point_3d is in camera coordinate system + # start_pose is object pose in world coordinate system + # To get point in object coordinate system: + # 1. camera -> world: T_c2w @ point_cam + # 2. 
world -> object: inv(start_pose) @ point_world + point_cam_homo = np.hstack([winner_point_3d, 1.0]) # (4,) + point_world = T_c2w @ point_cam_homo + point_world = point_world / point_world[3] + winner_point_to_obj = np.linalg.inv(start_pose) @ point_world + winner_point_to_obj = winner_point_to_obj[:3] / winner_point_to_obj[3] + scene_dict["info"]["objects"][manipulated_oid]["grasp_point"] = winner_point_to_obj.tolist() + scene_dict["info"]["objects"][manipulated_oid]["grasp_direction"] = direction_world.tolist() + + scene_json_path = base_dir / f"outputs/{key}/simulation/scene.json" + with open(scene_json_path, "r") as f: + scene_json = json.load(f) + scene_json["objects"][str(manipulated_oid)]["grasp_point"] = winner_point_to_obj.tolist() + scene_json["objects"][str(manipulated_oid)]["grasp_direction"] = direction_world.tolist() + with open(scene_json_path, "w") as f: + json.dump(scene_json, f, indent=2) + return winner_point_to_obj + + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + sys.path.append(str(base_dir / "openreal2sim" / "simulation")) + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + grasp_point_extraction(keys, key_scene_dicts, key_cfgs) + diff --git a/openreal2sim/motion/modules/hand_extraction.py b/openreal2sim/motion/modules/hand_extraction.py new file mode 100755 index 0000000..3686ccb --- /dev/null +++ b/openreal2sim/motion/modules/hand_extraction.py @@ -0,0 +1,191 @@ +## In this file: we use hand-object detector to decide the starting and ending frame of contact. +## Then we use wilor to extract the hand info. 
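+## Per frame we store (None when no hand is detected):
+##   hand_kpts          : (21, 2) MANO keypoints projected onto the image plane
+##   hand_global_orient : (3, 3) global wrist rotation from the WiLoR MANO prediction
+##   hand_masks         : (H, W) boolean SAM2 mask of the detected hand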
+ + +import os +import sys +import cv2 +import numpy as np +import torch +import yaml +from typing import List, Tuple, Dict, Optional +import logging +from pathlib import Path +import pickle +import logging +import tqdm +base_dir = Path.cwd() +sys.path.append(str(base_dir / "openreal2sim" / "motion" / "modules")) +sys.path.append(str(base_dir / "third_party" / "WiLoR")) +sys.path.append(str(base_dir / "third_party" / "Grounded-SAM-2")) + + +from wilor.utils.renderer import cam_crop_to_full +from wilor.models import load_wilor +from wilor.utils import recursive_to +from wilor.datasets.vitdet_dataset import ViTDetDataset +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor +from ultralytics import YOLO + +class WiLoRExtractor: + def __init__(self, + model_path: str, + cfg_path: str, + yolo_weights_path: str, + sam_weights_path: str, + sam_cfg_path: str, + device: str): + self._wilor_model, self._wilor_cfg = load_wilor(model_path, cfg_path) + self._wilor_model.eval() + self._yolo_detector = YOLO(yolo_weights_path) + self.device = torch.device(device) + self._sam_predictor = SAM2ImagePredictor(build_sam2(sam_cfg_path, sam_weights_path)) + + def process(self, images: np.ndarray, batch_size: int = 16, rescale_factor: float = 1.0): + boxes = [] + right = [] + masks = [] + self._wilor_model.to(self.device) + self._yolo_detector.to(self.device) + self._sam_predictor.model.to(self.device) + self._wilor_model.eval() + + all_global_orient = [] + all_kpts = [] + all_masks = [] + + for i in tqdm.tqdm(range(0, len(images)), desc="Detecting hands"): + batch = np.array(images)[i] + with torch.no_grad(): + detections = self._yolo_detector.predict(batch, conf=0.3, verbose=False, save=False, show=False)[0] + + if len(detections.boxes.cls.cpu().detach().numpy()) == 0: + all_global_orient.append(None) + all_kpts.append(None) + all_masks.append(None) + else: + det = max(detections, key=lambda d: d.boxes.conf.cpu().detach().item()) + Bbox = det.boxes.data.cpu().detach().squeeze().numpy() + cls_flag = det.boxes.cls.cpu().detach().squeeze().item() + + self._sam_predictor.set_image(batch) + hand_masks, scores, logits = self._sam_predictor.predict( + point_coords=None, + point_labels=None, + box=np.array([[Bbox[0], Bbox[1], Bbox[2], Bbox[3]]]), + multimask_output=False + ) + hand_masks = hand_masks[0] + hand_masks = hand_masks.astype(bool) + dataset = ViTDetDataset(self._wilor_cfg, batch, np.array([Bbox[:4]]), np.array([cls_flag]), rescale_factor=rescale_factor) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0) + sign = False + for batch in dataloader: + assert sign == False + batch = recursive_to(batch, torch.device(self.device)) + with torch.no_grad(): + out = self._wilor_model(batch) + multiplier = 2 * batch['right'][0].cpu().numpy() - 1 + pred_cam = out['pred_cam'][0] + pred_cam[1] = multiplier * pred_cam[1] + box_center = batch['box_center'][0].float() + box_size = batch['box_size'][0].float() + img_size = batch['img_size'][0].float() + scaled_focal_length = self._wilor_cfg.EXTRA.FOCAL_LENGTH / self._wilor_cfg.MODEL.IMAGE_SIZE * img_size.max() + #import pdb; pdb.set_trace() + pred_cam_t_full = cam_crop_to_full(pred_cam.reshape(1, 3), box_center.reshape(1, 2), box_size.reshape(1, 1), img_size.reshape(1, 2), scaled_focal_length).detach().cpu().numpy() + batch_size = batch['img'].shape[0] + joints = out['pred_keypoints_3d'][0].detach().cpu().numpy() + joints[:, 0] = multiplier * joints[:, 0] + cam_t = pred_cam_t_full[0] + 
kpts_2d = self.project_full_img(joints, cam_t, float(scaled_focal_length), img_size) + all_kpts.append(kpts_2d) + all_global_orient.append(out['pred_mano_params']['global_orient'][0,0].detach().cpu().numpy()) + all_masks.append(hand_masks) + sign = True + return all_kpts, all_global_orient, all_masks + + + + + + def project_full_img(self, points, cam_trans, focal_length, img_res): + ''' we use simple K here. It works.''' + if not isinstance(points, torch.Tensor): + points = torch.tensor(points, dtype=torch.float32) + if not isinstance(cam_trans, torch.Tensor): + cam_trans = torch.tensor(cam_trans, dtype=torch.float32) + # Ensure numeric image resolution + try: + img_w = float(img_res[0]) + img_h = float(img_res[1]) + except Exception: + # Fallback for unexpected types + img_w, img_h = float(img_res[0].item()), float(img_res[1].item()) + K = torch.eye(3, dtype=torch.float32) + K[0, 0] = float(focal_length) + K[1, 1] = float(focal_length) + K[0, 2] = img_w / 2.0 + K[1, 2] = img_h / 2.0 + pts = points + cam_trans + pts = pts / pts[..., -1:] + V_2d = (K @ pts.T).T + return V_2d[..., :-1].detach().cpu().numpy() + + + + + + +def hand_extraction(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + model_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "wilor_final.ckpt" + cfg_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "model_config.yaml" + yolo_weights_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "detector.pt" + sam_cfg_path = "configs/sam2.1/sam2.1_hiera_l.yaml" + sam_weights_path = "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + + for key in keys: + + scene_dict = key_scene_dicts[key] + config = key_cfgs[key] + gpu_id = config['gpu'] + device = f"cuda:{gpu_id}" + wilor_extractor = WiLoRExtractor(model_path=model_path, cfg_path=cfg_path, yolo_weights_path=yolo_weights_path, sam_weights_path=sam_weights_path, sam_cfg_path=sam_cfg_path, device=device) + #images = np.transpose(scene_dict["images"], (0, 3, 1, 2)) + images = scene_dict["images"].astype(np.float32) + kpts, global_orient, masks = wilor_extractor.process(images) + if scene_dict.get("motion") is None: + scene_dict["motion"] = {} + scene_dict["motion"]["hand_kpts"] = kpts + scene_dict["motion"]["hand_global_orient"] = global_orient + scene_dict["motion"]["hand_masks"] = masks + key_scene_dicts[key] = scene_dict + with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + + + return key_scene_dicts + + + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + hand_extraction(keys, key_scene_dicts, key_cfgs) + + diff --git a/openreal2sim/motion/motion_manager.py b/openreal2sim/motion/motion_manager.py new file mode 100644 index 0000000..acc7dcf --- /dev/null +++ b/openreal2sim/motion/motion_manager.py @@ -0,0 +1,86 @@ +from pathlib import Path +import sys +import pickle +import yaml +import argparse + +sys.path.append(str(Path.cwd() / "openreal2sim" / "motion" / "modules")) +sys.path.append(str(Path.cwd() / "openreal2sim" / "motion" / "utils")) + +from 
utils.compose_config import compose_configs + +class MotionAgent: + def __init__(self, stage=None, key=None): + print('[Info] Initializing MotionAgent...') + self.base_dir = Path.cwd() + cfg_path = self.base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + self.keys = [key] if key is not None else cfg["keys"] + self.key_cfgs = {key: compose_configs(key, cfg) for key in self.keys} + self.key_scene_dicts = {} + for key in self.keys: + scene_pkl = self.base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + self.key_scene_dicts[key] = scene_dict + self.stages = [ + #"hand_extraction", + "demo_motion_process", + "grasp_point_extraction", + "grasp_generation", + ] + if stage is not None: + if stage in self.stages: + start_idx = self.stages.index(stage) + self.stages = self.stages[start_idx:] + else: + print(f"[Warning] Stage '{stage}' not found. It must be one of {self.stages}. Running all stages by default.") + print('[Info] MotionAgent initialized.') + + def save_scene_dicts(self): + for key, scene_dict in self.key_scene_dicts.items(): + with open(self.base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + print('[Info] Scene dictionaries saved.') + + def hand_extraction(self): + from modules.hand_extraction import hand_extraction + self.key_scene_dicts = hand_extraction(self.keys, self.key_scene_dicts, self.key_cfgs) + print("[Info] Hand extraction completed.") + + def demo_motion_process(self): + from modules.demo_motion_process import demo_motion_process + self.key_scene_dicts = demo_motion_process(self.keys, self.key_scene_dicts, self.key_cfgs) + print("[Info] Demo motion process completed.") + + def grasp_point_extraction(self): + from modules.grasp_point_extraction import grasp_point_extraction + self.key_scene_dicts = grasp_point_extraction(self.keys, self.key_scene_dicts, self.key_cfgs) + print("[Info] Grasp point extraction completed.") + + def grasp_generation(self): + from modules.grasp_generation import grasp_generation + grasp_generation(self.keys) + print("[Info] Grasp generation completed.") + + def run(self): + if "hand_extraction" in self.stages: + self.hand_extraction() + if "demo_motion_process" in self.stages: + self.demo_motion_process() + if "grasp_point_extraction" in self.stages: + self.grasp_point_extraction() + if "grasp_generation" in self.stages: + self.grasp_generation() + self.save_scene_dicts() + print('[Info] MotionAgent run completed.') + return self.key_scene_dicts + +if __name__ == '__main__': + args = argparse.ArgumentParser() + args.add_argument('--stage', type=str, default=None, help='Starting from a certain stage') + args.add_argument('--key', type=str, default=None, help='Process a single key instead of all keys from config') + args = args.parse_args() + + agent = MotionAgent(stage=args.stage, key=args.key) + scene_dicts = agent.run() diff --git a/openreal2sim/motion/utils/compose_config.py b/openreal2sim/motion/utils/compose_config.py new file mode 100755 index 0000000..6f30c1b --- /dev/null +++ b/openreal2sim/motion/utils/compose_config.py @@ -0,0 +1,23 @@ + +default_config = { + "gpu": "0", + "rescore_strategy": "point_only", + "n_points": 5000, + "keep": 10, + "nms": True, + "overwrite": False, + "vis_pts_per_gripper": 400, + "affordance_erosion_pixel": 5 +} + +def compose_configs(key_name: str, config: dict) -> dict: + ret_key_config = {} + local_config = config["local"].get(key_name, {}) + local_config = local_config.get("motion", {}) 
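+ # Per-parameter precedence below: per-key local["<key>"]["motion"] > global["motion"] > default_config,
+ # e.g. a global "keep" setting applies to every key unless that key's local motion block overrides it.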
+ global_config = config.get("global", {}) + global_config = global_config.get("motion", {}) + for param in default_config.keys(): + value = local_config.get(param, global_config.get(param, default_config[param])) + ret_key_config[param] = value + print(f"[Info] Config for {key_name}: {ret_key_config}") + return ret_key_config \ No newline at end of file diff --git a/openreal2sim/motion/utils/grasp_utils.py b/openreal2sim/motion/utils/grasp_utils.py new file mode 100755 index 0000000..0c0a170 --- /dev/null +++ b/openreal2sim/motion/utils/grasp_utils.py @@ -0,0 +1,391 @@ +import os +import sys +import numpy as np +import time +import torch +from typing import List +import open3d as o3d +from graspnetAPI.grasp import GraspGroup +from pathlib import Path +base_dir = Path.cwd() +from graspness_unofficial.models.graspnet import GraspNet, pred_decode +from graspness_unofficial.dataset.graspnet_dataset import minkowski_collate_fn +from graspness_unofficial.utils.collision_detector import ModelFreeCollisionDetector +from graspness_unofficial.utils.data_utils import CameraInfo, create_point_cloud_from_depth_image, get_workspace_mask +from scipy.spatial.transform import Rotation as R +import trimesh +grasp_cfgs = { + "save_files": False, + "checkpoint_path": base_dir / "third_party" / "graspness_unofficial" / "ckpt" / "minkuresunet_kinect.tar", #hardcode path + "seed_feat_dim": 512, + "camera": "kinect", + "num_point": 80000, + "batch_size": 1, + "voxel_size": 0.001, # 0.005 + "collision_thresh": 0.00001, + "voxel_size_cd": 0.01, # 0.01 + "infer": True, +} + +def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + pass + +class MyGraspNet(): + def __init__(self): + cfgs = grasp_cfgs + self.cfgs = cfgs + self.net = GraspNet(seed_feat_dim=cfgs["seed_feat_dim"], is_training=False) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.net.to(self.device) + # Load checkpoint + checkpoint = torch.load(cfgs["checkpoint_path"]) + self.net.load_state_dict(checkpoint['model_state_dict']) + start_epoch = checkpoint['epoch'] + print("-> loaded checkpoint %s (epoch: %d)" % (cfgs["checkpoint_path"], start_epoch)) + + self.net.eval() + + def inference(self, pcs): + + data_dict = {'point_clouds': pcs.astype(np.float32), + 'coors': pcs.astype(np.float32) / self.cfgs["voxel_size"], + 'feats': np.ones_like(pcs).astype(np.float32)} + batch_data = minkowski_collate_fn([data_dict]) + tic = time.time() + for key in batch_data: + if 'list' in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to(self.device) + else: + batch_data[key] = batch_data[key].to(self.device) + + # Forward pass + with torch.no_grad(): + end_points = self.net(batch_data) + grasp_preds = pred_decode(end_points) + + preds = grasp_preds[0].detach().cpu().numpy() + gg = GraspGroup(preds) + + # collision detection + if self.cfgs["collision_thresh"] > 0: + cloud = data_dict['point_clouds'] + mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=self.cfgs["voxel_size_cd"]) + collision_mask = mfcdetector.detect(gg, approach_dist=0.05, collision_thresh=self.cfgs["collision_thresh"]) + gg = gg[~collision_mask] + + toc = time.time() + # print('inference time: %fs' % (toc - tic)) + return gg + +def inference(cfgs, data_input): + batch_data = minkowski_collate_fn([data_input]) + net = GraspNet(seed_feat_dim=cfgs["seed_feat_dim"], is_training=False) + device = torch.device("cuda:0" if 
torch.cuda.is_available() else "cpu") + net.to(device) + + # Load checkpoint + checkpoint = torch.load(cfgs["checkpoint_path"]) + net.load_state_dict(checkpoint['model_state_dict']) + start_epoch = checkpoint['epoch'] + print("-> loaded checkpoint %s (epoch: %d)" % (cfgs["checkpoint_path"], start_epoch)) + + net.eval() + tic = time.time() + + for key in batch_data: + if 'list' in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to(device) + else: + batch_data[key] = batch_data[key].to(device) + + # Forward pass + with torch.no_grad(): + end_points = net(batch_data) + grasp_preds = pred_decode(end_points) + + preds = grasp_preds[0].detach().cpu().numpy() + gg = GraspGroup(preds) + # collision detection + if cfgs["collision_thresh"] > 0: + cloud = data_input['point_clouds'] + mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=cfgs["voxel_size_cd"]) + collision_mask = mfcdetector.detect(gg, approach_dist=0.05, collision_thresh=cfgs["collision_thresh"]) + gg = gg[~collision_mask] + + toc = time.time() + # print('inference time: %fs' % (toc - tic)) + return gg + +def pointcloud_to_grasp(cfgs, pcs): + data_dict = {'point_clouds': pcs.astype(np.float32), + 'coors': pcs.astype(np.float32) / cfgs["voxel_size"], + 'feats': np.ones_like(pcs).astype(np.float32)} + gg = inference(cfgs, data_dict) + return gg + +def vis_grasp(pcs, gg): + gg = gg.nms() + gg = gg.sort_by_score() + keep = 1 + if gg.__len__() > keep: + gg = gg[:keep] + grippers = gg.to_open3d_geometry_list() + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(pcs.astype(np.float32)) + o3d.visualization.draw_geometries([cloud, *grippers]) + +def grasp_to_pointcloud(grippers, gripper_points=1000, gripper_color=[1, 0, 0]): + grippers_pcd = o3d.geometry.PointCloud() + for gripper in grippers: + g_pcd = gripper.sample_points_uniformly(gripper_points) + grippers_pcd += g_pcd + grippers_pcd.colors = o3d.utility.Vector3dVector(np.tile(np.array([gripper_color]), (len(grippers_pcd.points), 1))) + return grippers_pcd + + +def vis_save_grasp(points, gg, best_grasp, save_path, colors=None, grasp_position=None, place_position=None): + # visualize grasp pos, place pos, grasp poses, and pcd + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points.astype(np.float32)) + + if colors is None: + cloud.colors = o3d.utility.Vector3dVector(np.tile(np.array([[0, 1, 0]]), (len(cloud.points), 1))) + elif isinstance(colors, np.ndarray): + cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float32)) + elif isinstance(colors, list): + cloud.colors = o3d.utility.Vector3dVector(np.tile(np.array([colors]), (len(cloud.points), 1))) + + if type(gg) == GraspGroup: + pcd_w_grasp = grasp_to_pointcloud(gg.to_open3d_geometry_list()) + else: + pcd_w_grasp = grasp_to_pointcloud([gg.to_open3d_geometry()]) + + pcd_best_grasp = grasp_to_pointcloud([best_grasp.to_open3d_geometry()], gripper_color=[0, 0, 1]) + pcd_w_grasp += pcd_best_grasp + + if grasp_position is not None and place_position is not None: + pick_pcd = o3d.geometry.PointCloud() + place_pcd = o3d.geometry.PointCloud() + pick_pcd.points = o3d.utility.Vector3dVector(np.array(grasp_position).reshape(1,3).astype(np.float32)) + place_pcd.points = o3d.utility.Vector3dVector(np.array(place_position).reshape(1,3).astype(np.float32)) + pick_pcd.colors = place_pcd.colors = o3d.utility.Vector3dVector(np.array([[0,0,1]]).astype(np.float32)) + pcd_w_grasp = pcd_w_grasp + 
pick_pcd + place_pcd + + pcd_w_grasp += cloud + + o3d.io.write_point_cloud(save_path, pcd_w_grasp) + +def get_pose_from_grasp(best_grasp): + grasp_position = best_grasp.translation + # convert rotation to isaacgym convention + delta_m = np.array([[0, 0, 1], [0, -1, 0], [1, 0, 0]]) + rotation = np.dot(best_grasp.rotation_matrix, delta_m) + quaternion_grasp = R.from_matrix(rotation).as_quat() + quaternion_grasp = np.array([quaternion_grasp[3],quaternion_grasp[0],quaternion_grasp[1],quaternion_grasp[2]]) + rotation_unit_vect = rotation[:,2] + # grasp_position -= 0.03 * rotation_unit_vect + return grasp_position, quaternion_grasp, rotation_unit_vect + +def get_best_grasp(gg, position, max_dis=0.05): + # get best grasp pose around the grasp position + best_grasp = None + for g in gg: + if np.linalg.norm(g.translation - position) < max_dis: + if best_grasp is None: + best_grasp = g + else: + if g.score > best_grasp.score: + best_grasp = g + if best_grasp is None: + best_grasp = gg[0] + return best_grasp + +def get_best_grasp_z_aligned(gg): + # get best grasp pose that is facing the -z axis (for top-down grasping) + best_grasp = None + best_angle = np.inf + + gravity_direction = np.array([0, 0, -1]) # -Z direction in world coordinates + + for g in gg: + + # approach vector is +X axis of the grasp frame, make it close to the -Z axis in the world frame + approach_vector = g.rotation_matrix[:, 0] + approach_vector /= np.linalg.norm(approach_vector) + + # compute the angle between the approach vector and the gravity direction + angle = np.arccos(np.clip(np.dot(approach_vector, gravity_direction), -1.0, 1.0)) + + if angle < best_angle: + best_angle = angle + best_grasp = g + elif np.isclose(angle, best_angle) and g.score > best_grasp.score: + best_grasp = g + + if best_grasp is None: + print("No best grasp found, falling back to the first grasp.") + best_grasp = gg[0] # fallback + + return best_grasp + +def get_best_grasp_z_aligned_and_y_aligned(gg, + desired_approach_dir=np.array([0, 0, -1]), + desired_width_dir=np.array([0, 1, 0]), + angle_tolerance=0.2): + """ + Pick the grasp whose: + 1) X-axis (approach) is closest to desired_approach_dir (default -Z), + 2) Y-axis (gripper width) is closest to desired_width_dir (default +Y), + 3) Score is highest if angles tie. 
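+ Angles within `angle_tolerance` radians are treated as ties and fall through to the next criterion.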
+ + Grasp coordinate system: + - rotation_matrix[:, 0] → X-axis = Approach direction + - rotation_matrix[:, 1] → Y-axis = Gripper width direction + - rotation_matrix[:, 2] → Z-axis = Depth/thickness direction + """ + + best_grasp = None + # Track best angles and score + best_approach_angle = np.inf + best_width_angle = np.inf + best_score = -np.inf + + # Normalize desired directions + desired_approach_dir = desired_approach_dir / np.linalg.norm(desired_approach_dir) + desired_width_dir = desired_width_dir / np.linalg.norm(desired_width_dir) + + for g in gg: + # 1) Approach vector angle + approach_vec = g.rotation_matrix[:, 0] + approach_vec /= np.linalg.norm(approach_vec) + approach_angle = angle_between(approach_vec, desired_approach_dir) + + # 2) Width vector angle + width_vec = g.rotation_matrix[:, 1] + width_vec /= np.linalg.norm(width_vec) + width_angle = angle_between(width_vec, desired_width_dir) + + # 3) Compare to the "best" so far in a hierarchical manner + if approach_angle < best_approach_angle: + # Definitely better in terms of approach alignment => choose this + best_approach_angle = approach_angle + best_width_angle = width_angle + best_score = g.score + best_grasp = g + elif np.isclose(approach_angle, best_approach_angle, atol=angle_tolerance): + # Approach angles are essentially tied, compare width alignment + if width_angle < best_width_angle: + best_width_angle = width_angle + best_score = g.score + best_grasp = g + elif np.isclose(width_angle, best_width_angle, atol=angle_tolerance): + # Both angles tied, pick the higher score + if g.score > best_score: + best_score = g.score + best_grasp = g + + if best_grasp is None and len(gg) > 0: + print("No valid grasp found using angle criteria. Falling back to the first grasp.") + best_grasp = gg[0] + + return best_grasp + + +def get_best_grasp_with_hints(gg: GraspGroup, point: List[float] = None, direction: List[float] = None): + """ + Rescore all grasps using optional spatial and directional hints, then return a new + GraspGroup sorted by this custom score (best first). Does NOT mutate the original gg. + + Scoring terms in [0, 1], combined by a weighted sum: + - dir_term: alignment between grasp approach (+X axis) and `direction` + - pt_term : proximity to `point` (RBF over distance) + - net_term: original network score normalized over gg + + Args: + gg: GraspGroup from graspnetAPI. + point: (3,) world point. If provided, grasps closer to this point are preferred. + direction: (3,) world direction. If provided, grasps whose approach (+X) aligns + with this direction are preferred. + + Returns: + GraspGroup: a *new* group sorted by the custom score (descending). + The best guess is result[0]. + """ + # --- Early exits --- + if gg is None or len(gg) == 0: + return gg + + # Internal weights (you can tweak if needed) + w_dir = 1.0 + w_pt = 1.0 + w_net = 0 + + # If hints are missing, zero-out the corresponding weights + if point is None: + w_pt = 0.0 + if direction is None or (np.asarray(direction).shape != (3,)): + w_dir = 0.0 + + # Length-scale for the point proximity (meters). Similar to your 0.05 window. 
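+ # pt_term = exp(-0.5 * d^2 / sigma^2): with sigma = 0.05 m, a grasp 5 cm from the hint point scores ~0.61 and one 10 cm away ~0.14.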
+ sigma = 0.05 + sigma2 = max(sigma * sigma, 1e-12) + + # --- Gather per-grasp attributes --- + translations = [] + approach_axes = [] # grasp frame +X + net_scores = [] + for g in gg: + translations.append(g.translation.astype(np.float64)) + # Normalize +X axis as approach direction + ax = g.rotation_matrix[:, 0].astype(np.float64) + n = np.linalg.norm(ax) + approach_axes.append(ax / n if n > 0 else np.array([1.0, 0.0, 0.0], dtype=np.float64)) + net_scores.append(float(g.score)) + + translations = np.vstack(translations) # (N,3) + approach_axes = np.vstack(approach_axes) # (N,3) + net_scores = np.asarray(net_scores, dtype=np.float64) # (N,) + + # --- Normalize original network scores to [0,1] --- + if np.isfinite(net_scores).all() and (net_scores.max() > net_scores.min()): + net_term = (net_scores - net_scores.min()) / (net_scores.max() - net_scores.min()) + else: + net_term = np.zeros_like(net_scores) + + # --- Direction alignment term (cosine mapped to [0,1]) --- + if w_dir > 0.0: + d = np.asarray(direction, dtype=np.float64) + nd = np.linalg.norm(d) + if nd > 0: + d = d / nd + cosv = np.clip((approach_axes * d[None, :]).sum(axis=1), -1.0, 1.0) + dir_term = 0.5 * (cosv + 1.0) # map [-1,1] -> [0,1] + else: + dir_term = np.zeros(len(gg), dtype=np.float64) + else: + dir_term = np.zeros(len(gg), dtype=np.float64) + + # --- Point proximity term (RBF over Euclidean distance) --- + if w_pt > 0.0: + p = np.asarray(point, dtype=np.float64).reshape(1, 3) + dists = np.linalg.norm(translations - p, axis=1) + pt_term = np.exp(-0.5 * (dists * dists) / sigma2) # in (0,1] + else: + pt_term = np.zeros(len(gg), dtype=np.float64) + + # --- Combine --- + total_score = w_dir * dir_term + w_pt * pt_term + w_net * net_term + gg.scores = total_score + gg.sort_by_score() # Sort in-place by the new score (best first) + return gg + + +def angle_between(v1, v2): + """Utility to compute the angle between two normalized vectors in [0, π].""" + dot_val = np.clip(np.dot(v1, v2), -1.0, 1.0) + return np.arccos(dot_val) diff --git a/openreal2sim/motion/utils/notification.py b/openreal2sim/motion/utils/notification.py new file mode 100755 index 0000000..a36c546 --- /dev/null +++ b/openreal2sim/motion/utils/notification.py @@ -0,0 +1,42 @@ +import requests + +ntfy_url = "" + +def notify_success(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction completed", + "Tags": "white_check_mark" + } + ) + except: + pass + +def notify_started(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction started", + "Tags": "watch" + } + ) + except: + pass + +def notify_failed(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction failed", + "Tags": "x" + } + ) + except: + pass \ No newline at end of file diff --git a/openreal2sim/motion/utils/visualize_scene.py b/openreal2sim/motion/utils/visualize_scene.py new file mode 100644 index 0000000..7d6cdd0 --- /dev/null +++ b/openreal2sim/motion/utils/visualize_scene.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Aggregate (all frames) export for object trajectories in the world frame. 
+ +Outputs: +- One PLY containing: + * Background point cloud (downsampled once, light gray) + * For each frame (with stride) and each object: + - Object point cloud transformed by Δ_w[frame] + - Object center point at that frame (brighter color) + Optionally encodes time in alpha channel (RGBA): early=dark, late=bright. + +- (Optional) One GLB containing all frames at once (can be large): + * Background mesh once + * For each frame (with stride), one duplicated and transformed mesh per object. +""" + +import json +from pathlib import Path +import argparse +import numpy as np +import trimesh +import yaml + + +def _create_arrow_mesh(origin: np.ndarray, direction: np.ndarray, length: float, radius: float = 0.01) -> trimesh.Trimesh: + """Construct an arrow mesh even if `trimesh.creation.arrow` is missing.""" + direction = np.asarray(direction, dtype=np.float64) + norm = np.linalg.norm(direction) + if norm < 1e-9: + raise ValueError("Grasp direction must be non-zero.") + direction = direction / norm + shaft_length = length * 0.75 + tip_length = length - shaft_length + + shaft = trimesh.creation.cylinder(radius=radius, height=shaft_length, sections=18) + shaft.apply_translation([0.0, 0.0, shaft_length / 2.0]) + + tip = trimesh.creation.cone(radius=radius * 1.6, height=tip_length, sections=18) + tip.apply_translation([0.0, 0.0, shaft_length + tip_length]) + + arrow = trimesh.util.concatenate([shaft, tip]) + + align_tf = trimesh.geometry.align_vectors(np.array([0.0, 0.0, 1.0]), direction) + arrow.apply_transform(align_tf) + arrow.apply_translation(origin) + return arrow + + +# ---------------- utils ---------------- +def load_scene(scene_json: Path): + scene = json.loads(Path(scene_json).read_text()) + bg_path = Path(scene["background"]["registered"]) + objs = scene["objects"] + return scene, bg_path, objs + +def sample_surface_points(mesh: trimesh.Trimesh, n_pts: int) -> np.ndarray: + """ + Return (N,3) uniformly sampled surface points. + Prefer sample_surface_even; fallback to sample_surface. + """ + if n_pts <= 0: + return mesh.vertices.view(np.ndarray).copy() + try: + from trimesh.sample import sample_surface_even + pts, _ = sample_surface_even(mesh, n_pts) + except Exception: + from trimesh.sample import sample_surface + pts, _ = sample_surface(mesh, n_pts) + return pts.astype(np.float64, copy=False) + +def voxel_downsample(points: np.ndarray, voxel: float) -> np.ndarray: + """ + Simple voxel downsampling by snapping to a voxel grid and + keeping the first sample per voxel. 
+ """ + if voxel is None or voxel <= 0 or len(points) == 0: + return points + grid = np.floor(points / float(voxel)).astype(np.int64) + _, keep_idx = np.unique(grid, axis=0, return_index=True) + keep_idx.sort() + return points[keep_idx] + +def transform_points(D: np.ndarray, pts: np.ndarray) -> np.ndarray: + """Row-vector transform: p' = p * R^T + t""" + R = D[:3, :3] + t = D[:3, 3] + return pts @ R.T + t + +def transform_point(D: np.ndarray, p: np.ndarray) -> np.ndarray: + R = D[:3, :3] + t = D[:3, 3] + return p @ R.T + t + +def color_palette(k: int) -> np.ndarray: + base = np.array([ + [230, 57, 70], + [29, 161, 242], + [39, 174, 96], + [155, 89, 182], + [241, 196, 15], + [230, 126, 34], + [52, 152, 219], + [46, 204, 113], + [243, 156, 18], + [231, 76, 60], + ], dtype=np.uint8) + if k <= base.shape[0]: + return base[:k] + reps = int(np.ceil(k / base.shape[0])) + pal = np.vstack([base for _ in range(reps)])[:k] + return pal + +def save_scene_glb_all(bg_mesh, per_frame_meshes, out_path: Path): + """ + Save a single GLB with background once and all per-frame object mesh copies. + per_frame_meshes: List[(node_name, trimesh.Trimesh)] for all frames. + """ + sc = trimesh.Scene() + sc.add_geometry(bg_mesh, node_name="background") + for name, m in per_frame_meshes: + sc.add_geometry(m, node_name=name) + out_path.parent.mkdir(parents=True, exist_ok=True) + sc.export(str(out_path), file_type="glb") + +# ---------------- core: single-key visualization (callable) ---------------- +def visualize_single_scene( + key: str, + out_dir: Path | None = None, # None -> outputs//debug + *, + bg_samples: int = 60000, # default background sample count + obj_samples: int = 12000, # default object sample count + bg_voxel: float = 0.01, # meters + obj_voxel: float = 0.005, # meters + stride: int = 1, + save_glb_all: bool = False, + encode_time_in_alpha: bool = False, + mesh_key: str = "optimized", + traj_key: str = "final_trajs", +) -> dict: + """ + Render a single combined PLY (and optional GLB) for one key. 
+ + Args: + key: scene key under outputs//simulation/scene.json + out_dir: if None -> outputs//debug ; otherwise results write to // + bg_samples, obj_samples: sample counts + bg_voxel, obj_voxel: voxel sizes in meters + stride: frame stride + save_glb_all: also export a GLB with all frames (heavy) + encode_time_in_alpha: encode time as alpha in PLY colors + mesh_key: which object mesh field to use (default "fdpose"; also "refined") + traj_key: which trajectory field to use (default "trajs"; e.g., "trajs_hybrid") + + Returns: + dict: {"ply": , "glb": , "frames": int, "points": int} + """ + base = Path.cwd() + scene_json = base / "outputs" / key / "simulation" / "scene.json" + out_dir = base / "outputs" / key / "motion" / "debug" + out_dir.mkdir(parents=True, exist_ok=True) + + scene, bg_path, objs = load_scene(scene_json) + assert bg_path.exists(), f"[{key}] background_aligned.glb not found: {bg_path}" + if len(objs.keys()) == 0: + raise RuntimeError(f"[{key}] scene.objects is empty") + + print(f"[{key}] load background: {bg_path}") + bg_mesh = trimesh.load(str(bg_path), force='mesh') + + # --- background point cloud (once) --- + bg_pts = sample_surface_points(bg_mesh, bg_samples) + bg_pts = voxel_downsample(bg_pts, bg_voxel) + bg_color = np.array([180, 180, 180], dtype=np.uint8) + + # --- preload all objects --- + palette = color_palette(len(objs)) + seq_lengths = [] + obj_infos = [] + for oid, obj in objs.items(): + oid = obj["oid"] + oname = obj["name"] + name = f"{oid}_{oname}" + + # 1) mesh field (e.g., fdpose or refined) + mesh_path = Path(obj.get(mesh_key, "")) + assert mesh_path.exists(), f"[{key}] object mesh '{mesh_key}' not found: {mesh_path}" + + # 2) trajectory field (e.g., trajs or trajs_hybrid) + trajs_path = Path(obj.get(traj_key, "")) + if not trajs_path.exists(): + traj_key = "fdpose_trajs" + trajs_path = Path(obj.get(traj_key, "")) + rel = np.load(str(trajs_path)) + N = rel.shape[0] + seq_lengths.append(N) + rel = np.stack([np.eye(4) for _ in range(N)]) + else: + rel = np.load(str(trajs_path)) # [N,4,4] + assert rel.ndim == 3 and rel.shape[1:] == (4,4), f"[{key}] trajs shape invalid: {rel.shape}" + N = rel.shape[0] + seq_lengths.append(N) + + + print(f"[{key}] load object {name}: mesh={mesh_path.name} ({mesh_key}), trajs={trajs_path.name} ({traj_key})") + base_mesh = trimesh.load(str(mesh_path), force='mesh') # world-space at frame 0 + + + obj_pts0 = sample_surface_points(base_mesh, obj_samples) + obj_pts0 = voxel_downsample(obj_pts0, obj_voxel) + center0 = base_mesh.centroid.view(np.ndarray) + + obj_infos.append({ + "name": name, + "base_mesh": base_mesh, + "rel": rel.astype(np.float64), + "pts0": obj_pts0.astype(np.float64), + "center0": center0.astype(np.float64), + "color": palette[0].astype(np.uint8) + }) + print(f"[{key}] object {name}: traj_len={N}, pts0={len(obj_pts0)}") + + # unify sequence length by the shortest and apply stride + N_all = min(seq_lengths) + idxs = np.arange(0, N_all, max(1, int(stride))) + n_frames = len(idxs) + print(f"[{key}] aggregating {n_frames} frames into ONE PLY (and optional ONE GLB)") + + # --- aggregate PLY --- + pts_all_list = [] + cols_all_list = [] + + # add background first + pts_all_list.append(bg_pts) + if encode_time_in_alpha: + alpha_bg = np.full((len(bg_pts), 1), 255, dtype=np.uint8) # background alpha = 255 + cols_all_list.append(np.hstack([np.repeat(bg_color[None, :], len(bg_pts), axis=0), alpha_bg])) + else: + cols_all_list.append(np.repeat(bg_color[None, :], len(bg_pts), axis=0)) + + # optional GLB: collect per-frame 
object mesh copies + all_mesh_copies = [] # (name, mesh) + + for k, i in enumerate(idxs): + # time alpha from 64 -> 255 + if encode_time_in_alpha: + alpha_val = int(np.clip(64 + (191 * (k / max(1, n_frames - 1))), 64, 255)) + for info in obj_infos: + Di = info["rel"][i] + # object cloud + pts_i = transform_points(Di, info["pts0"]) + pts_all_list.append(pts_i) + + if encode_time_in_alpha: + alpha = np.full((len(pts_i), 1), alpha_val, dtype=np.uint8) + cols = np.hstack([np.repeat(info["color"][None, :], len(pts_i), axis=0), alpha]) + else: + cols = np.repeat(info["color"][None, :], len(pts_i), axis=0) + cols_all_list.append(cols) + + # object center (brighter) + c_i = transform_point(Di, info["center0"]) + pts_all_list.append(c_i.reshape(1, 3)) + ctr_col = np.clip(info["color"].astype(int) + 50, 0, 255).astype(np.uint8) + if encode_time_in_alpha: + cols_all_list.append(np.array([[ctr_col[0], ctr_col[1], ctr_col[2], alpha_val]], dtype=np.uint8)) + else: + cols_all_list.append(ctr_col.reshape(1, 3).astype(np.uint8)) + + # GLB accumulation (optional) + if save_glb_all: + m = info["base_mesh"].copy() + m.apply_transform(Di) + all_mesh_copies.append((f"{info['name']}_f{i:06d}", m)) + + + # 处理抓取点(grasp_point),生成球体mesh,并按红色加入到 ply 和 glb 中 + manipulated_oid = scene["manipulated_oid"] + grasp_point = np.array(scene["objects"][manipulated_oid]["grasp_point"]) + grasp_point_radius = 0.01 # 球体半径可调 + + # 创建抓取点球体mesh,并移动到抓取点位置 + grasp_point_mesh = trimesh.creation.icosphere(subdivisions=3, radius=grasp_point_radius) + grasp_point_mesh.apply_translation(grasp_point) + + + grasp_color = np.array([0, 0, 255], dtype=np.uint8) # 蓝色 + grasp_pts = grasp_point_mesh.vertices + grasp_cols = np.tile(grasp_color, (grasp_pts.shape[0], 1)) + if encode_time_in_alpha: + grasp_cols = np.hstack([grasp_cols, np.full((grasp_pts.shape[0], 1), 255, dtype=np.uint8)]) + pts_all_list.append(grasp_pts) + cols_all_list.append(grasp_cols) + + grasp_direction = np.array(scene["objects"][manipulated_oid]["grasp_direction"], dtype=np.float32) + if grasp_direction.shape == (3, 3): + grasp_direction = grasp_direction[:, 0] + grasp_direction = grasp_direction.reshape(-1) + if grasp_direction.shape[0] != 3 or np.linalg.norm(grasp_direction) < 1e-6: + print(f"[WARN][{key}] Invalid grasp_direction recorded; defaulting to +Z.") + grasp_direction = np.array([0.0, 0.0, 1.0], dtype=np.float32) + else: + grasp_direction = grasp_direction / np.linalg.norm(grasp_direction) + grasp_direction_length = 0.1 # 箭头长度 + grasp_direction_mesh = _create_arrow_mesh(grasp_point, grasp_direction, grasp_direction_length, radius=0.01) + grasp_direction_pts = grasp_direction_mesh.vertices + grasp_direction_cols = np.tile(grasp_color, (grasp_direction_pts.shape[0], 1)) + if encode_time_in_alpha: + grasp_direction_cols = np.hstack([grasp_direction_cols, np.full((grasp_direction_pts.shape[0], 1), 255, dtype=np.uint8)]) + pts_all_list.append(grasp_direction_pts) + cols_all_list.append(grasp_direction_cols) + + # GLB文件也加入mesh + if save_glb_all: + all_mesh_copies.append(("grasp_point", grasp_point_mesh)) + + # 重新聚合所有点和颜色再写PLY + pts_all = np.vstack(pts_all_list) + cols_all = np.vstack(cols_all_list) + + ply_path = out_dir / f"object_{traj_key}.ply" + point_cloud = trimesh.points.PointCloud(vertices=pts_all, colors=cols_all) + point_cloud.export(str(ply_path)) + print(f"[{key}][PLY] aggregated: {ply_path} (pts={pts_all.shape[0]:,})") + + + # write single GLB (optional) + glb_path = None + if save_glb_all: + glb_path = out_dir / "scene_all_frames.glb" + 
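+ # The aggregated GLB stores one transformed copy of every object mesh per sampled frame, so file size grows with n_frames x n_objects.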
save_scene_glb_all(bg_mesh, all_mesh_copies, glb_path) + print(f"[{key}][GLB] aggregated: {glb_path} (meshes={len(all_mesh_copies) + 1:,} incl. background)") + + return {"ply": ply_path, "glb": glb_path, "frames": int(n_frames), "points": int(len(pts_all))} + +def visualize_scene(keys): + for key in keys: + visualize_single_scene(key) + print(f"[done] key={key}") + print('[Info] Scene visualization completed.') + +# ---------------- main (batch over config keys) ---------------- +def parse_args(): + ap = argparse.ArgumentParser() + ap.add_argument("--stride", type=int, default=5, help="frame stride (1 = all frames)") + ap.add_argument("--save_glb_all", action="store_true", help="export a single GLB containing all frames") + ap.add_argument("--encode_time_in_alpha", action="store_true", help="encode time as alpha in PLY colors") + ap.add_argument("--mesh_key", type=str, default="optimized", help="object mesh field name (e.g., fdpose or optimized)") + ap.add_argument("--traj_key", type=str, default="hybrid_trajs", help="trajectory field name (e.g., fdpose_trajs/simple_trajs/hybrid_trajs)") + return ap.parse_args() + +def main(): + args = parse_args() + base = Path.cwd() + cfg = yaml.safe_load((base / "config" / "config.yaml").open("r")) + keys = cfg["keys"] + + for key in keys: + print(f"========== [visualize object trajectories] key: {key} ==========\n") + visualize_single_scene( + key=key, + out_dir=None, # default -> outputs//debug + bg_samples=60000, + obj_samples=12000, + bg_voxel=0.01, + obj_voxel=0.005, + stride=args.stride, + save_glb_all=args.save_glb_all, + encode_time_in_alpha=args.encode_time_in_alpha, + mesh_key=args.mesh_key, + traj_key=args.traj_key, + ) + print(f"[done] key={key}") + +if __name__ == "__main__": + main() diff --git a/openreal2sim/reconstruction/modules/background_mesh_generation.py b/openreal2sim/reconstruction/modules/background_mesh_generation.py index bcb014d..2e0d85a 100755 --- a/openreal2sim/reconstruction/modules/background_mesh_generation.py +++ b/openreal2sim/reconstruction/modules/background_mesh_generation.py @@ -16,8 +16,7 @@ "background": { "original": # original background mesh path, }, - "groundplane_in_cam": { - "point": # a point on the ground plane [x,y,z], +b "point": # a point on the ground plane [x,y,z], "normal": # the normal of the ground plane [x,y,z], } } @@ -32,6 +31,18 @@ from PIL import Image # ──────────────────── util ───────────────────── +def depth_to_pointcloud(depth, intrinsic): + H, W = depth.shape + u, v = np.meshgrid(np.arange(W), np.arange(H)) + u = u.reshape(-1) + v = v.reshape(-1) + depth = depth.reshape(-1) + pointcloud = np.zeros((len(depth), 3)) + pointcloud[:, 0] = (u - intrinsic[0, 2]) * depth / intrinsic[0, 0] + pointcloud[:, 1] = (v - intrinsic[1, 2]) * depth / intrinsic[1, 1] + pointcloud[:, 2] = depth + return pointcloud + def get_boundary_edges(mesh: trimesh.Trimesh): edges = np.vstack([ mesh.faces[:, [0, 1]], @@ -164,10 +175,12 @@ def visualize_mesh_thickness_with_pointmap(scene_dir: Path, o3d.io.write_point_cloud(str(out_file), pcd) print(f"✓ thickness debug cloud saved: {out_file}") + # ──────────────────── core ───────────────────── def background_mesh_generation(keys, key_scene_dicts, key_cfgs): base_dir = Path.cwd() for key in keys: + print(f"[Info] Processing {key}...\n") scene_dict = key_scene_dicts[key] cfg = key_cfgs[key] recon_dir = base_dir / f'outputs/{key}/reconstruction' @@ -190,7 +203,7 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): pcd = o3d.geometry.PointCloud() pcd.points 
= o3d.utility.Vector3dVector(plane_pts) plane_model, _ = pcd.segment_plane(distance_threshold=0.01, - ransac_n=3, num_iterations=1000) + ransac_n=10, num_iterations=2000) a, b, c, d = plane_model normal = np.array([a, b, c], np.float64) normal /= (np.linalg.norm(normal) + 1e-12) @@ -205,11 +218,12 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): p0 = plane_pts.mean(axis=0) signed = np.median((obj_pts - p0) @ normal) if signed < 0: + print(f"[Info] Flipping normal direction for {key} because the median signed distance is negative") normal = -normal save_plane_normal_path = recon_dir / "ground_normal.ply" visualize_plane_normal(plane_pts, normal, save_plane_normal_path, - num_arrow_pts=200, normal_length=0.5) + num_arrow_pts=200, normal_length=0.5) # background mesh thicken direction is opposite to ground normal (downward) direction = -normal @@ -227,7 +241,7 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): orig_F = 2 * (H - 1) * (W - 1) new_F = 2 * (H2 - 1) * (W2 - 1) print(f"[i] simplified grid: ({H},{W}) -> ({H2},{W2}) " - f"faces ~ {orig_F} -> {new_F}") + f"faces ~ {orig_F} -> {new_F}") # construct mesh verts = pmap_s[..., :3].reshape(-1, 3) @@ -236,18 +250,28 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): base = i * W2 for j in range(W2 - 1): faces += [[base + j, base + j + 1, base + j + W2], - [base + j + 1, base + j + W2 + 1, base + j + W2]] + [base + j + 1, base + j + W2 + 1, base + j + W2]] faces = np.asarray(faces, dtype=np.int32) uu, vv = np.meshgrid(np.linspace(0, 1, W2, dtype=np.float32), - np.linspace(1, 0, H2, dtype=np.float32)) + np.linspace(1, 0, H2, dtype=np.float32)) uv = np.stack([uu, vv], -1).reshape(-1, 2) mesh = trimesh.Trimesh(vertices=verts, faces=faces, process=False) + + face_normals = mesh.face_normals + if len(face_normals) > 0: + dot_products = np.dot(face_normals, normal) + avg_dot = np.mean(dot_products) + if avg_dot < 0: + print("[Info] Flipping face orientation to align with ground normal") + mesh.faces = np.flip(mesh.faces, axis=1) + mesh.visual = trimesh.visual.TextureVisuals(uv=uv, image=img) # flipping mesh normals for correct texturing - mesh.faces = mesh.faces[:, ::-1] + if signed > 0: + mesh.faces = mesh.faces[:, ::-1] thickness = cfg["bg_mesh_thickness"] mesh_thick = add_thickness_below_mesh_preserve_texture(mesh, thickness, direction) @@ -272,6 +296,7 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): pickle.dump(scene_dict, f) print(f"[Info] [{key}] scene_dict updated.") + return key_scene_dicts diff --git a/openreal2sim/reconstruction/modules/background_pixel_inpainting.py b/openreal2sim/reconstruction/modules/background_pixel_inpainting.py index d2601b8..4798a8b 100755 --- a/openreal2sim/reconstruction/modules/background_pixel_inpainting.py +++ b/openreal2sim/reconstruction/modules/background_pixel_inpainting.py @@ -30,10 +30,23 @@ base_dir = Path.cwd() repo_dir = str(base_dir / 'third_party/ObjectClear') sys.path.append(repo_dir) - from objectclear.pipelines import ObjectClearPipeline from objectclear.utils import resize_by_short_side +# ---------------- add sam to path ---------------- +sam_dir = base_dir / "third_party/Grounded-SAM-2" +sys.path.append(str(sam_dir)) + + + +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + +def dilate_mask(mask, kernel_size=3): + return cv2.dilate(mask, np.ones((kernel_size, kernel_size), np.uint8), iterations=1) + 
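+ # dilate_mask() grows the removal mask with a square kernel so ObjectClear also repaints a margin around each object (shadows, boundary pixels); kernel_size=10 is applied below before resizing to the 512-px short side.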
+ def background_pixel_inpainting(keys, key_scene_dicts, key_cfgs): # hyperparameters @@ -120,6 +133,8 @@ def background_pixel_inpainting(keys, key_scene_dicts, key_cfgs): # Model was trained on 512 short side image = resize_by_short_side(image, 512, resample=Image.BICUBIC) + mask = dilate_mask(np.array(mask), kernel_size=10) + mask = Image.fromarray(mask) mask = resize_by_short_side(mask, 512, resample=Image.NEAREST) w, h = image.size @@ -143,14 +158,51 @@ def background_pixel_inpainting(keys, key_scene_dicts, key_cfgs): # Save as background.jpg in the scene folder save_path = os.path.join(result_root, 'background.jpg') fused_img_pil.save(save_path) - - # Save in scene_dict + + # ---------- run Plane Segmentation ---------- + TEXT = "ground. plane. table top. desk top" + CFG = "configs/sam2.1/sam2.1_hiera_l.yaml" + CKPT = base_dir / "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + img_predictor = SAM2ImagePredictor(build_sam2(CFG, CKPT)) + dino_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base") + dino_model = AutoModelForZeroShotObjectDetection.from_pretrained( + "IDEA-Research/grounding-dino-base").to(device) + + batch = dino_proc(images=[fused_img_pil], text = TEXT, return_tensors="pt") + batch={k:v.to(device) for k,v in batch.items()} + with torch.no_grad(): + out=dino_model(pixel_values=batch["pixel_values"], + input_ids=batch["input_ids"], + attention_mask=batch.get("attention_mask")) + boxes = dino_proc.post_process_grounded_object_detection(out,batch["input_ids"], + .25,.3,target_sizes=[np.asarray(fused_img_pil).shape[:2]])[0]["boxes"].cpu().numpy() + plane_masks = [] + for box in boxes: + img_predictor.set_image(fused_img_pil) + m,*_=img_predictor.predict(box=box,multimask_output=False) + m = m[0] >.5 + if np.sum(m) > 0: + plane_masks.append(m) + if plane_masks: + mask_shape = plane_masks[0].shape + plane_mask_img = np.zeros(mask_shape, dtype=np.uint8) + for idx, plane_mask in enumerate(plane_masks): + plane_mask_img[plane_mask > 0] = idx + 1 + max_idx = len(plane_masks) + plane_mask_img_vis = cv2.applyColorMap( + (plane_mask_img * (255 // max(1,max_idx))).astype(np.uint8), cv2.COLORMAP_JET + ) + plane_mask_img_vis[plane_mask_img == 0] = [0,0,0] + out_path = os.path.join(result_root, "plane_mask.jpg") + cv2.imwrite(out_path, plane_mask_img_vis) + if "recon" not in scene_dict: scene_dict["recon"] = {} scene_dict["recon"]["background"] = np.ascontiguousarray(np.array(fused_img_pil, dtype=np.uint8)) scene_dict["recon"]["foreground"] = np.ascontiguousarray(np.array(image_orig, dtype=np.uint8)) scene_dict["recon"]["ground_mask"] = ground_accum if ground_accum is not None else None # H x W, bool scene_dict["recon"]["object_mask"] = mask_accum if mask_accum is not None else None # H x W, bool + scene_dict["recon"]["plane_mask"] = plane_mask_img key_scene_dicts[key] = scene_dict with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: pickle.dump(scene_dict, f) diff --git a/openreal2sim/reconstruction/modules/background_point_inpainting.py b/openreal2sim/reconstruction/modules/background_point_inpainting.py index 70476df..0ed7c2d 100755 --- a/openreal2sim/reconstruction/modules/background_point_inpainting.py +++ b/openreal2sim/reconstruction/modules/background_point_inpainting.py @@ -43,6 +43,21 @@ def dilate_mask(binary_mask: np.ndarray, pixels: int = 4, shape: str = "ellipse" out = cv2.dilate(binary_mask.astype(np.uint8), k, iterations=1) return out.astype(bool) +def fill_mask(mask: np.ndarray) -> np.ndarray: + """ + Fill the mask with 
the flood fill algorithm. + """ + mask = mask.astype(np.uint8) + filled = mask.copy() + inv_mask = 1 - mask + h, w = mask.shape + floodfilled = inv_mask.copy() + floodfilled_pad = np.pad(floodfilled, 1, mode='constant', constant_values=0) + cv2.floodFill(floodfilled_pad, None, (0,0), 255) + floodfilled_nohole = floodfilled_pad[1:-1,1:-1] + out_mask = ((floodfilled_nohole == 0) | (mask == 1)).astype(bool) + return out_mask + # ─────────────────────────── Depth prediction ────────────────────────── def run_moge_depth(img_rgb: np.ndarray, device: torch.device) -> np.ndarray: """MoGe-2 inference (unchanged). Returns float32 depth (H,W).""" @@ -100,23 +115,35 @@ def robust_scale_shift_align( return float(a), float(b) def depth_to_points(depth: np.ndarray, K: np.ndarray) -> np.ndarray: + """ + Convert a depth map to 3D points and exclude pixels with zero or non-finite depth. + Returns an (N,3) array containing only valid points (z>0 and finite). + """ H, W = depth.shape - fx, fy, cx, cy = K[0,0], K[1,1], K[0,2], K[1,2] + fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2] ii, jj = np.meshgrid(np.arange(W), np.arange(H), indexing="xy") - z = depth.reshape(-1) + z = depth.reshape(-1).astype(np.float32) x = (ii.reshape(-1) - cx) * z / fx y = (jj.reshape(-1) - cy) * z / fy - return np.stack([x, y, z], 1) + pts = np.stack([x, y, z], axis=1) + valid = (z > 0) & (z < np.inf) + pts[~valid] = np.nan + return pts + + def plane_fill(depth0, K, ground, obj): H,W=depth0.shape; fx,fy,cx,cy=K[0,0],K[1,1],K[0,2],K[1,2] - # fit plane on ground region pts_ground = depth_to_points(depth0, K)[ground.ravel()] - pc = o3d.geometry.PointCloud(); pc.points = o3d.utility.Vector3dVector(pts_ground) - plane,_ = pc.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000) + + + pc = o3d.geometry.PointCloud() + pc.points = o3d.utility.Vector3dVector(pts_ground) + + np.random.seed(42) + plane , _= pc.segment_plane(distance_threshold=0.02, ransac_n=3, num_iterations=1000) a,b,c,d = plane - # intersect rays with plane inside the object mask ys,xs = np.where(obj) dx = (xs - cx) / fx; dy = (ys - cy) / fy; dz = np.ones_like(dx) @@ -128,9 +155,60 @@ def plane_fill(depth0, K, ground, obj): z_new = t[valid] * dz[valid] depth0_filled = depth0.copy() - depth0_filled[ys[valid], xs[valid]] = np.clip(z_new, 0, depth0.max()) + + depth0_filled[ys[valid], xs[valid]] = np.clip(z_new, 0, 5 * depth0.max()) return depth0_filled +def hybrid_fill(depth0: np.ndarray, fg_img: np.ndarray, bg_img: np.ndarray, K: np.ndarray, + obj_msk: np.ndarray, device: torch.device, + plane_mask: np.ndarray = None) -> np.ndarray: + """ + Generate depth using hybrid approach: + - Ground regions use plane fill + - Other regions use MoGe depth prediction + + Args: + depth0: Original depth map + bg_img: Inpainted background image + K: Camera intrinsic matrix + obj_msk: Object mask + device: PyTorch device + existing_ground_mask: Existing ground mask to sample points from (H, W) + + Returns: + depth_bg: Generated background depth + """ + print(f"[Info] Starting hybrid depth generation...") + depth_bg = depth0.copy() + kernel = np.ones((25, 25), np.uint8) # prevent boundaries problem. 
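+ # Dilate the object mask by ~25 px so the filled region extends slightly past the original object silhouette and no stale foreground depth survives at the mask edge.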
+ obj_msk = cv2.dilate(obj_msk.astype(np.uint8), kernel, iterations=1).astype(bool) + left_obj_msk = obj_msk.copy() + plane_masks = [] + for plane_idx in range(1, plane_mask.max()): + plane_mask = (plane_mask == plane_idx).astype(bool) + plane_masks.append(plane_mask) + for plane_mask in plane_masks: + print(f"[Info] plane mask: {plane_mask.sum()}, plane mask and object mask: {(plane_mask & left_obj_msk).sum()}") + if (plane_mask & left_obj_msk).sum() > 0: + #depth_filled = plane_fill(depth0, K, plane_mask, left_obj_msk & plane_mask) + depth_filled = plane_fill(depth0, K, plane_mask, left_obj_msk & plane_mask) + depth_bg[plane_mask & left_obj_msk] = depth_filled[plane_mask & left_obj_msk] + left_obj_msk = left_obj_msk & ~plane_mask + depth_moge = run_moge_depth(bg_img, device) + + if left_obj_msk.sum() > 0: + a, b = robust_scale_shift_align( + pred_depth=depth_moge, + ref_depth=depth0, + mask= ~obj_msk.astype(np.uint8), + iters=5, + huber_delta=0.02 + ) + depth_align = (a * depth_moge + b).astype(np.float32) + depth_bg[left_obj_msk] = depth_align[left_obj_msk] + + return depth_bg + def export_cloud(pts: np.ndarray, colors: np.ndarray, out_path: Path): pts = pts.reshape(-1,3) @@ -140,6 +218,10 @@ def export_cloud(pts: np.ndarray, colors: np.ndarray, out_path: Path): pcd.colors = o3d.utility.Vector3dVector(colors) o3d.io.write_point_cloud(str(out_path), pcd) +def clear_nan(pts: np.ndarray, colors: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + valid_mask = np.isfinite(pts).all(axis=1) + return pts[valid_mask], colors[valid_mask] + def background_point_inpainting(keys, key_scene_dicts, key_cfgs): for key in tqdm.tqdm(keys): @@ -158,27 +240,28 @@ def background_point_inpainting(keys, key_scene_dicts, key_cfgs): if "ground_mask" not in scene_dict["recon"] or "object_mask" not in scene_dict["recon"]: print(f"[Warning] [{key}] 'recon' key missing 'ground_mask' or 'object_mask'; run background_pixel_inpainting first.") continue - - ground_mask = scene_dict["recon"]["ground_mask"] # H x W, bool + ground_mask = scene_dict["recon"]["ground_mask"] + plane_mask = scene_dict["recon"]["plane_mask"] if scene_dict["recon"]["plane_mask"] is not None else None object_mask = scene_dict["recon"]["object_mask"] # H x W, bool - # dilate the masks a bit more to remove boundary outliers - object_mask = dilate_mask(object_mask, pixels=key_cfg["obj_dilate_pixels"], shape="ellipse") + # dilate the masks a bit more to remove boundary outliers + object_mask = fill_mask(object_mask) ground_mask = dilate_mask(ground_mask, pixels=key_cfg["ground_dilate_pixels"], shape="ellipse") fg_img = scene_dict["recon"]["foreground"] # H x W x 3, uint8 bg_img = scene_dict["recon"]["background"] # H x W x 3, uint8 depth0 = scene_dict["depths"][0] # H x W, float32 K = scene_dict["intrinsics"].astype(np.float32) # 3 x 3, float32 H, W = depth0.shape - + gpu_id = key_cfg["gpu"] + device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") completion_mode = key_cfg["bg_completion_mode"] if completion_mode == "plane": # assume the masked region is the ground, and using ground plane for geometry completion depth_bg = plane_fill(depth0, K, ground_mask, object_mask) + elif completion_mode == "hybrid" and plane_mask is not None: + depth_bg = hybrid_fill(depth0, fg_img, bg_img, K, object_mask, device, plane_mask) else: # use monocular depth prediction for geometry completion print(f"[Info] [{key}] predicting depth with MoGe-2 ...") - gpu_id = key_cfg["gpu"] - device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() 
else "cpu") depth_pd = run_moge_depth(bg_img, device) # align the depth scale of current bg depth prediction and original image depth prediction @@ -197,11 +280,14 @@ def background_point_inpainting(keys, key_scene_dicts, key_cfgs): scene_dict["recon"]["bg_depth"] = depth_bg.astype(np.float32) scene_dict["recon"]["fg_depth"] = depth0.astype(np.float32) bg_pts = depth_to_points(depth_bg, K) + bg_colors = bg_img.reshape(-1,3) / 255. bg_color_pts = np.concatenate([bg_pts.reshape(H,W,3), bg_colors.reshape(H,W,3)], -1) fg_pts = depth_to_points(depth0, K) fg_colors = fg_img.reshape(-1,3) / 255. fg_color_pts = np.concatenate([fg_pts.reshape(H,W,3), fg_colors.reshape(H,W,3)], -1) + bg_pts, bg_colors = clear_nan(bg_pts, bg_colors) + fg_pts, fg_colors = clear_nan(fg_pts, fg_colors) scene_dict["recon"]["bg_pts"] = bg_color_pts.astype(np.float32) # (H, W, 6) xyzrgb rgb in [0,1] scene_dict["recon"]["fg_pts"] = fg_color_pts.astype(np.float32) # (H, W, 6) xyzrgb rgb in [0,1] key_scene_dicts[key] = scene_dict @@ -212,6 +298,7 @@ def background_point_inpainting(keys, key_scene_dicts, key_cfgs): export_cloud(fg_pts, fg_colors, recon_dir / "foreground_points.ply") export_cloud(bg_pts, bg_colors, recon_dir / "background_points.ply") print(f"[Info] [{key}] geometry inpainting done.\n") + return key_scene_dicts if __name__ == "__main__": diff --git a/openreal2sim/reconstruction/modules/object_mesh_generation.py b/openreal2sim/reconstruction/modules/object_mesh_generation.py index fd1d3de..bd336bc 100755 --- a/openreal2sim/reconstruction/modules/object_mesh_generation.py +++ b/openreal2sim/reconstruction/modules/object_mesh_generation.py @@ -27,20 +27,166 @@ import numpy as np from PIL import Image import yaml - +import torch +import cv2 base_dir = Path.cwd() output_dir = base_dir / "outputs" repo_dir = str(base_dir / 'third_party/Hunyuan3D-2') sys.path.append(repo_dir) -# --- Switched from Trellis to Hunyuan3D --- + from hy3dgen.rembg import BackgroundRemover from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline from hy3dgen.texgen import Hunyuan3DPaintPipeline from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover +base_dir = Path.cwd() +repo_dir = str(base_dir / 'third_party/ObjectClear') +sys.path.append(repo_dir) +from objectclear.pipelines import ObjectClearPipeline +from objectclear.utils import resize_by_short_side + + +sys.path.append(str(base_dir / 'third_party/Grounded-SAM-2')) +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor + +# TODO: super-resolution # ------------------------------------------------------------------ +def object_stacking(single_scene_dict): + objects = load_obj_masks(single_scene_dict['mask']) + # objects is a list of dicts, convert it into a dict mapping oid to object dict for fast access. 
+ objects_by_oid = {obj['oid']: obj for obj in objects} + if not objects or len(objects) <= 1: + # Nothing to stack + stacking = {oid: [] for oid in objects_by_oid} + return stacking + + K = single_scene_dict["intrinsics"] + depth = single_scene_dict["depths"][0] + H, W = depth.shape + ground = single_scene_dict["info"]["groundplane_in_cam"] + g_point = np.array(ground["point"]).reshape(3) + g_normal = np.array(ground["normal"]).reshape(3) + g_normal = g_normal/np.linalg.norm(g_normal) + + # Camera center calculation: assume pinhole at [0,0,0] + cam_center = np.zeros(3) + + + stacking_normal = g_normal / np.linalg.norm(g_normal) + + cam_ground_sign = np.sign(g_normal[2]) + + def depth_to_points(mask): + # Project 2D pixel to camera coordinates + ys, xs = np.where(mask) + ds = depth[ys, xs] + fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2] + xs_ = (xs - cx) * ds / fx + ys_ = (ys - cy) * ds / fy + return np.stack([xs_, ys_, ds], axis=1) # (N, 3) + + def dilate_mask(mask, radius=1): + mask = cv2.dilate(mask.astype(np.uint8), np.ones((radius, radius), np.uint8)) + return mask.astype(bool) + + # Compute "height from ground" for each object (median, robust to outliers) + heights = {} + obj_cloud = {} + for idx, obj in enumerate(objects): + oid = obj['oid'] + msk = obj['mask'] + pts = depth_to_points(msk) + obj_cloud[oid] = pts + if pts.shape[0] == 0: + heights[oid] = float("nan") + continue + vecs = pts - g_point # (N,3) + hts = vecs @ g_normal # project to normal + heights[oid] = np.median(hts) + + stacking = {obj['oid']: [] for obj in objects} + + # Compute mask overlap between objects + for idx, obj_a in enumerate(objects): + oid_a = obj_a['oid'] + orig_mask_a = np.array(obj_a["mask"]).astype(bool) + mask_a = dilate_mask(orig_mask_a, radius=10) + if not np.any(mask_a): + continue + for idx_b, obj_b in enumerate(objects): + oid_b = obj_b['oid'] + if oid_a == oid_b: + continue + orig_mask_b = np.array(obj_b["mask"]).astype(bool) + mask_b = dilate_mask(orig_mask_b, radius=10) + if not np.any(mask_b): + continue + overlap_mask = mask_a & mask_b + if not np.any(overlap_mask): + continue + # Heights at overlap + pts_a = depth_to_points(orig_mask_a) + pts_b = depth_to_points(orig_mask_b) + if pts_a.shape[0] == 0 or pts_b.shape[0] == 0: + continue + h_a = pts_a @ g_normal + h_b = pts_b @ g_normal + h_a_val = np.median(h_a) + h_b_val = np.median(h_b) + + # Stacked-on rule: + # If cam_ground_sign < 0 (camera is above), higher objects (along normal) are in front, thus "on top" + # If cam_ground_sign > 0 (camera is below), lower (along normal) are in front, thus "on top" + obj_a_name = obj_a["name"] + obj_b_name = obj_b["name"] + if np.isfinite(h_a_val) and np.isfinite(h_b_val): + if cam_ground_sign < 0: + # camera above ground & objects: higher-along-normal overlays lower + print("Camera is above ground") + if h_a_val > h_b_val + 0.01: + stacking[oid_b].append(oid_a) + print(f"Object {obj_a_name} is stacked on {obj_b_name}") + print(f"Object {obj_a_name} occludes {obj_b_name}") + else: + # camera below: lower-along-normal overlays higher + print("Camera is below ground") + if h_a_val < h_b_val - 0.01: + stacking[oid_b].append(oid_a) + print(f"Object {obj_b_name} is stacked on {obj_a_name}") + print(f"Object {obj_a_name} occludes {obj_b_name}") + + # Remove duplicates, sort + for oid in stacking: + stacking[oid] = list(sorted(set(stacking[oid]))) + + return stacking + + + + + +def clear_object(orig_img, orig_mask, whole_mask): + # Compute bbox from orig_mask + ys, xs = np.where(orig_mask) + if 
ys.size == 0 or xs.size == 0: + raise ValueError("orig_mask contains no foreground pixels") + x1, y1, x2, y2 = xs.min(), ys.min(), xs.max(), ys.max() + margin = int(0.1 * max(x2 - x1, y2 - y1)) + # expand bbox by margin, clamp to image boundaries + h, w = orig_img.shape[0], orig_img.shape[1] + x1 = max(x1 - margin, 0) + y1 = max(y1 - margin, 0) + x2 = min(x2 + margin, w - 1) + y2 = min(y2 + margin, h - 1) + crop_img = orig_img[y1:y2+1, x1:x2+1] + orig_mask = orig_mask[y1:y2+1, x1:x2+1] + new_mask = whole_mask[y1:y2+1, x1:x2+1] + new_mask = ~orig_mask & new_mask + + return crop_img, new_mask, orig_mask def save_object_png(orig_img: Image.Image, mask: np.ndarray, @@ -119,6 +265,32 @@ def object_mesh_generation(keys, key_scene_dicts, key_cfgs): 'tencent/Hunyuan3D-2', subfolder='hunyuan3d-paint-v2-0' ) rembg = BackgroundRemover() + + # hyperparameters + USE_FP16 = True + SEED = 42 + NUM_STEPS = 20 + STRENGTH = 0.99 + GUIDANCE_SCALE = 2.5 + + # Set up ObjectClear pipeline once + torch_dtype = torch.float16 if USE_FP16 else torch.float32 + variant = "fp16" if USE_FP16 else None + gpu_id = key_cfgs[keys[0]]["gpu"] # it has to be running on the same GPU + device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu') + generator = torch.Generator(device=device).manual_seed(SEED) + pipe = ObjectClearPipeline.from_pretrained_with_custom_modules( + "jixin0101/ObjectClear", + torch_dtype=torch_dtype, + apply_attention_guided_fusion=True, + cache_dir=None, + variant=variant, + ).to(device) + + sam_weights_path = "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + sam_cfg_path = "configs/sam2.1/sam2.1_hiera_l.yaml" + sam_predictor = SAM2ImagePredictor(build_sam2(sam_cfg_path, sam_weights_path)) + sam_predictor.model.to(device) for key in keys: print(f"[Info] Processing {key}...\n") @@ -135,16 +307,69 @@ def object_mesh_generation(keys, key_scene_dicts, key_cfgs): # fixed seed kept (not used internally by these calls but preserved for parity) seed = random.randint(0, 99999) # generate object mesh for each object + stacking = object_stacking(scene_dict) for idx, item in enumerate(objs): mask = item['mask'].astype(bool) name = item['name'] stem = f"{item['oid']}_{name}" png_path = out_dir / f"{stem}.png" - + regenerated_path = out_dir / f"{stem}_regenerated.png" + cut_mask_path = out_dir / f"{stem}_cut_mask.png" # 1) save transparent PNG - save_object_png(orig_img, mask, png_path) - print(f"[Info] [{key}] saved crop → {png_path}") - + if len(stacking[item['oid']]) > 0: + print(f"[Info] processing stacking for object {name}") + stack_mask = np.zeros_like(mask) + for oid in stacking[item['oid']]: + for obj in objs: + if obj["oid"] == oid: + break + stack_mask |= np.array(obj["mask"]).astype(bool) + crop_img, new_mask, orig_mask = clear_object(np.asarray(orig_img), mask, stack_mask) + crop_img = Image.fromarray(crop_img,mode="RGB") + crop_img = resize_by_short_side(crop_img, 512, resample=Image.BICUBIC) + new_mask = resize_by_short_side(Image.fromarray(new_mask.astype(np.uint8) * 255), 512, resample=Image.NEAREST) + new_mask.save(cut_mask_path) + orig_mask = resize_by_short_side(Image.fromarray(orig_mask.astype(np.uint8) * 255), 512, resample=Image.NEAREST) + + new_img = pipe( + prompt="remove the instance of object", + image=crop_img, + mask_image=new_mask, + generator=generator, + num_inference_steps=NUM_STEPS, + strength=STRENGTH, + guidance_scale=GUIDANCE_SCALE, + height=crop_img.height, + width=crop_img.width, + ) + crop_img = new_img.images[0] + 
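+ # Re-segment the de-occluded object in the inpainted crop: prompt SAM2 with up to 8 points sampled from the original object mask, then save the resulting RGBA crop for Hunyuan3D.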
+                sam_predictor.set_image(np.asarray(crop_img))
+                # sample prompt points from the original object mask
+                orig_mask_np = np.array(orig_mask).astype(bool)
+                yx_locs = np.argwhere(orig_mask_np)
+                num_points = min(8, len(yx_locs))
+                if num_points > 0:
+                    selected_indices = np.random.choice(len(yx_locs), num_points, replace=False)
+                    sampled_points = yx_locs[selected_indices]
+                    point_coords = np.fliplr(sampled_points).copy()
+                    point_labels = np.ones(num_points, dtype=int)
+                else:
+                    point_coords = None
+                    point_labels = None
+
+                new_mask, scores, logits = sam_predictor.predict(
+                    point_coords=point_coords,
+                    point_labels=point_labels,
+                    box=None,
+                    multimask_output=False
+                )
+                new_mask = new_mask[0].astype(bool)
+                crop_img.save(regenerated_path)
+                save_object_png(crop_img, new_mask, png_path)
+                print(f"[Info] [{key}] saved crop → {png_path}")
+            else:
+                save_object_png(orig_img, mask, png_path)
+                print(f"[Info] [{key}] saved crop → {png_path}")
 
             # 2) Hunyuan3D shape + texture
             img_rgba = Image.open(png_path).convert("RGBA")
             if img_rgba.mode == 'RGB':  # fallback: ensure RGBA
diff --git a/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py b/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py
index accf4ae..634962b 100755
--- a/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py
+++ b/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py
@@ -140,8 +140,7 @@ def propagate_maks(segmented_video: object, output_directory: Path):
 def mask_propagation(keys, key_scene_dicts):
     for key in keys:
         print("propagating for", key)
-        with open(OUT_ROOT/key/"scene/scene.pkl", "rb") as f:
-            scene = pickle.load(f)
+        scene = key_scene_dicts[key]
         propagate_maks(scene, OUT_ROOT/key)
         key_scene_dicts[key] = scene
     return key_scene_dicts
@@ -152,5 +151,10 @@ def mask_propagation(keys, key_scene_dicts):
     cfg_path = base_dir / "config" / "config.yaml"
     cfg = yaml.safe_load(cfg_path.open("r"))
     keys = cfg["keys"]
-    mask_propagation(keys)
+    key_scene_dicts = {}
+    for key in keys:
+        with open(OUT_ROOT/key/"scene/scene.pkl", "rb") as f:
+            scene = pickle.load(f)
+        key_scene_dicts[key] = scene
+    mask_propagation(keys, key_scene_dicts)
\ No newline at end of file
diff --git a/scripts/installation/install.py b/scripts/installation/install.py
index 6ac9f5c..66591a7 100644
--- a/scripts/installation/install.py
+++ b/scripts/installation/install.py
@@ -27,6 +27,20 @@ def download_file(url, destination):
     urlretrieve(url, destination)
     print("Download complete.")
 
+def download_with_gdown(file_id, destination):
+    """Downloads a file from Google Drive using gdown."""
+    dest_dir = os.path.dirname(destination)
+    print(f"Ensuring directory exists: {dest_dir}")
+    os.makedirs(dest_dir, exist_ok=True)
+    file_name = os.path.basename(destination)
+    print(f"Downloading {file_name} from Google Drive (ID: {file_id}) to {destination}...")
+    # Use the gdown CLI to download from Google Drive by file ID
+    run_command(
+        ["gdown", file_id, "-O", destination],
+        cwd=None
+    )
+    print("Download complete.")
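+
+# Usage sketch (hypothetical ID and path, for illustration only):
+#   download_with_gdown("1AbCdEfGhIjKl", "third_party/example/ckpt/model.tar")
+# creates third_party/example/ckpt/ if needed, then runs
+# `gdown 1AbCdEfGhIjKl -O third_party/example/ckpt/model.tar`.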
+
 def main():
     """Main function to set up all dependencies."""
     # Ensure we are in the correct base directory
@@ -34,8 +48,12 @@ def main():
     os.chdir(base_dir)
     print(f"Working directory set to: {os.getcwd()}")
+    # --- Install gdown for Google Drive downloads ---
+    print("\n--- [1/9] Installing gdown for Google Drive downloads ---")
+    run_command([sys.executable, "-m", "pip", "install", "gdown"])
+
     # --- Mega-SAM Dependencies ---
-    print("\n--- [1/6] Setting up Mega-SAM dependencies ---")
+    print("\n--- [2/9] Setting up Mega-SAM dependencies ---")
     download_file(
         "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth",
         "third_party/mega-sam/Depth-Anything/checkpoints/depth_anything_vitl14.pth"
     )
@@ -46,13 +64,13 @@ def main():
     )
 
     # --- Segmentation Dependencies (Grounded-SAM-2) ---
-    print("\n--- [2/6] Downloading Segmentation model ---")
+    print("\n--- [3/9] Downloading Segmentation model ---")
     download_file(
         "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt",
         "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt"
     )
 
-    print("\n--- [3/6] Building Segmentation CUDA extension ---")
+    print("\n--- [4/9] Building Segmentation CUDA extension ---")
     build_cuda_path = os.path.join(base_dir, "third_party/Grounded-SAM-2")
     run_command(
         [sys.executable, "build_cuda.py", "build_ext", "--inplace", "-v"],
@@ -60,7 +78,7 @@ def main():
     )
 
     # --- FoundationPose Dependencies ---
-    print("\n--- [4/6] Downloading FoundationPose weights ---")
+    print("\n--- [5/9] Downloading FoundationPose weights ---")
     fp_weights_dir = os.path.join(base_dir, "third_party/FoundationPose/weights")
     os.makedirs(fp_weights_dir, exist_ok=True)
     snapshot_download(
@@ -69,7 +87,7 @@ def main():
         local_dir=fp_weights_dir
     )
 
-    print("\n--- [5/6] Compiling FoundationPose C++ extension ---")
+    print("\n--- [6/9] Compiling FoundationPose C++ extension ---")
     fp_cpp_build_path = os.path.join(base_dir, "third_party/FoundationPose/mycpp/build")
     run_command(["rm", "-rf", fp_cpp_build_path])
     os.makedirs(fp_cpp_build_path, exist_ok=True)
@@ -79,7 +97,7 @@ def main():
     )
     run_command(["make", "-j11"], cwd=fp_cpp_build_path)
 
-    print("\n--- [6/6] Compiling FoundationPose bundlesdf CUDA extension ---")
+    print("\n--- [7/9] Compiling FoundationPose bundlesdf CUDA extension ---")
     fp_bundlesdf_path = os.path.join(base_dir, "third_party/FoundationPose/bundlesdf/mycuda")
     run_command(["rm", "-rf", "build"], cwd=fp_bundlesdf_path)
     run_command(["rm", "-rf", "*.egg-info"], cwd=fp_bundlesdf_path)
@@ -88,6 +106,32 @@ def main():
         cwd=fp_bundlesdf_path
     )
 
+    # --- WiLoR Dependencies (Hand Extraction) ---
+    print("\n--- [8/9] Downloading WiLoR pretrained models ---")
+    wilor_models_dir = os.path.join(base_dir, "third_party/WiLoR/pretrained_models")
+    os.makedirs(wilor_models_dir, exist_ok=True)
+    download_file(
+        "https://huggingface.co/spaces/rolpotamias/WiLoR/resolve/main/pretrained_models/detector.pt",
+        os.path.join(wilor_models_dir, "detector.pt")
+    )
+    download_file(
+        "https://huggingface.co/spaces/rolpotamias/WiLoR/resolve/main/pretrained_models/wilor_final.ckpt",
+        os.path.join(wilor_models_dir, "wilor_final.ckpt")
+    )
+
+    # --- Grasp Generation Checkpoints ---
+    print("\n--- [9/9] Downloading Grasp Generation checkpoints ---")
+    ckpt_dir = os.path.join(base_dir, "third_party/graspness_unofficial/ckpt")
+    os.makedirs(ckpt_dir, exist_ok=True)
+    download_with_gdown(
+        "10o5fc8LQsbI8H0pIC2RTJMNapW9eczqF",
+        os.path.join(ckpt_dir, "minkuresunet_kinect.tar")
+    )
+    download_with_gdown(
+        "1RfdpEM2y0x98rV28d7B2Dg8LLFKnBkfL",
+        os.path.join(ckpt_dir, "minkuresunet_realsense.tar")
+    )
+
     print("\n\n--- All dependencies set up successfully! ---")
 
 if __name__ == "__main__":