diff --git a/.gitmodules b/.gitmodules index 2277161..c1a2c88 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,15 @@ [submodule "third_party/ObjectClear"] # for object clearing [reconstruction] path = third_party/ObjectClear url = git@github.com:PointsCoder/ObjectClear.git +[submodule "third_party/graspness_unofficial"] # for grasp generation [motion] + path = third_party/graspness_unofficial + url = git@github.com:PointsCoder/graspness_unofficial.git +[submodule "third_party/graspnetAPI"] # for grasp evaluation and utilities [motion] + path = third_party/graspnetAPI + url = git@github.com:PointsCoder/graspnetAPI.git +[submodule "third_party/MinkowskiEngine"] # for sparse convolution operations [motion] + path = third_party/MinkowskiEngine + url = https://github.com/NVIDIA/MinkowskiEngine.git +[submodule "third_party/WiLoR"] # for hand extraction [motion] + path = third_party/WiLoR + url = git@github.com:DynamoDuan/WiLoR.git \ No newline at end of file diff --git a/docker/compose.yml b/docker/compose.yml index 3703950..d5cb736 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -1,6 +1,6 @@ services: # Real To Sim [Master Container] - openreal2sim: + openreal2sim-new: build: context: .. dockerfile: docker/real2sim/Dockerfile diff --git a/docker/real2sim/Dockerfile b/docker/real2sim/Dockerfile index f36a807..caa5dad 100644 --- a/docker/real2sim/Dockerfile +++ b/docker/real2sim/Dockerfile @@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive # ------------------------------------------------------------------------- RUN apt-get update && apt-get install -y --no-install-recommends \ python3 python3-dev python3-pip python3-venv python-is-python3 \ - build-essential cmake ninja-build pkg-config \ + build-essential cmake ninja-build pkg-config libopenblas-dev \ git curl wget ffmpeg bash libspatialindex-dev \ && python3 -m pip install --upgrade --no-cache-dir pip wheel setuptools packaging \ && rm -rf /var/lib/apt/lists/* @@ -46,6 +46,9 @@ RUN pip install --no-cache-dir torch-scatter \ COPY docker/real2sim/requirements.docker.txt /tmp/requirements.docker.txt RUN pip install --no-cache-dir -r /tmp/requirements.docker.txt +# Install chumpy separately (setup.py needs pip module, ensure build deps are available) +RUN pip install --no-build-isolation --no-cache-dir "git+https://github.com/mattloper/chumpy" + # ------------------------------------------------------------------------- # Third-Party compiled Python deps @@ -82,7 +85,18 @@ RUN cd / && wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.t cmake .. &&\ make install +RUN cd / +COPY third_party/graspnetAPI /app/third_party/graspnetAPI +RUN cd /app/third_party/graspnetAPI && pip install -e . + +COPY third_party/graspness_unofficial /app/third_party/graspness_unofficial +RUN cd /app/third_party/graspness_unofficial/pointnet2 && python -m pip install --no-cache-dir -v --no-build-isolation . +RUN cd /app/third_party/graspness_unofficial/knn && python -m pip install --no-cache-dir -v --no-build-isolation . 
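+# (Both extensions above are built with --no-build-isolation so they compile against
+#  the torch/CUDA stack already installed in this image rather than an isolated build env.)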
+ +COPY third_party/MinkowskiEngine /app/third_party/MinkowskiEngine +RUN cd /app/third_party/MinkowskiEngine && python setup.py install --blas=openblas --force_cuda + # ------------------------------------------------------------------------- # Runtime env # ------------------------------------------------------------------------- -ENV PYTHONPATH=/app +ENV PYTHONPATH=/app:/app/third_party:/app/third_party/graspness_unofficial/pointnet2: diff --git a/docker/real2sim/requirements.docker.txt b/docker/real2sim/requirements.docker.txt index 75533f2..c1d4ab5 100644 --- a/docker/real2sim/requirements.docker.txt +++ b/docker/real2sim/requirements.docker.txt @@ -20,6 +20,7 @@ scipy==1.15.3 wandb==0.21.3 gdown + #__________________________________________________________________________________ # 2) Related deps (torch-scatter is separate) ninja==1.11.1 @@ -61,4 +62,27 @@ Rtree # 8) Foundation Pose transformations==2024.6.1 ruamel.yaml==0.18.6 -h5py==3.10.0 \ No newline at end of file +h5py==3.10.0 + +# 9) PromptDA +pyliblzfse +lightning==2.1.3 +termcolor +tyro==0.9.2 +spaces +trimesh + +# 10) WiLoR +pyrender +pytorch-lightning +scikit-image +smplx==0.1.28 +yacs +xtcocotools +pandas +hydra-submitit-launcher +hydra-colorlog +pyrootutils +webdataset +ultralytics==8.1.34 +# Note: chumpy installed separately in Dockerfile due to build isolation issues diff --git a/openreal2sim/motion/README.md b/openreal2sim/motion/README.md new file mode 100644 index 0000000..c892977 --- /dev/null +++ b/openreal2sim/motion/README.md @@ -0,0 +1,136 @@ +544## Motion & Grasp Preparation + +This directory processes human demonstrations into robot-friendly motion goals and grasp priors. +It consumes `outputs//scene/scene.pkl` and `simulation/scene.json` from the reconstruction pipeline and augments them with: + +- frame-aligned hand/keypoint observations +- cleaned trajectories and task metadata +- contact-based grasp points & approach directions +- dense grasp proposals for every object + +This folder operates on `motion` in `scene.pkl`, and `scene.json`. + +The resulting assets are consumed by the simulation agents (`openreal2sim/simulation/*`) for replay, teleoperation, or policy training. + +--- + +## Code Structure + +``` +openreal2sim/motion +├── motion_manager.py +├── modules +│ ├── hand_extraction.py +│ ├── demo_motion_process.py +│ ├── grasp_point_extraction.py +│ └── grasp_generation.py +├── utils +│ ├── grasp_utils.py +│ └── visualize_scene.py +└── README.md +``` + +Each module can be run independently, but `motion_manager.py` provides a stage-aware wrapper similar to `recon_agent.py`. + +--- + +## Workflow + +### 1. Hand Extraction +`modules/hand_extraction.py` + +- Detects hands with YOLO + SAM2, then lifts 3D hand pose using **WiLoR**. +- Records per-frame keypoints, MANO global orientation, and binary masks under `scene_dict["motion"]`. +- These signals define contact windows and guide later grasp localization. + +### 2. Demonstration Motion Processing +`modules/demo_motion_process.py` + +- Reprojects FD-POSE / simple / hybrid trajectories into the world frame. +- Picks the manipulated object by total displacement, classifies remaining objects as static, and decides which trajectory family (`traj_key`) best matches the demonstration. +- Downsample the trajectory, exports a final GLB at the end pose, and writes helper metadata (`manipulated_oid`, `start_frame_idx`, `task_type`, `start/end_related`, `gripper_closed`, etc.) back into `simulation/scene.json`. 
+- Whether a static object is start-related or end-related is determined by whether its x-y bounding box overlaps that of the manipulated object at the corresponding timestep. Start/end-related objects are treated as semantically related to the manipulated object for the task.
+- We currently support three task types: a task is `simple_pick` if the gripper ends closed and there is no end-related object; `simple_pick_place` if the manipulated object is finally placed on the ground with the gripper open and there is no end-related object; and `targetted_pick_place` if end-related objects exist.
+
+
+### 3. Grasp Point & Direction Extraction
+`modules/grasp_point_extraction.py`
+
+- Aligns the manipulated mesh to the selected keyframe, renders it with PyTorch3D, and fuses depth/hand keypoints to locate the contact patch.
+- Projects the contact into object coordinates to obtain `grasp_point` and estimates a world-frame approach direction from MANO orientation.
+- Stores diagnostics (render overlays) under `outputs//simulation/debug/`.
+
+### 4. Grasp Proposal Generation
+`modules/grasp_generation.py`
+
+- Samples dense surface points per object, runs **GraspNet** to produce grasp candidates, and optionally applies NMS + score truncation.
+- Saves raw `.npy` proposals & `.ply` visualizations under `outputs//grasps/`.
+- For the manipulated object, rescores grasps using the contact point/direction hints and writes both `grasps` and `rescored_grasps` paths into `scene.json`.
+- There are three strategies for rescoring grasps:
+  - `default`: use the z-align strategy to rescore grasps.
+  - `point_only`: use only the contact point to rescore grasps.
+  - `point_and_direction`: use both the contact point and the approach direction to rescore grasps.
+
+---
+
+## How to Use
+
+1. **Prerequisites**
+   - Run preprocess + reconstruction so that `outputs//scene/scene.pkl` and `simulation/scene.json` exist.
+   - Ensure third-party weights are in place:
+     - `third_party/WiLoR/pretrained_models` and `third_party/WiLoR/mano_data/MANO_RIGHT.pkl`
+     - `third_party/Grounded-SAM-2/checkpoints`
+     - GraspNet checkpoints
+
+2. **All-in-one pipeline**
+
+```bash
+python openreal2sim/motion/motion_manager.py
+```
+
+Optional flags mirror reconstruction:
+
+```bash
+python openreal2sim/motion/motion_manager.py \
+  --stage grasp_generation \
+  --key demo_video
+```
+
+3. **Stage-specific scripts**
+
+```bash
+python openreal2sim/motion/modules/hand_extraction.py
+
+python openreal2sim/motion/modules/demo_motion_process.py
+
+python openreal2sim/motion/modules/grasp_point_extraction.py
+
+python openreal2sim/motion/modules/grasp_generation.py \
+  --n_points 120000 --keep 200 --nms True
+```
+
+All scripts read keys from `config/config.yaml`; set `OPENREAL2SIM_KEYS` or pass `--key` through `motion_manager.py` to limit processing.
+
+---
+
+## Outputs
+
+Per key, after running the pipeline you should find:
+
+- `outputs//scene/scene.pkl`
+  - `motion.hand_kpts`: `[N,21,2]` MANO keypoints (image plane)
+  - `motion.hand_global_orient`: rotation matrices per frame
+  - `motion.hand_masks`: binary hand silhouettes
+- `outputs//simulation/scene.json`
+  - `manipulated_oid`, `traj_key`, `start_frame_idx`, `task_type`, `start_related`, `end_related`, `gripper_closed`
+  - `objects[oid].final_trajs`
+  - `objects[manipulated_oid].grasp_point`, `grasp_direction`, `grasps`, `rescored_grasps`
+- `outputs//motion/scene_end.glb`: fused background + final manipulated mesh.
+- `outputs//grasps/*.npy`: raw GraspNet candidates per object. +- `outputs//simulation/debug/grasp_point_visualization_*.png`: sanity-check renders. + +These artifacts are consumed by the Isaac Lab / ManiSkill / MuJoCo simulation runners during trajectory replay and policy datasets export. + +--- + diff --git a/openreal2sim/motion/modules/demo_motion_process.py b/openreal2sim/motion/modules/demo_motion_process.py new file mode 100755 index 0000000..96a880a --- /dev/null +++ b/openreal2sim/motion/modules/demo_motion_process.py @@ -0,0 +1,373 @@ + + +#### + +### assign object types, determine start and end frame, downsample traj, compute stacking sequence and assign mask. +### TODO: 加一个ending的mesh进来 就这样。 +### A visualizer left. +import yaml +import numpy as np +from pyquaternion import Quaternion +from pathlib import Path +import pickle +import cv2 +import trimesh +import sys +import json +import os +base_dir = Path.cwd() +sys.path.append(str(base_dir / "openreal2sim" / "motion" / "modules")) + +def recompute_traj(traj, c2w, extrinsics): + for i in range(len(traj)): + traj[i] = c2w @ np.linalg.inv(extrinsics[0]) @ extrinsics[i] @ np.linalg.inv(c2w) @ traj[i] + return traj + +def recompute_all_traj(scene_json_dict, scene_dict): + for obj_id, obj in scene_json_dict["objects"].items(): + for key in 'fdpose_trajs', 'simple_trajs', 'hybrid_trajs': + traj_path = obj[key] + traj = np.load(traj_path) + traj = traj.reshape(-1, 4, 4) + c2w = np.array(scene_json_dict["camera"]["camera_opencv_to_world"]).astype(np.float64).reshape(4,4) + extrinsics = np.array(scene_dict["extrinsics"]).astype(np.float64).reshape(-1,4,4) + traj = recompute_traj(traj, c2w, extrinsics) + new_traj_path = traj_path.replace(".npy", "_recomputed.npy").replace("reconstruction", "motion") + scene_json_dict["objects"][obj_id][f"{key}_recomputed"] = str(new_traj_path) + np.save(new_traj_path, traj) + + +def get_mask_from_frame(scene_dict, frame_index, oid): + mask = scene_dict["mask"][frame_index][oid]["mask"] + return mask + + +def get_object_bbox(mesh_in_path): + mesh = trimesh.load(mesh_in_path) + bbox = mesh.bounds + return bbox + + + +def lift_traj(traj, height=0.02): + new_traj = [] + for i in range(len(traj)): + new_traj.append(traj[i]) + for i in range(1, len(new_traj) - 1): + new_traj[i][2][3] += height + return new_traj + +def determine_stacking_bbox(bbox_0, bbox_1): + """ + Determine if two bboxes have intersection on x and y axes. + + bbox_0, bbox_1: np.ndarray, (2,3) or (2,2) or (2,?) where axis 0 is (min,max) and axis 1 is x,y,(z) + + Returns: True if x and y intervals overlap (i.e. stacking is possible), False otherwise + + Generally this is enough for start frame. + """ + # Extract x/y intervals + x0_min, x0_max = bbox_0[0][0], bbox_0[1][0] + y0_min, y0_max = bbox_0[0][1], bbox_0[1][1] + x1_min, x1_max = bbox_1[0][0], bbox_1[1][0] + y1_min, y1_max = bbox_1[0][1], bbox_1[1][1] + z0_min, z0_max = bbox_0[0][2], bbox_0[1][2] + z1_min, z1_max = bbox_1[0][2], bbox_1[1][2] + + x_overlap = not (x0_max < x1_min or x1_max < x0_min) + y_overlap = not (y0_max < y1_min or y1_max < y0_min) + if x_overlap and y_overlap: + return True + + return False + + + +def load_obj_masks(data: dict): + """ + Return object list for frame-0: + [{'mask': bool array, 'name': name, 'bbox': (x1,y1,x2,y2)}, ...] 
+ Filter out names: 'ground' / 'hand' / 'robot' + """ + frame_objs = data.get(0, {}) # only frame 0 + objs = [] + for oid, item in frame_objs.items(): + name = item["name"] + if name in ("ground", "hand", "robot"): + continue + objs.append({ + "oid": oid, + "mask": item["mask"].astype(bool), + "name": name, + "bbox": item["bbox"] # used for cropping + }) + # Keep original behavior: sort by mask area (desc) + objs.sort(key=lambda x: int(x["oid"])) + return objs + + + +def pose_distance(T1, T2): + # Translation distance + t1 = T1[:3, 3] + t2 = T2[:3, 3] + trans_dist = np.linalg.norm(t2 - t1) + # Rotation difference (angle) + R1 = T1[:3, :3] + R2 = T2[:3, :3] + dR = R2 @ R1.T + # numerical errors can make the trace slightly out of bounds + cos_angle = (np.trace(dR) - 1) / 2 + cos_angle = np.clip(cos_angle, -1.0, 1.0) + angle = np.arccos(cos_angle) + return trans_dist, angle + +def build_mask_array( + oid: int, + scene_dict: dict[str, any] +) -> np.ndarray: + + H, W, N = scene_dict["height"], scene_dict["width"], scene_dict["n_frames"] + out = np.zeros((N, H, W), dtype=np.uint8) + for i in range(N): + m = scene_dict["mask"][i][oid]["mask"] + out[i] = m.astype(np.uint8) + return out + + + +def downsample_traj(trajectory, trans_threshold=0.05, rot_threshold=np.radians(20)): + """ + Downsample pose trajectory where each element is a 4x4 transformation matrix (SE3). + Retains keyframes with sufficient translation or rotation from the previous kept frame. + Returns: list of indices for retained frames. + """ + if len(trajectory) <= 2: + return list(range(len(trajectory))) + + + downsampled_indices = [0] + prev_idx = 0 + for i in range(1, len(trajectory) - 1): + trans_dist, angle = pose_distance(trajectory[prev_idx], trajectory[i]) + if trans_dist >= trans_threshold or angle >= rot_threshold: + downsampled_indices.append(i) + prev_idx = i + if len(downsampled_indices) > 0: + last_kept_idx = downsampled_indices[-1] + trans_dist, angle = pose_distance(trajectory[last_kept_idx], trajectory[-1]) + if trans_dist > 0.6 * trans_threshold or angle > 0.6 * rot_threshold: + downsampled_indices.append(len(trajectory) - 1) + elif trans_dist > 0.1 * trans_threshold or angle > 0.1 * rot_threshold: + downsampled_indices[-1] = len(trajectory) - 1 + + return downsampled_indices + +def refine_end_frame(hand_masks, object_masks): + N = min(len(hand_masks), len(object_masks)) + kernel = np.ones((10, 10), np.uint8) + for i in range(N-1, -1, -1): + if hand_masks[i] is None: + continue + hand_mask = hand_masks[i].astype(np.uint8) + obj_mask = object_masks[i].astype(np.uint8) + obj_mask_dilated = cv2.dilate(obj_mask, kernel, iterations=1) + overlap = (hand_mask & obj_mask_dilated).sum() + if overlap > 0: + return i + 1 + return 0 + + +def demo_motion_process(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + for key in keys: + scene_dict = key_scene_dicts[key] + scene_json_dict_path = base_dir / f"outputs/{key}/simulation/scene.json" + with open(scene_json_dict_path, "r") as f: + scene_json_dict = json.load(f) + recompute_all_traj(scene_json_dict, scene_dict) + key_cfg = key_cfgs[key] + max_placement_oid = None + max_placement_distance = 0.0 + placement_distances = {} + fd_traj_dict = {} + for obj_id, obj in scene_json_dict["objects"].items(): + obj_traj = obj["simple_trajs_recomputed"] + obj_traj = np.load(obj_traj) + obj_traj = obj_traj.reshape(-1, 4, 4) + abs_distance, _ = pose_distance(obj_traj[0], obj_traj[-1]) + fd_traj = obj["fdpose_trajs_recomputed"] + fd_traj = np.load(fd_traj) + fd_traj = 
fd_traj.reshape(-1, 4, 4) + fd_dist, fd_angle = pose_distance(fd_traj[0], fd_traj[-1]) + + fd_traj_dict[obj_id] = { + "fd_distance": fd_dist, + "fd_angle": fd_angle + } + print(f"Object: {obj_id}, distance: {abs_distance}") + if abs_distance > max_placement_distance: + max_placement_distance = abs_distance + max_placement_oid = obj_id + scene_json_dict["objects"][obj_id]["type"] = "static" + placement_distances[obj_id] = abs_distance + scene_json_dict["manipulated_oid"] = max_placement_oid + scene_dict["info"]["manipulated_oid"] = max_placement_oid + name = scene_json_dict["objects"][max_placement_oid]["name"] + scene_json_dict["objects"][max_placement_oid]["type"] = "manipulated" + print(f"Manipulated object: {max_placement_oid}, distance: {abs_distance}") + manipulated_fd_trajs_path = scene_json_dict["objects"][max_placement_oid]["fdpose_trajs_recomputed"] + manipulated_fd_trajs = np.load(manipulated_fd_trajs_path) + angle_displacement = [pose_distance(manipulated_fd_trajs[i], manipulated_fd_trajs[i+1])[1] for i in range(len(manipulated_fd_trajs)-1)] + angle_displacement_sum = sum(angle_displacement) + # manipulated_simple_trajs_path = scene_json_dict["objects"][max_placement_oid]["simple_trajs_recomputed"] + # manipulated_simple_trajs = np.load(manipulated_simple_trajs_path) + # pos_displacement = [pose_distance(manipulated_simple_trajs[i], manipulated_fd_trajs[i])[0] for i in range(len(manipulated_fd_trajs))] + fd_traj_dict.pop(max_placement_oid) + traj_key = 'fdpose_trajs_recomputed' + if len(fd_traj_dict.keys()) > 0: + max_fd_distance = max(fd_traj_dict.values(), key=lambda x: x["fd_distance"]) + max_fd_angle = max(fd_traj_dict.values(), key=lambda x: x["fd_angle"]) + else: + max_fd_distance = 0.0 + max_fd_angle = 0.0 + if angle_displacement_sum > 180 or max_fd_angle > np.radians(20): + traj_key = 'simple_trajs_recomputed' + else: + if max_fd_distance > 0.05: + traj_key = 'hybrid_trajs_recomputed' + print(f"Traj key: {traj_key}") + scene_json_dict["traj_key"] = traj_key + scene_dict["info"]["traj_key"] = traj_key + manipulated_trajs_path = scene_json_dict["objects"][max_placement_oid][traj_key] + manipulated_trajs = np.load(manipulated_trajs_path) + object_masks = build_mask_array(int(max_placement_oid), scene_dict) + hand_masks = scene_dict["motion"]["hand_masks"] + end_frame = refine_end_frame(hand_masks, object_masks) + print(f"End frame: {end_frame}") + trajs = manipulated_trajs + if end_frame == len(manipulated_trajs) - 1: + scene_json_dict["gripper_closed"] = True + elif end_frame == 0: + if np.abs(manipulated_trajs[0][2][3]) < 0.03: + scene_json_dict["gripper_closed"] = False + else: + scene_json_dict["gripper_closed"] = True + else: + scene_json_dict["gripper_closed"] = False + #trajs = manipulated_trajs[:end_frame] + + + downsampled_indices = downsample_traj(trajs, trans_threshold=0.03, rot_threshold=np.radians(20)) + scene_json_dict["chosen_indices"] = downsampled_indices + trajs = trajs[downsampled_indices] + + + trajs = lift_traj(trajs, height=0.02) + print(f"Downsampled indices: {[downsampled_indices]}") + downsampled_traj_path = base_dir / f"outputs/{key}/simulation" / f"{max_placement_oid}_final_traj.npy" + + + print(f"Trajs: {trajs[-1]}") + np.save(downsampled_traj_path, trajs) + scene_json_dict["objects"][max_placement_oid]["final_trajs"] = str(downsampled_traj_path) + #scene_json_dict["start_frame_idx"] = downsampled_indices[1] + scene_dict["info"]["start_frame_idx"] = downsampled_indices[1] + end_pose = trajs[-1] + mesh_in_path = 
scene_json_dict["objects"][max_placement_oid]["optimized"] + mesh = trimesh.load(mesh_in_path) + mesh.apply_transform(end_pose) + path = base_dir / f"outputs/{key}/simulation/{max_placement_oid}_{name}_end.glb" + mesh.export(path) + scene_json_dict["objects"][max_placement_oid]["end_mesh"] = str(path) + bbox_end = get_object_bbox(path) + + start_related = [] + end_related = [] + mesh_in_path = scene_json_dict["objects"][max_placement_oid]["optimized"] + bbox_start = get_object_bbox(mesh_in_path) + for oid, obj in scene_json_dict["objects"].items(): + if not isinstance(oid, int): + continue + if oid == max_placement_oid: + continue + mesh_in_path = obj["optimized"] + bbox = get_object_bbox(mesh_in_path) + if determine_stacking_bbox(bbox_start, bbox): + start_related.append(oid) + if determine_stacking_bbox(bbox_end, bbox): + end_related.append(oid) + + print(f"Start related: {start_related}") + print(f"End related: {end_related}") + + scene_json_dict["start_related"] = list(start_related) + scene_json_dict["end_related"] = list(end_related) + + scene_path = scene_json_dict["scene_mesh"]["optimized"] + scene_mesh = trimesh.load(scene_path) + scene_mesh = scene_mesh + mesh + if not os.path.exists(base_dir / f"outputs/{key}/motion"): + os.makedirs(base_dir / f"outputs/{key}/motion") + path = base_dir / f"outputs/{key}/motion/scene_end.glb" + scene_mesh.export(path) + + + + if len(end_related) > 0 : + target_oid = end_related[0] + scene_json_dict["target_oid"] = target_oid + scene_json_dict["task_type"] = "targetted_pick_place" + if len(scene_json_dict["task_desc"]) > 0: + task_desc = "Pick up the " + name + " and place it on the " + target_oid + "." + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + else: + task_desc = "Pick up the " + name + " and place it on the " + target_oid + "." + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"] = [task_desc] + else: + if not scene_json_dict["gripper_closed"]: + scene_json_dict["task_type"] = "simple_pick_place" + task_desc = "Pick up the " + name + " and place it on the ground." + if len(scene_json_dict.get("task_desc", [])) == 0: + scene_json_dict["task_desc"] = [task_desc] + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + else: + scene_json_dict["task_type"] = "simple_pick" + task_desc = "Pick up the " + name + "." 
+ if len(scene_json_dict.get("task_desc", [])) == 0: + scene_json_dict["task_desc"] = [task_desc] + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + + print(f"Task description: {scene_json_dict['task_desc']}") + print(f"Task type: {scene_json_dict['task_type']}") + + with open(base_dir / f"outputs/{key}/simulation/scene.json", "w") as f: + json.dump(scene_json_dict, f, indent=2) + with open(base_dir / f"outputs/{key}/scene/scene.pkl", "wb") as f: + pickle.dump(scene_dict, f) + return key_scene_dicts + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + demo_motion_process(keys, key_scene_dicts, key_cfgs) + + + diff --git a/openreal2sim/motion/modules/grasp_generation.py b/openreal2sim/motion/modules/grasp_generation.py new file mode 100755 index 0000000..f0ec412 --- /dev/null +++ b/openreal2sim/motion/modules/grasp_generation.py @@ -0,0 +1,313 @@ +import os +import sys +import numpy as np +import time +import torch +from typing import List +import open3d as o3d +from graspnetAPI.grasp import GraspGroup +from pathlib import Path +base_dir = Path.cwd() +sys.path.append(str(base_dir / "third_party")) +from graspness_unofficial.models.graspnet import GraspNet, pred_decode +from graspness_unofficial.dataset.graspnet_dataset import minkowski_collate_fn +from graspness_unofficial.utils.collision_detector import ModelFreeCollisionDetector +from graspness_unofficial.utils.data_utils import CameraInfo, create_point_cloud_from_depth_image, get_workspace_mask +from scipy.spatial.transform import Rotation as R +from utils.grasp_utils import * +from utils.compose_config import compose_configs +import argparse +import json +import yaml + + +def sample_surface(glb_path: str, n_points: int = 5000) -> np.ndarray: + """Load mesh and sample surface points.""" + mesh = trimesh.load_mesh(glb_path) + if isinstance(mesh, trimesh.Scene): + mesh = list(mesh.geometry.values())[0] + pts, _ = trimesh.sample.sample_surface(mesh, n_points) + return np.array(pts, dtype=np.float32) + + +def crop_pointcloud_quadrant( + points, + quadrant=1, + axes="xy", +): + # Map axis letters to indices + axis_index = {"x": 0, "y": 1, "z": 2} + a1, a2 = axes[0], axes[1] + idx1, idx2 = axis_index[a1], axis_index[a2] + + center_full = points.mean(axis=0) + + c1, c2 = center_full[idx1], center_full[idx2] + + ge = np.greater # > + + # Build boolean masks for the four quadrants + # Right/Left split along axis1, Top/Bottom split along axis2 + right = ge(points[:, idx1], c1) # axis1 >= c1 (or > c1) + top = ge(points[:, idx2], c2) # axis2 >= c2 (or > c2) + + # Quadrant mapping per convention documented above + if quadrant == 1: + mask = right & top + elif quadrant == 2: + mask = (~right) & top + elif quadrant == 3: + mask = (~right) & (~top) + elif quadrant == 4: + mask = right & (~top) + else: + raise ValueError("`quadrant` must be an integer in {1,2,3,4}.") + + cropped_points = points[mask] + return cropped_points, mask + + +def read_score_safe(gg, i: int, fallback: float) -> float: + """Try to read candidate score; fall back to a 
synthetic rank-based value.""" + try: + return float(getattr(gg[i], "score")) + except Exception: + pass + try: + return float(gg.grasp_group_array[i][0]) + except Exception: + return fallback + + +def grasps_to_pointcloud( + gg, pts_per_gripper: int = 400, color=(1.0, 0.0, 0.0) +) -> o3d.geometry.PointCloud: + """Sample each gripper mesh to points and tint with color.""" + geoms = gg.to_open3d_geometry_list() # list of TriangleMesh + out = o3d.geometry.PointCloud() + for g in geoms: + out += g.sample_points_uniformly(pts_per_gripper) + if len(out.points) > 0: + out.colors = o3d.utility.Vector3dVector( + np.tile(np.array(color, dtype=np.float32), (len(out.points), 1)) + ) + return out + + +def save_vis_ply( + points_xyz: np.ndarray, + gg, + save_path: Path, + pts_per_gripper: int = 400, + cloud_color=(0.0, 1.0, 0.0), +): + """Write a PLY: green object cloud + red ALL grasp candidates.""" + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points_xyz.astype(np.float32)) + cloud.colors = o3d.utility.Vector3dVector( + np.tile(np.array(cloud_color, dtype=np.float32), (len(cloud.points), 1)) + ) + grasp_pcd = grasps_to_pointcloud( + gg, pts_per_gripper=pts_per_gripper, color=(1.0, 0.0, 0.0) + ) + merged = cloud + grasp_pcd + save_path.parent.mkdir(parents=True, exist_ok=True) + o3d.io.write_point_cloud(str(save_path), merged) + print(f"[VIS] saved {save_path}") + + +def process_one_object( + gg_net: MyGraspNet, + obj_idx: int, + obj_dict: dict, + proposals_dir: Path, + keep: int | None, + nms: bool, + n_points: int, + overwrite: bool, + vis_pts_per_gripper: int = 400, +) -> str | None: + """ + Build full point cloud for one object, run GraspNet, export all candidates to NPZ, + save a PLY visualization, and return the NPZ path (string). 
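+
+    Assumed array layout (per graspnetAPI's GraspGroup): each saved row has 17 floats,
+    [score, width, height, depth, rotation_matrix (9, row-major), translation (3),
+    object_id]; the sign-flip loop below rewrites columns 4:13 and 13:16 in place.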
+ """ + glb_path = obj_dict["optimized"] + if glb_path is None or (not os.path.exists(glb_path)): + print( + f"[WARN][{obj_dict['oid']}_{obj_dict['name']}] mesh not found -> {glb_path}" + ) + return None + + npy_path = proposals_dir / f"{obj_dict['oid']}_{obj_dict['name']}_grasp.npy" + vis_path = proposals_dir / f"{obj_dict['oid']}_{obj_dict['name']}_grasp_viz.ply" + + # Build cloud & inference + pcs = sample_surface(glb_path, n_points=n_points) + # pcs = crop_pointcloud_quadrant(pcs, quadrant=3, axes="xy")[0] + pcs[..., 2] *= -1.0 # keep identical convention + gg = gg_net.inference(pcs) + if nms: + gg = gg.nms() + gg = gg.sort_by_score() + if isinstance(keep, int) and keep is not None and len(gg) > keep: + gg = gg[:keep] + if len(gg) == 0: + print(f"[WARN][{obj_dict['oid']}_{obj_dict['name']}] zero proposals") + return None + + # Save visualization + save_vis_ply(pcs, gg, vis_path, pts_per_gripper=vis_pts_per_gripper) + + for g_i in range(len(gg)): + translation = gg[g_i].translation + rotation = gg[g_i].rotation_matrix + translation = np.array([translation[0], translation[1], -translation[2]]) + rotation[2, :] = -rotation[2, :] + rotation[:, 2] = -rotation[:, 2] + gg.grasp_group_array[g_i][13:16] = translation + gg.grasp_group_array[g_i][4:13] = rotation.reshape(-1) + + gg.save_npy(npy_path) + print( + f"[OK][{obj_dict['oid']}_{obj_dict['name']}] saved {len(gg.grasp_group_array)} -> {npy_path.name}" + ) + + return str(npy_path.resolve()), gg + + +def run_for_key( + key: str, + n_points: int, + keep: int | None, + nms: bool, + overwrite: bool, + vis_pts_per_gripper: int, + strategy: str, +): + base_dir = Path.cwd() + out_dir = base_dir / "outputs" + scene_json = out_dir / key / "simulation" / "scene.json" + if not scene_json.exists(): + raise FileNotFoundError(scene_json) + + # Load scene.json once + scene_dict = json.load(open(scene_json)) + objects = scene_dict.get("objects", {}) + if not isinstance(objects, dict) or len(objects) == 0: + print(f"[WARN][{key}] scene_dict['objects'] is empty.") + return + + # Prepare save dir + key_dir = out_dir / key + proposals_dir = key_dir / "grasps" + proposals_dir.mkdir(parents=True, exist_ok=True) + + # Single GraspNet instance reused for all objects + net = MyGraspNet() + + manipulated_oid = scene_dict["manipulated_oid"] + # Loop all objects + for i, obj in objects.items(): + npy_path, gg = process_one_object( + gg_net=net, + obj_idx=i, + obj_dict=obj, + proposals_dir=proposals_dir, + keep=keep, + nms=nms, + n_points=n_points, + overwrite=overwrite, + vis_pts_per_gripper=vis_pts_per_gripper, + ) + if npy_path is not None: + scene_dict["objects"][i]["grasps"] = npy_path + with open(scene_json, "w") as f: + json.dump(scene_dict, f, indent=2) + print( + f"[OK][{key}] scene.json updated with 'grasps' for {obj['oid']}_{obj['name']}." 
+ ) + if int(obj["oid"]) == int(manipulated_oid): + if scene_dict["objects"][i]["grasp_point"] is not None and strategy == "point_and_direction": + grasp_point = scene_dict["objects"][i]["grasp_point"] + grasp_direction = np.asarray(scene_dict["objects"][i]["grasp_direction"], dtype=np.float32) + if grasp_direction.shape == (3, 3): + grasp_direction = grasp_direction[:, 0] + grasp_direction = np.asarray(grasp_direction, dtype=np.float32).reshape(-1) + if grasp_direction.shape[0] != 3 or np.linalg.norm(grasp_direction) < 1e-6: + print(f"[WARN][{key}] Invalid grasp_direction for {obj['oid']}_{obj['name']} – falling back to default Z.") + grasp_dir_hint = None + else: + grasp_dir_hint = (grasp_direction / np.linalg.norm(grasp_direction)).tolist() + gg = get_best_grasp_with_hints(gg, grasp_point, grasp_dir_hint) + elif scene_dict["objects"][i]["grasp_point"] is not None and strategy == "point_only": + grasp_point = scene_dict["objects"][i]["grasp_point"] + gg = get_best_grasp_with_hints(gg, grasp_point, [0,0,1]) + else: + z = [0,0,1] # default direction + gg = get_best_grasp_with_hints(gg, None, z) + npy_path = npy_path.replace(".npy", "_rescored.npy") + gg.save_npy(npy_path) + if npy_path is not None: + scene_dict["objects"][i]["rescored_grasps"] = npy_path + with open(scene_json, "w") as f: + json.dump(scene_dict, f, indent=2) + + +def grasp_generation(keys): + cfg_path = Path.cwd() / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + for key in keys: + cfg = key_cfgs[key] + print(f"\n========== [GraspDet] Processing key: {key} ==========") + run_for_key( + key=key, + n_points=cfg["n_points"], + keep=cfg["keep"], + nms=cfg["nms"], + overwrite=cfg["overwrite"], + vis_pts_per_gripper=cfg["vis_pts_per_gripper"], + strategy=cfg["rescore_strategy"], + ) + + print('[Info] Grasp generation completed.') + + +def main(): + parser = argparse.ArgumentParser( + "Export grasp proposals for ALL objects (batch over keys) and write paths into scene.json" + ) + parser.add_argument("--n_points", type=int, default=100000) + parser.add_argument("--keep", type=int, default=None) # set None to keep all + parser.add_argument("--nms", type=bool, default=True) + parser.add_argument("--overwrite", action="store_true") + parser.add_argument( + "--vis_pts_per_gripper", + type=int, + default=400, + help="points sampled per gripper mesh for PLY", + ) + args = parser.parse_args() + + # load keys from YAML + cfg_path = Path.cwd() / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + + for key in keys: + print(f"\n========== [GraspDet] Processing key: {key} ==========") + key_cfgs = compose_configs(key, cfg) + run_for_key( + key=key, + n_points=args.n_points, + keep=args.keep, + nms=args.nms, + overwrite=args.overwrite, + vis_pts_per_gripper=args.vis_pts_per_gripper, + key_cfgs=key_cfgs, + ) + + +if __name__ == "__main__": + main() diff --git a/openreal2sim/motion/modules/grasp_point_extraction.py b/openreal2sim/motion/modules/grasp_point_extraction.py new file mode 100755 index 0000000..d0f0eb0 --- /dev/null +++ b/openreal2sim/motion/modules/grasp_point_extraction.py @@ -0,0 +1,434 @@ +### This part is used to generate the affordance of the object. +### Include: +### 1. Grasp point generation. +### 2. Affordance map generation. Mainly for articulated object. This might be further combined with a PartSLIP network. +### TODO: Human annotation. 
+### The logic we use for grasp point generation here is: we select the frist frame of object-hand contact, compute the contact point, overlay the object and extract the 3D coordinate of the point. +### Be careful that we transfer this point to the first frame using pose matrices and extrinsics. +### TODO: This might be helpful to object selection in the first step. +import pickle +import pathlib +from pathlib import Path +import yaml +import os +import sys +import torch +import numpy as np +import cv2 +import imageio +import json +import pytorch3d +from pytorch3d.structures import Meshes +from pytorch3d.renderer import ( + look_at_view_transform, + PerspectiveCameras, + RasterizationSettings, + MeshRenderer, + MeshRasterizer, + SoftPhongShader, + PointLights, + TexturesVertex, + BlendParams, +) +from pytorch3d.utils.camera_conversions import cameras_from_opencv_projection +import trimesh + + +def create_pytorch3d_camera(K, T_c2w, img_size, device="cuda:0"): + """ + Create a PyTorch3D camera from OpenCV-style camera parameters. + + Args: + K: (3, 3) camera intrinsic matrix + T_c2w: (4, 4) camera-to-world transformation matrix + img_size: (H, W) or int, image size + device: device string + + Returns: + PerspectiveCameras object + """ + device = torch.device(device if torch.cuda.is_available() else "cpu") + + # Parse image size + if isinstance(img_size, int): + H, W = img_size, img_size + else: + H, W = img_size + image_size = torch.tensor([[H, W]], dtype=torch.float32, device=device) + + # Convert camera-to-world to world-to-camera for OpenCV convention + # OpenCV projection: x_screen = K @ (R_w2c @ x_world + tvec_w2c) + T_w2c = np.linalg.inv(T_c2w) + R_w2c = T_w2c[:3, :3] # world-to-camera rotation + tvec_w2c = T_w2c[:3, 3] # world-to-camera translation + + # Convert to torch tensors with batch dimension + R_w2c_torch = torch.tensor(R_w2c, dtype=torch.float32, device=device).unsqueeze(0) # (1, 3, 3) + tvec_w2c_torch = torch.tensor(tvec_w2c, dtype=torch.float32, device=device).unsqueeze(0) # (1, 3) + K_torch = torch.tensor(K, dtype=torch.float32, device=device).unsqueeze(0) # (1, 3, 3) + + # Create PyTorch3D camera using OpenCV convention + camera = cameras_from_opencv_projection( + R=R_w2c_torch, + tvec=tvec_w2c_torch, + camera_matrix=K_torch, + image_size=image_size, + ) + R = camera.R.cpu().numpy() + R = R.reshape(3, 3) + T = camera.T.cpu().numpy() + T = T.reshape(3,) + fx = K[0,0] + fy = K[1,1] + cx = K[0,2] + cy = K[1,2] + height, width = image_size[0].cpu().numpy() + camera = PerspectiveCameras( + device=device, + focal_length=torch.tensor([[fx, fy]], dtype=torch.float32, device=device), + principal_point=torch.tensor([[cx, cy]], dtype=torch.float32, device=device), + R=[R], + T=[T], + image_size=torch.tensor([[height, width]], dtype=torch.float32, device=device), + in_ndc=False, + ) + return camera + + +def render_pytorch3d_rgbd(mesh, K, T_c2w, img_size, device="cuda:0"): + """ + Given a trimesh mesh (in world coordinate system), render using pytorch3d. + Args: + mesh: trimesh.Trimesh, should be in world coordinate system. + K: (3,3) camera intrinsic matrix, fx, fy, cx, cy. + T_c2w: (4,4) camera-to-world transformation matrix. + img_size: (H, W), output image (and depth) size. 
+ device: device string + Returns: + mask_img: (H,W) uint8 binary mask (1 for foreground, 0 for background) + depth_img: (H,W) float32 (Z buffer, 0 for background) + """ + if not isinstance(mesh, trimesh.Trimesh): + if hasattr(mesh, 'geometry') and len(mesh.geometry) > 0: + mesh = list(mesh.geometry.values())[0] + else: + raise ValueError('mesh is not a valid trimesh.Trimesh!') + device = torch.device(device if torch.cuda.is_available() else "cpu") + + verts = torch.tensor(np.asarray(mesh.vertices), dtype=torch.float32, device=device) + faces = torch.tensor(np.asarray(mesh.faces), dtype=torch.int64, device=device) + verts_rgb = torch.ones_like(verts)[None] * torch.tensor([[0.7, 0.7, 0.7]], dtype=torch.float32, device=device) + from pytorch3d.renderer import TexturesVertex + textures = TexturesVertex(verts_features=verts_rgb) + + mesh_p3d = Meshes(verts=[verts], faces=[faces], textures=textures) + + # Create camera using unified helper function + cameras = create_pytorch3d_camera(K, T_c2w, img_size, device=device) + + # Get image dimensions for rasterization + if isinstance(img_size, int): + H, W = img_size, img_size + else: + H, W = img_size + + blend_params = BlendParams(sigma=1e-4, gamma=1e-4) + raster_settings = RasterizationSettings( + image_size=(H, W), + blur_radius=0.0, + faces_per_pixel=8, + bin_size=1024, + max_faces_per_bin=1000000, + ) + lights = PointLights(device=device, location=[[0.0, 0.0, 5.0]]) + + renderer = MeshRenderer( + rasterizer=MeshRasterizer( + cameras=cameras, + raster_settings=raster_settings + ), + shader=SoftPhongShader( + device=device, + cameras=cameras, + lights=lights, + blend_params=blend_params + ), + ) + # Render RGB (used only for mask) + img = renderer(mesh_p3d).detach().cpu().numpy()[0][:,:,:3].clip(0, 1) * 255 + img = img.astype(np.uint8) + # Depth renderer + class DepthShader(torch.nn.Module): + def __init__(self, device="cpu"): + super().__init__() + self.device = device + def forward(self, fragments, meshes, **kwargs): + return fragments.zbuf + depth_renderer = MeshRenderer( + rasterizer=MeshRasterizer( + cameras=cameras, + raster_settings=raster_settings + ), + shader=DepthShader(device=device) + ) + depth = depth_renderer(mesh_p3d)[0, ..., 0].detach().cpu().numpy() # (H, W) + mask = (depth > 0).astype(np.uint8) + return img, mask, depth + + +def find_nearest_point(point, object_mask, erosion_size=8): + """ + Given a point and an object binary mask (H,W), return the nearest point in the mask, + preferring points deeper inside the mask (away from edges). 
+ + Args: + point: (x, y) point coordinates + object_mask: (H, W) binary mask + erosion_size: size of erosion kernel to remove edges (default: 5) + + Returns: + np.array([x, y]) or None if no valid point found + """ + H, W = object_mask.shape + # Convert mask to uint8 for cv2 operations + mask_uint8 = (object_mask * 255).astype(np.uint8) + + # Erode the mask to remove edges and get inner region + kernel = np.ones((erosion_size, erosion_size), np.uint8) + eroded_mask = cv2.erode(mask_uint8, kernel, iterations=1) + eroded_mask = (eroded_mask > 0).astype(bool) + + # Try to find point in eroded mask first (deeper inside) + ys, xs = np.where(eroded_mask) + if len(xs) > 0: + dists = np.sqrt((xs - point[0]) ** 2 + (ys - point[1]) ** 2) + min_idx = np.argmin(dists) + return np.array([xs[min_idx], ys[min_idx]]) + + # If eroded mask is empty, fall back to original mask + ys, xs = np.where(object_mask) + if len(xs) == 0: + return None + dists = np.sqrt((xs - point[0]) ** 2 + (ys - point[1]) ** 2) + min_idx = np.argmin(dists) + return np.array([xs[min_idx], ys[min_idx]]) + +def get_bbox_mask_from_mask(mask): + ys, xs = np.where(mask) + if len(xs) == 0: + return None + x_min = np.min(xs) + x_max = np.max(xs) + y_min = np.min(ys) + y_max = np.max(ys) + return x_min, x_max, y_min, y_max + +def compute_contact_point(kpts_2d, object_mask): + """ + Given 2D keypoints (N,2) and an object binary mask (H,W), return the mean keypoint location [x, y] + of all keypoints that fall inside the mask. + """ + kpts_2d = np.asarray(kpts_2d) + H, W = object_mask.shape + inside = [] + for kp in kpts_2d: + x, y = int(round(kp[0])), int(round(kp[1])) + if 0 <= y < H and 0 <= x < W: + if object_mask[y, x]: + inside.append(kp) + if len(inside) > 0: + point = np.mean(np.stack(inside, axis=0), axis=0) + px, py = int(round(point[0])), int(round(point[1])) + if 0 <= py < H and 0 <= px < W and object_mask[py, px]: + return point, True, True + else: + return point, True, False + else: + return np.mean(kpts_2d, axis=0), False, False + +def bbox_to_mask(bbox): + x, y, w, h = bbox + mask = np.zeros((h, w)) + mask[y:y+h, x:x+w] = True + return mask + + +def visualize_grasp_points(image, kpts_2d, contact_point, output_path): + """ + Simple visualization of fingertip keypoints and contact point on image. 
+ + Args: + image: Input image (H, W, 3) uint8 or float + kpts_2d: Fingertip keypoints array (N, 2) + contact_point: Contact point (2,) + output_path: Path to save visualization + """ + # Prepare image + vis_image = image.copy() + if vis_image.dtype != np.uint8: + if vis_image.max() <= 1.0: + vis_image = (vis_image * 255).astype(np.uint8) + else: + vis_image = vis_image.astype(np.uint8) + + # Convert RGB to BGR for OpenCV + if vis_image.shape[-1] == 3: + vis_image = cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR) + + H, W = vis_image.shape[:2] + + # Draw fingertip keypoints (blue) + kpts_2d = np.asarray(kpts_2d) + for kp in kpts_2d: + kp_x, kp_y = int(round(kp[0])), int(round(kp[1])) + if 0 <= kp_y < H and 0 <= kp_x < W: + cv2.circle(vis_image, (kp_x, kp_y), 1, (255, 0, 0), -1) # Blue filled circle + + # Draw contact point (red) + cp_x, cp_y = int(round(contact_point[0])), int(round(contact_point[1])) + if 0 <= cp_y < H and 0 <= cp_x < W: + cv2.circle(vis_image, (cp_x, cp_y), 1, (0, 0, 255), -1) # Red filled circle + + # Convert back to RGB and save + vis_image_rgb = cv2.cvtColor(vis_image, cv2.COLOR_BGR2RGB) + imageio.imwrite(str(output_path), vis_image_rgb) + print(f"[Info] Saved grasp point visualization to: {output_path}") + + +def grasp_point_extraction(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + for key in keys: + scene_dict = key_scene_dicts[key] + scene_dict["key"] = key # Store key in scene_dict for visualization + key_cfg = key_cfgs[key] + single_grasp_point_generation(scene_dict, key_cfg, key, base_dir) + key_scene_dicts[key] = scene_dict + with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + return key_scene_dicts + +def single_grasp_point_generation(scene_dict, key_cfg, key, base_dir): + """Generate grasp point for a single scene.""" + # Setup + gpu_id = key_cfg["gpu"] + device = f"cuda:{gpu_id}" + start_frame_idx = scene_dict["info"]["start_frame_idx"] + i = start_frame_idx + manipulated_oid = int(scene_dict["info"]["manipulated_oid"]) + + # Load camera parameters + T_c2w = np.array(scene_dict["info"]["camera"]["camera_opencv_to_world"]).astype(np.float64).reshape(4, 4) + K = np.array(scene_dict["intrinsics"]).astype(np.float32).reshape(3, 3) + img_size = scene_dict["height"], scene_dict["width"] + + # Load object model and trajectory + model_path = scene_dict["info"]["objects"][manipulated_oid]["optimized"] + model = trimesh.load(model_path) + + traj_key = scene_dict["info"]["traj_key"].replace("_recomputed", "") + print(f"[Info] Trajectory key: {traj_key}") + traj_path = scene_dict["info"]["objects"][manipulated_oid][traj_key] + traj = np.load(traj_path).reshape(-1, 4, 4) + start_pose = traj[start_frame_idx] + + # Transform model to world coordinate system + model = model.apply_transform(start_pose) + + # Render object to get mask and depth + rendered_image, mask_img, depth = render_pytorch3d_rgbd(model, K, T_c2w, img_size, device=device) + + # Find contact point from hand keypoints + kpts_2d = scene_dict["motion"]["hand_kpts"][i][[4, 8, 12, 16, 20]] + direction = scene_dict["motion"]["hand_global_orient"][i] + direction = direction.reshape(3, 3) + direction_world = T_c2w[:3, :3] @ direction + point_2d, is_close, is_inside = compute_contact_point(kpts_2d, mask_img) + if not is_close: + point_2d = find_nearest_point(point_2d, mask_img, erosion_size=key_cfg["affordance_erosion_pixel"]) + if point_2d is None: + print(f"[Error] Failed to find valid contact point in mask") + return None + + x, y = int(round(point_2d[0])), 
int(round(point_2d[1])) + + # Visualize keypoints and contact point + vis_dir = base_dir / f"outputs/{key}/motion/debug" + vis_dir.mkdir(parents=True, exist_ok=True) + vis_path = vis_dir / f"grasp_point_visualization_frame_{i:06d}.png" + visualize_grasp_points(rendered_image, kpts_2d, point_2d, vis_path) + + # Get depth value at the contact point + z = depth[y, x] if 0 <= y < depth.shape[0] and 0 <= x < depth.shape[1] and depth[y, x] > 0 else 0.0 + if z <= 0: + print(f"[Warning] Invalid depth at contact point ({x}, {y}), using nearest valid depth") + valid_mask = depth > 0 + if np.any(valid_mask): + y_coords, x_coords = np.where(valid_mask) + dists = np.sqrt((x_coords - x)**2 + (y_coords - y)**2) + nearest_idx = np.argmin(dists) + z = depth[y_coords[nearest_idx], x_coords[nearest_idx]] + else: + print(f"[Error] No valid depth found, cannot unproject point") + return None + + # Create camera object for unprojection (reuse the same helper function) + camera = create_pytorch3d_camera(K, T_c2w, img_size, device=device) + + # Unproject 2D point to 3D using PyTorch3D. The rasterized depth is an NDC z-buffer + # in [0, 1]. Convert pixel coordinates to NDC [-1, 1] before calling unproject. + if isinstance(img_size, int): + H = W = img_size + else: + H, W = img_size + xy_coords = torch.tensor([[x, y]], dtype=torch.float32, device=device) + depth_values = torch.tensor([[z]], dtype=torch.float32, device=device) + points_camera = camera.unproject_points( + xy_depth=torch.cat([xy_coords, depth_values], dim=1), + world_coordinates=False, + scaled_depth_input=False, + from_ndc=False, + ) + winner_point_3d = points_camera[0].cpu().numpy() # (3,) in camera coordinates + print(f"[Info] Winner point 3D: {winner_point_3d}") + # Convert from camera coordinates to object coordinates + # winner_point_3d is in camera coordinate system + # start_pose is object pose in world coordinate system + # To get point in object coordinate system: + # 1. camera -> world: T_c2w @ point_cam + # 2. 
world -> object: inv(start_pose) @ point_world + point_cam_homo = np.hstack([winner_point_3d, 1.0]) # (4,) + point_world = T_c2w @ point_cam_homo + point_world = point_world / point_world[3] + winner_point_to_obj = np.linalg.inv(start_pose) @ point_world + winner_point_to_obj = winner_point_to_obj[:3] / winner_point_to_obj[3] + scene_dict["info"]["objects"][manipulated_oid]["grasp_point"] = winner_point_to_obj.tolist() + scene_dict["info"]["objects"][manipulated_oid]["grasp_direction"] = direction_world.tolist() + + scene_json_path = base_dir / f"outputs/{key}/simulation/scene.json" + with open(scene_json_path, "r") as f: + scene_json = json.load(f) + scene_json["objects"][str(manipulated_oid)]["grasp_point"] = winner_point_to_obj.tolist() + scene_json["objects"][str(manipulated_oid)]["grasp_direction"] = direction_world.tolist() + with open(scene_json_path, "w") as f: + json.dump(scene_json, f, indent=2) + return winner_point_to_obj + + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + sys.path.append(str(base_dir / "openreal2sim" / "simulation")) + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + grasp_point_extraction(keys, key_scene_dicts, key_cfgs) + diff --git a/openreal2sim/motion/modules/hand_extraction.py b/openreal2sim/motion/modules/hand_extraction.py new file mode 100755 index 0000000..3686ccb --- /dev/null +++ b/openreal2sim/motion/modules/hand_extraction.py @@ -0,0 +1,191 @@ +## In this file: we use hand-object detector to decide the starting and ending frame of contact. +## Then we use wilor to extract the hand info. 
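+## Per frame we store (None when no hand is detected):
+##   hand_kpts          : (21, 2) MANO keypoints projected onto the image plane
+##   hand_global_orient : (3, 3) global wrist rotation from the WiLoR MANO prediction
+##   hand_masks         : (H, W) boolean SAM2 mask of the detected hand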
+ + +import os +import sys +import cv2 +import numpy as np +import torch +import yaml +from typing import List, Tuple, Dict, Optional +import logging +from pathlib import Path +import pickle +import logging +import tqdm +base_dir = Path.cwd() +sys.path.append(str(base_dir / "openreal2sim" / "motion" / "modules")) +sys.path.append(str(base_dir / "third_party" / "WiLoR")) +sys.path.append(str(base_dir / "third_party" / "Grounded-SAM-2")) + + +from wilor.utils.renderer import cam_crop_to_full +from wilor.models import load_wilor +from wilor.utils import recursive_to +from wilor.datasets.vitdet_dataset import ViTDetDataset +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor +from ultralytics import YOLO + +class WiLoRExtractor: + def __init__(self, + model_path: str, + cfg_path: str, + yolo_weights_path: str, + sam_weights_path: str, + sam_cfg_path: str, + device: str): + self._wilor_model, self._wilor_cfg = load_wilor(model_path, cfg_path) + self._wilor_model.eval() + self._yolo_detector = YOLO(yolo_weights_path) + self.device = torch.device(device) + self._sam_predictor = SAM2ImagePredictor(build_sam2(sam_cfg_path, sam_weights_path)) + + def process(self, images: np.ndarray, batch_size: int = 16, rescale_factor: float = 1.0): + boxes = [] + right = [] + masks = [] + self._wilor_model.to(self.device) + self._yolo_detector.to(self.device) + self._sam_predictor.model.to(self.device) + self._wilor_model.eval() + + all_global_orient = [] + all_kpts = [] + all_masks = [] + + for i in tqdm.tqdm(range(0, len(images)), desc="Detecting hands"): + batch = np.array(images)[i] + with torch.no_grad(): + detections = self._yolo_detector.predict(batch, conf=0.3, verbose=False, save=False, show=False)[0] + + if len(detections.boxes.cls.cpu().detach().numpy()) == 0: + all_global_orient.append(None) + all_kpts.append(None) + all_masks.append(None) + else: + det = max(detections, key=lambda d: d.boxes.conf.cpu().detach().item()) + Bbox = det.boxes.data.cpu().detach().squeeze().numpy() + cls_flag = det.boxes.cls.cpu().detach().squeeze().item() + + self._sam_predictor.set_image(batch) + hand_masks, scores, logits = self._sam_predictor.predict( + point_coords=None, + point_labels=None, + box=np.array([[Bbox[0], Bbox[1], Bbox[2], Bbox[3]]]), + multimask_output=False + ) + hand_masks = hand_masks[0] + hand_masks = hand_masks.astype(bool) + dataset = ViTDetDataset(self._wilor_cfg, batch, np.array([Bbox[:4]]), np.array([cls_flag]), rescale_factor=rescale_factor) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0) + sign = False + for batch in dataloader: + assert sign == False + batch = recursive_to(batch, torch.device(self.device)) + with torch.no_grad(): + out = self._wilor_model(batch) + multiplier = 2 * batch['right'][0].cpu().numpy() - 1 + pred_cam = out['pred_cam'][0] + pred_cam[1] = multiplier * pred_cam[1] + box_center = batch['box_center'][0].float() + box_size = batch['box_size'][0].float() + img_size = batch['img_size'][0].float() + scaled_focal_length = self._wilor_cfg.EXTRA.FOCAL_LENGTH / self._wilor_cfg.MODEL.IMAGE_SIZE * img_size.max() + #import pdb; pdb.set_trace() + pred_cam_t_full = cam_crop_to_full(pred_cam.reshape(1, 3), box_center.reshape(1, 2), box_size.reshape(1, 1), img_size.reshape(1, 2), scaled_focal_length).detach().cpu().numpy() + batch_size = batch['img'].shape[0] + joints = out['pred_keypoints_3d'][0].detach().cpu().numpy() + joints[:, 0] = multiplier * joints[:, 0] + cam_t = pred_cam_t_full[0] + 
kpts_2d = self.project_full_img(joints, cam_t, float(scaled_focal_length), img_size) + all_kpts.append(kpts_2d) + all_global_orient.append(out['pred_mano_params']['global_orient'][0,0].detach().cpu().numpy()) + all_masks.append(hand_masks) + sign = True + return all_kpts, all_global_orient, all_masks + + + + + + def project_full_img(self, points, cam_trans, focal_length, img_res): + ''' we use simple K here. It works.''' + if not isinstance(points, torch.Tensor): + points = torch.tensor(points, dtype=torch.float32) + if not isinstance(cam_trans, torch.Tensor): + cam_trans = torch.tensor(cam_trans, dtype=torch.float32) + # Ensure numeric image resolution + try: + img_w = float(img_res[0]) + img_h = float(img_res[1]) + except Exception: + # Fallback for unexpected types + img_w, img_h = float(img_res[0].item()), float(img_res[1].item()) + K = torch.eye(3, dtype=torch.float32) + K[0, 0] = float(focal_length) + K[1, 1] = float(focal_length) + K[0, 2] = img_w / 2.0 + K[1, 2] = img_h / 2.0 + pts = points + cam_trans + pts = pts / pts[..., -1:] + V_2d = (K @ pts.T).T + return V_2d[..., :-1].detach().cpu().numpy() + + + + + + +def hand_extraction(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + model_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "wilor_final.ckpt" + cfg_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "model_config.yaml" + yolo_weights_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "detector.pt" + sam_cfg_path = "configs/sam2.1/sam2.1_hiera_l.yaml" + sam_weights_path = "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + + for key in keys: + + scene_dict = key_scene_dicts[key] + config = key_cfgs[key] + gpu_id = config['gpu'] + device = f"cuda:{gpu_id}" + wilor_extractor = WiLoRExtractor(model_path=model_path, cfg_path=cfg_path, yolo_weights_path=yolo_weights_path, sam_weights_path=sam_weights_path, sam_cfg_path=sam_cfg_path, device=device) + #images = np.transpose(scene_dict["images"], (0, 3, 1, 2)) + images = scene_dict["images"].astype(np.float32) + kpts, global_orient, masks = wilor_extractor.process(images) + if scene_dict.get("motion") is None: + scene_dict["motion"] = {} + scene_dict["motion"]["hand_kpts"] = kpts + scene_dict["motion"]["hand_global_orient"] = global_orient + scene_dict["motion"]["hand_masks"] = masks + key_scene_dicts[key] = scene_dict + with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + + + return key_scene_dicts + + + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + hand_extraction(keys, key_scene_dicts, key_cfgs) + + diff --git a/openreal2sim/motion/motion_manager.py b/openreal2sim/motion/motion_manager.py new file mode 100644 index 0000000..acc7dcf --- /dev/null +++ b/openreal2sim/motion/motion_manager.py @@ -0,0 +1,86 @@ +from pathlib import Path +import sys +import pickle +import yaml +import argparse + +sys.path.append(str(Path.cwd() / "openreal2sim" / "motion" / "modules")) +sys.path.append(str(Path.cwd() / "openreal2sim" / "motion" / "utils")) + +from 
utils.compose_config import compose_configs + +class MotionAgent: + def __init__(self, stage=None, key=None): + print('[Info] Initializing MotionAgent...') + self.base_dir = Path.cwd() + cfg_path = self.base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + self.keys = [key] if key is not None else cfg["keys"] + self.key_cfgs = {key: compose_configs(key, cfg) for key in self.keys} + self.key_scene_dicts = {} + for key in self.keys: + scene_pkl = self.base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + self.key_scene_dicts[key] = scene_dict + self.stages = [ + #"hand_extraction", + "demo_motion_process", + "grasp_point_extraction", + "grasp_generation", + ] + if stage is not None: + if stage in self.stages: + start_idx = self.stages.index(stage) + self.stages = self.stages[start_idx:] + else: + print(f"[Warning] Stage '{stage}' not found. It must be one of {self.stages}. Running all stages by default.") + print('[Info] MotionAgent initialized.') + + def save_scene_dicts(self): + for key, scene_dict in self.key_scene_dicts.items(): + with open(self.base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + print('[Info] Scene dictionaries saved.') + + def hand_extraction(self): + from modules.hand_extraction import hand_extraction + self.key_scene_dicts = hand_extraction(self.keys, self.key_scene_dicts, self.key_cfgs) + print("[Info] Hand extraction completed.") + + def demo_motion_process(self): + from modules.demo_motion_process import demo_motion_process + self.key_scene_dicts = demo_motion_process(self.keys, self.key_scene_dicts, self.key_cfgs) + print("[Info] Demo motion process completed.") + + def grasp_point_extraction(self): + from modules.grasp_point_extraction import grasp_point_extraction + self.key_scene_dicts = grasp_point_extraction(self.keys, self.key_scene_dicts, self.key_cfgs) + print("[Info] Grasp point extraction completed.") + + def grasp_generation(self): + from modules.grasp_generation import grasp_generation + grasp_generation(self.keys) + print("[Info] Grasp generation completed.") + + def run(self): + if "hand_extraction" in self.stages: + self.hand_extraction() + if "demo_motion_process" in self.stages: + self.demo_motion_process() + if "grasp_point_extraction" in self.stages: + self.grasp_point_extraction() + if "grasp_generation" in self.stages: + self.grasp_generation() + self.save_scene_dicts() + print('[Info] MotionAgent run completed.') + return self.key_scene_dicts + +if __name__ == '__main__': + args = argparse.ArgumentParser() + args.add_argument('--stage', type=str, default=None, help='Starting from a certain stage') + args.add_argument('--key', type=str, default=None, help='Process a single key instead of all keys from config') + args = args.parse_args() + + agent = MotionAgent(stage=args.stage, key=args.key) + scene_dicts = agent.run() diff --git a/openreal2sim/motion/utils/compose_config.py b/openreal2sim/motion/utils/compose_config.py new file mode 100755 index 0000000..6f30c1b --- /dev/null +++ b/openreal2sim/motion/utils/compose_config.py @@ -0,0 +1,23 @@ + +default_config = { + "gpu": "0", + "rescore_strategy": "point_only", + "n_points": 5000, + "keep": 10, + "nms": True, + "overwrite": False, + "vis_pts_per_gripper": 400, + "affordance_erosion_pixel": 5 +} + +def compose_configs(key_name: str, config: dict) -> dict: + ret_key_config = {} + local_config = config["local"].get(key_name, {}) + local_config = local_config.get("motion", {}) 
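+ # Per-parameter precedence below: per-key local["<key>"]["motion"] > global["motion"] > default_config,
+ # e.g. a global "keep" setting applies to every key unless that key's local motion block overrides it.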
+ global_config = config.get("global", {}) + global_config = global_config.get("motion", {}) + for param in default_config.keys(): + value = local_config.get(param, global_config.get(param, default_config[param])) + ret_key_config[param] = value + print(f"[Info] Config for {key_name}: {ret_key_config}") + return ret_key_config \ No newline at end of file diff --git a/openreal2sim/motion/utils/grasp_utils.py b/openreal2sim/motion/utils/grasp_utils.py new file mode 100755 index 0000000..0c0a170 --- /dev/null +++ b/openreal2sim/motion/utils/grasp_utils.py @@ -0,0 +1,391 @@ +import os +import sys +import numpy as np +import time +import torch +from typing import List +import open3d as o3d +from graspnetAPI.grasp import GraspGroup +from pathlib import Path +base_dir = Path.cwd() +from graspness_unofficial.models.graspnet import GraspNet, pred_decode +from graspness_unofficial.dataset.graspnet_dataset import minkowski_collate_fn +from graspness_unofficial.utils.collision_detector import ModelFreeCollisionDetector +from graspness_unofficial.utils.data_utils import CameraInfo, create_point_cloud_from_depth_image, get_workspace_mask +from scipy.spatial.transform import Rotation as R +import trimesh +grasp_cfgs = { + "save_files": False, + "checkpoint_path": base_dir / "third_party" / "graspness_unofficial" / "ckpt" / "minkuresunet_kinect.tar", #hardcode path + "seed_feat_dim": 512, + "camera": "kinect", + "num_point": 80000, + "batch_size": 1, + "voxel_size": 0.001, # 0.005 + "collision_thresh": 0.00001, + "voxel_size_cd": 0.01, # 0.01 + "infer": True, +} + +def my_worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + pass + +class MyGraspNet(): + def __init__(self): + cfgs = grasp_cfgs + self.cfgs = cfgs + self.net = GraspNet(seed_feat_dim=cfgs["seed_feat_dim"], is_training=False) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.net.to(self.device) + # Load checkpoint + checkpoint = torch.load(cfgs["checkpoint_path"]) + self.net.load_state_dict(checkpoint['model_state_dict']) + start_epoch = checkpoint['epoch'] + print("-> loaded checkpoint %s (epoch: %d)" % (cfgs["checkpoint_path"], start_epoch)) + + self.net.eval() + + def inference(self, pcs): + + data_dict = {'point_clouds': pcs.astype(np.float32), + 'coors': pcs.astype(np.float32) / self.cfgs["voxel_size"], + 'feats': np.ones_like(pcs).astype(np.float32)} + batch_data = minkowski_collate_fn([data_dict]) + tic = time.time() + for key in batch_data: + if 'list' in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to(self.device) + else: + batch_data[key] = batch_data[key].to(self.device) + + # Forward pass + with torch.no_grad(): + end_points = self.net(batch_data) + grasp_preds = pred_decode(end_points) + + preds = grasp_preds[0].detach().cpu().numpy() + gg = GraspGroup(preds) + + # collision detection + if self.cfgs["collision_thresh"] > 0: + cloud = data_dict['point_clouds'] + mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=self.cfgs["voxel_size_cd"]) + collision_mask = mfcdetector.detect(gg, approach_dist=0.05, collision_thresh=self.cfgs["collision_thresh"]) + gg = gg[~collision_mask] + + toc = time.time() + # print('inference time: %fs' % (toc - tic)) + return gg + +def inference(cfgs, data_input): + batch_data = minkowski_collate_fn([data_input]) + net = GraspNet(seed_feat_dim=cfgs["seed_feat_dim"], is_training=False) + device = torch.device("cuda:0" if 
torch.cuda.is_available() else "cpu") + net.to(device) + + # Load checkpoint + checkpoint = torch.load(cfgs["checkpoint_path"]) + net.load_state_dict(checkpoint['model_state_dict']) + start_epoch = checkpoint['epoch'] + print("-> loaded checkpoint %s (epoch: %d)" % (cfgs["checkpoint_path"], start_epoch)) + + net.eval() + tic = time.time() + + for key in batch_data: + if 'list' in key: + for i in range(len(batch_data[key])): + for j in range(len(batch_data[key][i])): + batch_data[key][i][j] = batch_data[key][i][j].to(device) + else: + batch_data[key] = batch_data[key].to(device) + + # Forward pass + with torch.no_grad(): + end_points = net(batch_data) + grasp_preds = pred_decode(end_points) + + preds = grasp_preds[0].detach().cpu().numpy() + gg = GraspGroup(preds) + # collision detection + if cfgs["collision_thresh"] > 0: + cloud = data_input['point_clouds'] + mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=cfgs["voxel_size_cd"]) + collision_mask = mfcdetector.detect(gg, approach_dist=0.05, collision_thresh=cfgs["collision_thresh"]) + gg = gg[~collision_mask] + + toc = time.time() + # print('inference time: %fs' % (toc - tic)) + return gg + +def pointcloud_to_grasp(cfgs, pcs): + data_dict = {'point_clouds': pcs.astype(np.float32), + 'coors': pcs.astype(np.float32) / cfgs["voxel_size"], + 'feats': np.ones_like(pcs).astype(np.float32)} + gg = inference(cfgs, data_dict) + return gg + +def vis_grasp(pcs, gg): + gg = gg.nms() + gg = gg.sort_by_score() + keep = 1 + if gg.__len__() > keep: + gg = gg[:keep] + grippers = gg.to_open3d_geometry_list() + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(pcs.astype(np.float32)) + o3d.visualization.draw_geometries([cloud, *grippers]) + +def grasp_to_pointcloud(grippers, gripper_points=1000, gripper_color=[1, 0, 0]): + grippers_pcd = o3d.geometry.PointCloud() + for gripper in grippers: + g_pcd = gripper.sample_points_uniformly(gripper_points) + grippers_pcd += g_pcd + grippers_pcd.colors = o3d.utility.Vector3dVector(np.tile(np.array([gripper_color]), (len(grippers_pcd.points), 1))) + return grippers_pcd + + +def vis_save_grasp(points, gg, best_grasp, save_path, colors=None, grasp_position=None, place_position=None): + # visualize grasp pos, place pos, grasp poses, and pcd + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points.astype(np.float32)) + + if colors is None: + cloud.colors = o3d.utility.Vector3dVector(np.tile(np.array([[0, 1, 0]]), (len(cloud.points), 1))) + elif isinstance(colors, np.ndarray): + cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float32)) + elif isinstance(colors, list): + cloud.colors = o3d.utility.Vector3dVector(np.tile(np.array([colors]), (len(cloud.points), 1))) + + if type(gg) == GraspGroup: + pcd_w_grasp = grasp_to_pointcloud(gg.to_open3d_geometry_list()) + else: + pcd_w_grasp = grasp_to_pointcloud([gg.to_open3d_geometry()]) + + pcd_best_grasp = grasp_to_pointcloud([best_grasp.to_open3d_geometry()], gripper_color=[0, 0, 1]) + pcd_w_grasp += pcd_best_grasp + + if grasp_position is not None and place_position is not None: + pick_pcd = o3d.geometry.PointCloud() + place_pcd = o3d.geometry.PointCloud() + pick_pcd.points = o3d.utility.Vector3dVector(np.array(grasp_position).reshape(1,3).astype(np.float32)) + place_pcd.points = o3d.utility.Vector3dVector(np.array(place_position).reshape(1,3).astype(np.float32)) + pick_pcd.colors = place_pcd.colors = o3d.utility.Vector3dVector(np.array([[0,0,1]]).astype(np.float32)) + pcd_w_grasp = pcd_w_grasp + 
pick_pcd + place_pcd + + pcd_w_grasp += cloud + + o3d.io.write_point_cloud(save_path, pcd_w_grasp) + +def get_pose_from_grasp(best_grasp): + grasp_position = best_grasp.translation + # convert rotation to isaacgym convention + delta_m = np.array([[0, 0, 1], [0, -1, 0], [1, 0, 0]]) + rotation = np.dot(best_grasp.rotation_matrix, delta_m) + quaternion_grasp = R.from_matrix(rotation).as_quat() + quaternion_grasp = np.array([quaternion_grasp[3],quaternion_grasp[0],quaternion_grasp[1],quaternion_grasp[2]]) + rotation_unit_vect = rotation[:,2] + # grasp_position -= 0.03 * rotation_unit_vect + return grasp_position, quaternion_grasp, rotation_unit_vect + +def get_best_grasp(gg, position, max_dis=0.05): + # get best grasp pose around the grasp position + best_grasp = None + for g in gg: + if np.linalg.norm(g.translation - position) < max_dis: + if best_grasp is None: + best_grasp = g + else: + if g.score > best_grasp.score: + best_grasp = g + if best_grasp is None: + best_grasp = gg[0] + return best_grasp + +def get_best_grasp_z_aligned(gg): + # get best grasp pose that is facing the -z axis (for top-down grasping) + best_grasp = None + best_angle = np.inf + + gravity_direction = np.array([0, 0, -1]) # -Z direction in world coordinates + + for g in gg: + + # approach vector is +X axis of the grasp frame, make it close to the -Z axis in the world frame + approach_vector = g.rotation_matrix[:, 0] + approach_vector /= np.linalg.norm(approach_vector) + + # compute the angle between the approach vector and the gravity direction + angle = np.arccos(np.clip(np.dot(approach_vector, gravity_direction), -1.0, 1.0)) + + if angle < best_angle: + best_angle = angle + best_grasp = g + elif np.isclose(angle, best_angle) and g.score > best_grasp.score: + best_grasp = g + + if best_grasp is None: + print("No best grasp found, falling back to the first grasp.") + best_grasp = gg[0] # fallback + + return best_grasp + +def get_best_grasp_z_aligned_and_y_aligned(gg, + desired_approach_dir=np.array([0, 0, -1]), + desired_width_dir=np.array([0, 1, 0]), + angle_tolerance=0.2): + """ + Pick the grasp whose: + 1) X-axis (approach) is closest to desired_approach_dir (default -Z), + 2) Y-axis (gripper width) is closest to desired_width_dir (default +Y), + 3) Score is highest if angles tie. 
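+ Angles within `angle_tolerance` radians are treated as ties and fall through to the next criterion.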
+ + Grasp coordinate system: + - rotation_matrix[:, 0] → X-axis = Approach direction + - rotation_matrix[:, 1] → Y-axis = Gripper width direction + - rotation_matrix[:, 2] → Z-axis = Depth/thickness direction + """ + + best_grasp = None + # Track best angles and score + best_approach_angle = np.inf + best_width_angle = np.inf + best_score = -np.inf + + # Normalize desired directions + desired_approach_dir = desired_approach_dir / np.linalg.norm(desired_approach_dir) + desired_width_dir = desired_width_dir / np.linalg.norm(desired_width_dir) + + for g in gg: + # 1) Approach vector angle + approach_vec = g.rotation_matrix[:, 0] + approach_vec /= np.linalg.norm(approach_vec) + approach_angle = angle_between(approach_vec, desired_approach_dir) + + # 2) Width vector angle + width_vec = g.rotation_matrix[:, 1] + width_vec /= np.linalg.norm(width_vec) + width_angle = angle_between(width_vec, desired_width_dir) + + # 3) Compare to the "best" so far in a hierarchical manner + if approach_angle < best_approach_angle: + # Definitely better in terms of approach alignment => choose this + best_approach_angle = approach_angle + best_width_angle = width_angle + best_score = g.score + best_grasp = g + elif np.isclose(approach_angle, best_approach_angle, atol=angle_tolerance): + # Approach angles are essentially tied, compare width alignment + if width_angle < best_width_angle: + best_width_angle = width_angle + best_score = g.score + best_grasp = g + elif np.isclose(width_angle, best_width_angle, atol=angle_tolerance): + # Both angles tied, pick the higher score + if g.score > best_score: + best_score = g.score + best_grasp = g + + if best_grasp is None and len(gg) > 0: + print("No valid grasp found using angle criteria. Falling back to the first grasp.") + best_grasp = gg[0] + + return best_grasp + + +def get_best_grasp_with_hints(gg: GraspGroup, point: List[float] = None, direction: List[float] = None): + """ + Rescore all grasps using optional spatial and directional hints, then return a new + GraspGroup sorted by this custom score (best first). Does NOT mutate the original gg. + + Scoring terms in [0, 1], combined by a weighted sum: + - dir_term: alignment between grasp approach (+X axis) and `direction` + - pt_term : proximity to `point` (RBF over distance) + - net_term: original network score normalized over gg + + Args: + gg: GraspGroup from graspnetAPI. + point: (3,) world point. If provided, grasps closer to this point are preferred. + direction: (3,) world direction. If provided, grasps whose approach (+X) aligns + with this direction are preferred. + + Returns: + GraspGroup: a *new* group sorted by the custom score (descending). + The best guess is result[0]. + """ + # --- Early exits --- + if gg is None or len(gg) == 0: + return gg + + # Internal weights (you can tweak if needed) + w_dir = 1.0 + w_pt = 1.0 + w_net = 0 + + # If hints are missing, zero-out the corresponding weights + if point is None: + w_pt = 0.0 + if direction is None or (np.asarray(direction).shape != (3,)): + w_dir = 0.0 + + # Length-scale for the point proximity (meters). Similar to your 0.05 window. 
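+ # pt_term = exp(-0.5 * d^2 / sigma^2): with sigma = 0.05 m, a grasp 5 cm from the hint point scores ~0.61 and one 10 cm away ~0.14.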
+ sigma = 0.05 + sigma2 = max(sigma * sigma, 1e-12) + + # --- Gather per-grasp attributes --- + translations = [] + approach_axes = [] # grasp frame +X + net_scores = [] + for g in gg: + translations.append(g.translation.astype(np.float64)) + # Normalize +X axis as approach direction + ax = g.rotation_matrix[:, 0].astype(np.float64) + n = np.linalg.norm(ax) + approach_axes.append(ax / n if n > 0 else np.array([1.0, 0.0, 0.0], dtype=np.float64)) + net_scores.append(float(g.score)) + + translations = np.vstack(translations) # (N,3) + approach_axes = np.vstack(approach_axes) # (N,3) + net_scores = np.asarray(net_scores, dtype=np.float64) # (N,) + + # --- Normalize original network scores to [0,1] --- + if np.isfinite(net_scores).all() and (net_scores.max() > net_scores.min()): + net_term = (net_scores - net_scores.min()) / (net_scores.max() - net_scores.min()) + else: + net_term = np.zeros_like(net_scores) + + # --- Direction alignment term (cosine mapped to [0,1]) --- + if w_dir > 0.0: + d = np.asarray(direction, dtype=np.float64) + nd = np.linalg.norm(d) + if nd > 0: + d = d / nd + cosv = np.clip((approach_axes * d[None, :]).sum(axis=1), -1.0, 1.0) + dir_term = 0.5 * (cosv + 1.0) # map [-1,1] -> [0,1] + else: + dir_term = np.zeros(len(gg), dtype=np.float64) + else: + dir_term = np.zeros(len(gg), dtype=np.float64) + + # --- Point proximity term (RBF over Euclidean distance) --- + if w_pt > 0.0: + p = np.asarray(point, dtype=np.float64).reshape(1, 3) + dists = np.linalg.norm(translations - p, axis=1) + pt_term = np.exp(-0.5 * (dists * dists) / sigma2) # in (0,1] + else: + pt_term = np.zeros(len(gg), dtype=np.float64) + + # --- Combine --- + total_score = w_dir * dir_term + w_pt * pt_term + w_net * net_term + gg.scores = total_score + gg.sort_by_score() # Sort in-place by the new score (best first) + return gg + + +def angle_between(v1, v2): + """Utility to compute the angle between two normalized vectors in [0, π].""" + dot_val = np.clip(np.dot(v1, v2), -1.0, 1.0) + return np.arccos(dot_val) diff --git a/openreal2sim/motion/utils/notification.py b/openreal2sim/motion/utils/notification.py new file mode 100755 index 0000000..a36c546 --- /dev/null +++ b/openreal2sim/motion/utils/notification.py @@ -0,0 +1,42 @@ +import requests + +ntfy_url = "" + +def notify_success(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction completed", + "Tags": "white_check_mark" + } + ) + except: + pass + +def notify_started(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction started", + "Tags": "watch" + } + ) + except: + pass + +def notify_failed(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction failed", + "Tags": "x" + } + ) + except: + pass \ No newline at end of file diff --git a/openreal2sim/motion/utils/visualize_scene.py b/openreal2sim/motion/utils/visualize_scene.py new file mode 100644 index 0000000..7d6cdd0 --- /dev/null +++ b/openreal2sim/motion/utils/visualize_scene.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Aggregate (all frames) export for object trajectories in the world frame. 
+ +Outputs: +- One PLY containing: + * Background point cloud (downsampled once, light gray) + * For each frame (with stride) and each object: + - Object point cloud transformed by Δ_w[frame] + - Object center point at that frame (brighter color) + Optionally encodes time in alpha channel (RGBA): early=dark, late=bright. + +- (Optional) One GLB containing all frames at once (can be large): + * Background mesh once + * For each frame (with stride), one duplicated and transformed mesh per object. +""" + +import json +from pathlib import Path +import argparse +import numpy as np +import trimesh +import yaml + + +def _create_arrow_mesh(origin: np.ndarray, direction: np.ndarray, length: float, radius: float = 0.01) -> trimesh.Trimesh: + """Construct an arrow mesh even if `trimesh.creation.arrow` is missing.""" + direction = np.asarray(direction, dtype=np.float64) + norm = np.linalg.norm(direction) + if norm < 1e-9: + raise ValueError("Grasp direction must be non-zero.") + direction = direction / norm + shaft_length = length * 0.75 + tip_length = length - shaft_length + + shaft = trimesh.creation.cylinder(radius=radius, height=shaft_length, sections=18) + shaft.apply_translation([0.0, 0.0, shaft_length / 2.0]) + + tip = trimesh.creation.cone(radius=radius * 1.6, height=tip_length, sections=18) + tip.apply_translation([0.0, 0.0, shaft_length + tip_length]) + + arrow = trimesh.util.concatenate([shaft, tip]) + + align_tf = trimesh.geometry.align_vectors(np.array([0.0, 0.0, 1.0]), direction) + arrow.apply_transform(align_tf) + arrow.apply_translation(origin) + return arrow + + +# ---------------- utils ---------------- +def load_scene(scene_json: Path): + scene = json.loads(Path(scene_json).read_text()) + bg_path = Path(scene["background"]["registered"]) + objs = scene["objects"] + return scene, bg_path, objs + +def sample_surface_points(mesh: trimesh.Trimesh, n_pts: int) -> np.ndarray: + """ + Return (N,3) uniformly sampled surface points. + Prefer sample_surface_even; fallback to sample_surface. + """ + if n_pts <= 0: + return mesh.vertices.view(np.ndarray).copy() + try: + from trimesh.sample import sample_surface_even + pts, _ = sample_surface_even(mesh, n_pts) + except Exception: + from trimesh.sample import sample_surface + pts, _ = sample_surface(mesh, n_pts) + return pts.astype(np.float64, copy=False) + +def voxel_downsample(points: np.ndarray, voxel: float) -> np.ndarray: + """ + Simple voxel downsampling by snapping to a voxel grid and + keeping the first sample per voxel. 
+ """ + if voxel is None or voxel <= 0 or len(points) == 0: + return points + grid = np.floor(points / float(voxel)).astype(np.int64) + _, keep_idx = np.unique(grid, axis=0, return_index=True) + keep_idx.sort() + return points[keep_idx] + +def transform_points(D: np.ndarray, pts: np.ndarray) -> np.ndarray: + """Row-vector transform: p' = p * R^T + t""" + R = D[:3, :3] + t = D[:3, 3] + return pts @ R.T + t + +def transform_point(D: np.ndarray, p: np.ndarray) -> np.ndarray: + R = D[:3, :3] + t = D[:3, 3] + return p @ R.T + t + +def color_palette(k: int) -> np.ndarray: + base = np.array([ + [230, 57, 70], + [29, 161, 242], + [39, 174, 96], + [155, 89, 182], + [241, 196, 15], + [230, 126, 34], + [52, 152, 219], + [46, 204, 113], + [243, 156, 18], + [231, 76, 60], + ], dtype=np.uint8) + if k <= base.shape[0]: + return base[:k] + reps = int(np.ceil(k / base.shape[0])) + pal = np.vstack([base for _ in range(reps)])[:k] + return pal + +def save_scene_glb_all(bg_mesh, per_frame_meshes, out_path: Path): + """ + Save a single GLB with background once and all per-frame object mesh copies. + per_frame_meshes: List[(node_name, trimesh.Trimesh)] for all frames. + """ + sc = trimesh.Scene() + sc.add_geometry(bg_mesh, node_name="background") + for name, m in per_frame_meshes: + sc.add_geometry(m, node_name=name) + out_path.parent.mkdir(parents=True, exist_ok=True) + sc.export(str(out_path), file_type="glb") + +# ---------------- core: single-key visualization (callable) ---------------- +def visualize_single_scene( + key: str, + out_dir: Path | None = None, # None -> outputs//debug + *, + bg_samples: int = 60000, # default background sample count + obj_samples: int = 12000, # default object sample count + bg_voxel: float = 0.01, # meters + obj_voxel: float = 0.005, # meters + stride: int = 1, + save_glb_all: bool = False, + encode_time_in_alpha: bool = False, + mesh_key: str = "optimized", + traj_key: str = "final_trajs", +) -> dict: + """ + Render a single combined PLY (and optional GLB) for one key. 
+ + Args: + key: scene key under outputs//simulation/scene.json + out_dir: if None -> outputs//debug ; otherwise results write to // + bg_samples, obj_samples: sample counts + bg_voxel, obj_voxel: voxel sizes in meters + stride: frame stride + save_glb_all: also export a GLB with all frames (heavy) + encode_time_in_alpha: encode time as alpha in PLY colors + mesh_key: which object mesh field to use (default "fdpose"; also "refined") + traj_key: which trajectory field to use (default "trajs"; e.g., "trajs_hybrid") + + Returns: + dict: {"ply": , "glb": , "frames": int, "points": int} + """ + base = Path.cwd() + scene_json = base / "outputs" / key / "simulation" / "scene.json" + out_dir = base / "outputs" / key / "motion" / "debug" + out_dir.mkdir(parents=True, exist_ok=True) + + scene, bg_path, objs = load_scene(scene_json) + assert bg_path.exists(), f"[{key}] background_aligned.glb not found: {bg_path}" + if len(objs.keys()) == 0: + raise RuntimeError(f"[{key}] scene.objects is empty") + + print(f"[{key}] load background: {bg_path}") + bg_mesh = trimesh.load(str(bg_path), force='mesh') + + # --- background point cloud (once) --- + bg_pts = sample_surface_points(bg_mesh, bg_samples) + bg_pts = voxel_downsample(bg_pts, bg_voxel) + bg_color = np.array([180, 180, 180], dtype=np.uint8) + + # --- preload all objects --- + palette = color_palette(len(objs)) + seq_lengths = [] + obj_infos = [] + for oid, obj in objs.items(): + oid = obj["oid"] + oname = obj["name"] + name = f"{oid}_{oname}" + + # 1) mesh field (e.g., fdpose or refined) + mesh_path = Path(obj.get(mesh_key, "")) + assert mesh_path.exists(), f"[{key}] object mesh '{mesh_key}' not found: {mesh_path}" + + # 2) trajectory field (e.g., trajs or trajs_hybrid) + trajs_path = Path(obj.get(traj_key, "")) + if not trajs_path.exists(): + traj_key = "fdpose_trajs" + trajs_path = Path(obj.get(traj_key, "")) + rel = np.load(str(trajs_path)) + N = rel.shape[0] + seq_lengths.append(N) + rel = np.stack([np.eye(4) for _ in range(N)]) + else: + rel = np.load(str(trajs_path)) # [N,4,4] + assert rel.ndim == 3 and rel.shape[1:] == (4,4), f"[{key}] trajs shape invalid: {rel.shape}" + N = rel.shape[0] + seq_lengths.append(N) + + + print(f"[{key}] load object {name}: mesh={mesh_path.name} ({mesh_key}), trajs={trajs_path.name} ({traj_key})") + base_mesh = trimesh.load(str(mesh_path), force='mesh') # world-space at frame 0 + + + obj_pts0 = sample_surface_points(base_mesh, obj_samples) + obj_pts0 = voxel_downsample(obj_pts0, obj_voxel) + center0 = base_mesh.centroid.view(np.ndarray) + + obj_infos.append({ + "name": name, + "base_mesh": base_mesh, + "rel": rel.astype(np.float64), + "pts0": obj_pts0.astype(np.float64), + "center0": center0.astype(np.float64), + "color": palette[0].astype(np.uint8) + }) + print(f"[{key}] object {name}: traj_len={N}, pts0={len(obj_pts0)}") + + # unify sequence length by the shortest and apply stride + N_all = min(seq_lengths) + idxs = np.arange(0, N_all, max(1, int(stride))) + n_frames = len(idxs) + print(f"[{key}] aggregating {n_frames} frames into ONE PLY (and optional ONE GLB)") + + # --- aggregate PLY --- + pts_all_list = [] + cols_all_list = [] + + # add background first + pts_all_list.append(bg_pts) + if encode_time_in_alpha: + alpha_bg = np.full((len(bg_pts), 1), 255, dtype=np.uint8) # background alpha = 255 + cols_all_list.append(np.hstack([np.repeat(bg_color[None, :], len(bg_pts), axis=0), alpha_bg])) + else: + cols_all_list.append(np.repeat(bg_color[None, :], len(bg_pts), axis=0)) + + # optional GLB: collect per-frame 
object mesh copies + all_mesh_copies = [] # (name, mesh) + + for k, i in enumerate(idxs): + # time alpha from 64 -> 255 + if encode_time_in_alpha: + alpha_val = int(np.clip(64 + (191 * (k / max(1, n_frames - 1))), 64, 255)) + for info in obj_infos: + Di = info["rel"][i] + # object cloud + pts_i = transform_points(Di, info["pts0"]) + pts_all_list.append(pts_i) + + if encode_time_in_alpha: + alpha = np.full((len(pts_i), 1), alpha_val, dtype=np.uint8) + cols = np.hstack([np.repeat(info["color"][None, :], len(pts_i), axis=0), alpha]) + else: + cols = np.repeat(info["color"][None, :], len(pts_i), axis=0) + cols_all_list.append(cols) + + # object center (brighter) + c_i = transform_point(Di, info["center0"]) + pts_all_list.append(c_i.reshape(1, 3)) + ctr_col = np.clip(info["color"].astype(int) + 50, 0, 255).astype(np.uint8) + if encode_time_in_alpha: + cols_all_list.append(np.array([[ctr_col[0], ctr_col[1], ctr_col[2], alpha_val]], dtype=np.uint8)) + else: + cols_all_list.append(ctr_col.reshape(1, 3).astype(np.uint8)) + + # GLB accumulation (optional) + if save_glb_all: + m = info["base_mesh"].copy() + m.apply_transform(Di) + all_mesh_copies.append((f"{info['name']}_f{i:06d}", m)) + + + # 处理抓取点(grasp_point),生成球体mesh,并按红色加入到 ply 和 glb 中 + manipulated_oid = scene["manipulated_oid"] + grasp_point = np.array(scene["objects"][manipulated_oid]["grasp_point"]) + grasp_point_radius = 0.01 # 球体半径可调 + + # 创建抓取点球体mesh,并移动到抓取点位置 + grasp_point_mesh = trimesh.creation.icosphere(subdivisions=3, radius=grasp_point_radius) + grasp_point_mesh.apply_translation(grasp_point) + + + grasp_color = np.array([0, 0, 255], dtype=np.uint8) # 蓝色 + grasp_pts = grasp_point_mesh.vertices + grasp_cols = np.tile(grasp_color, (grasp_pts.shape[0], 1)) + if encode_time_in_alpha: + grasp_cols = np.hstack([grasp_cols, np.full((grasp_pts.shape[0], 1), 255, dtype=np.uint8)]) + pts_all_list.append(grasp_pts) + cols_all_list.append(grasp_cols) + + grasp_direction = np.array(scene["objects"][manipulated_oid]["grasp_direction"], dtype=np.float32) + if grasp_direction.shape == (3, 3): + grasp_direction = grasp_direction[:, 0] + grasp_direction = grasp_direction.reshape(-1) + if grasp_direction.shape[0] != 3 or np.linalg.norm(grasp_direction) < 1e-6: + print(f"[WARN][{key}] Invalid grasp_direction recorded; defaulting to +Z.") + grasp_direction = np.array([0.0, 0.0, 1.0], dtype=np.float32) + else: + grasp_direction = grasp_direction / np.linalg.norm(grasp_direction) + grasp_direction_length = 0.1 # 箭头长度 + grasp_direction_mesh = _create_arrow_mesh(grasp_point, grasp_direction, grasp_direction_length, radius=0.01) + grasp_direction_pts = grasp_direction_mesh.vertices + grasp_direction_cols = np.tile(grasp_color, (grasp_direction_pts.shape[0], 1)) + if encode_time_in_alpha: + grasp_direction_cols = np.hstack([grasp_direction_cols, np.full((grasp_direction_pts.shape[0], 1), 255, dtype=np.uint8)]) + pts_all_list.append(grasp_direction_pts) + cols_all_list.append(grasp_direction_cols) + + # GLB文件也加入mesh + if save_glb_all: + all_mesh_copies.append(("grasp_point", grasp_point_mesh)) + + # 重新聚合所有点和颜色再写PLY + pts_all = np.vstack(pts_all_list) + cols_all = np.vstack(cols_all_list) + + ply_path = out_dir / f"object_{traj_key}.ply" + point_cloud = trimesh.points.PointCloud(vertices=pts_all, colors=cols_all) + point_cloud.export(str(ply_path)) + print(f"[{key}][PLY] aggregated: {ply_path} (pts={pts_all.shape[0]:,})") + + + # write single GLB (optional) + glb_path = None + if save_glb_all: + glb_path = out_dir / "scene_all_frames.glb" + 
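+ # The aggregated GLB stores one transformed copy of every object mesh per sampled frame, so file size grows with n_frames x n_objects.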
save_scene_glb_all(bg_mesh, all_mesh_copies, glb_path) + print(f"[{key}][GLB] aggregated: {glb_path} (meshes={len(all_mesh_copies) + 1:,} incl. background)") + + return {"ply": ply_path, "glb": glb_path, "frames": int(n_frames), "points": int(len(pts_all))} + +def visualize_scene(keys): + for key in keys: + visualize_single_scene(key) + print(f"[done] key={key}") + print('[Info] Scene visualization completed.') + +# ---------------- main (batch over config keys) ---------------- +def parse_args(): + ap = argparse.ArgumentParser() + ap.add_argument("--stride", type=int, default=5, help="frame stride (1 = all frames)") + ap.add_argument("--save_glb_all", action="store_true", help="export a single GLB containing all frames") + ap.add_argument("--encode_time_in_alpha", action="store_true", help="encode time as alpha in PLY colors") + ap.add_argument("--mesh_key", type=str, default="optimized", help="object mesh field name (e.g., fdpose or optimized)") + ap.add_argument("--traj_key", type=str, default="hybrid_trajs", help="trajectory field name (e.g., fdpose_trajs/simple_trajs/hybrid_trajs)") + return ap.parse_args() + +def main(): + args = parse_args() + base = Path.cwd() + cfg = yaml.safe_load((base / "config" / "config.yaml").open("r")) + keys = cfg["keys"] + + for key in keys: + print(f"========== [visualize object trajectories] key: {key} ==========\n") + visualize_single_scene( + key=key, + out_dir=None, # default -> outputs//debug + bg_samples=60000, + obj_samples=12000, + bg_voxel=0.01, + obj_voxel=0.005, + stride=args.stride, + save_glb_all=args.save_glb_all, + encode_time_in_alpha=args.encode_time_in_alpha, + mesh_key=args.mesh_key, + traj_key=args.traj_key, + ) + print(f"[done] key={key}") + +if __name__ == "__main__": + main() diff --git a/openreal2sim/reconstruction/modules/background_mesh_generation.py b/openreal2sim/reconstruction/modules/background_mesh_generation.py index bcb014d..2e0d85a 100755 --- a/openreal2sim/reconstruction/modules/background_mesh_generation.py +++ b/openreal2sim/reconstruction/modules/background_mesh_generation.py @@ -16,8 +16,7 @@ "background": { "original": # original background mesh path, }, - "groundplane_in_cam": { - "point": # a point on the ground plane [x,y,z], +b "point": # a point on the ground plane [x,y,z], "normal": # the normal of the ground plane [x,y,z], } } @@ -32,6 +31,18 @@ from PIL import Image # ──────────────────── util ───────────────────── +def depth_to_pointcloud(depth, intrinsic): + H, W = depth.shape + u, v = np.meshgrid(np.arange(W), np.arange(H)) + u = u.reshape(-1) + v = v.reshape(-1) + depth = depth.reshape(-1) + pointcloud = np.zeros((len(depth), 3)) + pointcloud[:, 0] = (u - intrinsic[0, 2]) * depth / intrinsic[0, 0] + pointcloud[:, 1] = (v - intrinsic[1, 2]) * depth / intrinsic[1, 1] + pointcloud[:, 2] = depth + return pointcloud + def get_boundary_edges(mesh: trimesh.Trimesh): edges = np.vstack([ mesh.faces[:, [0, 1]], @@ -164,10 +175,12 @@ def visualize_mesh_thickness_with_pointmap(scene_dir: Path, o3d.io.write_point_cloud(str(out_file), pcd) print(f"✓ thickness debug cloud saved: {out_file}") + # ──────────────────── core ───────────────────── def background_mesh_generation(keys, key_scene_dicts, key_cfgs): base_dir = Path.cwd() for key in keys: + print(f"[Info] Processing {key}...\n") scene_dict = key_scene_dicts[key] cfg = key_cfgs[key] recon_dir = base_dir / f'outputs/{key}/reconstruction' @@ -190,7 +203,7 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): pcd = o3d.geometry.PointCloud() pcd.points 
= o3d.utility.Vector3dVector(plane_pts) plane_model, _ = pcd.segment_plane(distance_threshold=0.01, - ransac_n=3, num_iterations=1000) + ransac_n=10, num_iterations=2000) a, b, c, d = plane_model normal = np.array([a, b, c], np.float64) normal /= (np.linalg.norm(normal) + 1e-12) @@ -205,11 +218,12 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): p0 = plane_pts.mean(axis=0) signed = np.median((obj_pts - p0) @ normal) if signed < 0: + print(f"[Info] Flipping normal direction for {key} because the median signed distance is negative") normal = -normal save_plane_normal_path = recon_dir / "ground_normal.ply" visualize_plane_normal(plane_pts, normal, save_plane_normal_path, - num_arrow_pts=200, normal_length=0.5) + num_arrow_pts=200, normal_length=0.5) # background mesh thicken direction is opposite to ground normal (downward) direction = -normal @@ -227,7 +241,7 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): orig_F = 2 * (H - 1) * (W - 1) new_F = 2 * (H2 - 1) * (W2 - 1) print(f"[i] simplified grid: ({H},{W}) -> ({H2},{W2}) " - f"faces ~ {orig_F} -> {new_F}") + f"faces ~ {orig_F} -> {new_F}") # construct mesh verts = pmap_s[..., :3].reshape(-1, 3) @@ -236,18 +250,28 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): base = i * W2 for j in range(W2 - 1): faces += [[base + j, base + j + 1, base + j + W2], - [base + j + 1, base + j + W2 + 1, base + j + W2]] + [base + j + 1, base + j + W2 + 1, base + j + W2]] faces = np.asarray(faces, dtype=np.int32) uu, vv = np.meshgrid(np.linspace(0, 1, W2, dtype=np.float32), - np.linspace(1, 0, H2, dtype=np.float32)) + np.linspace(1, 0, H2, dtype=np.float32)) uv = np.stack([uu, vv], -1).reshape(-1, 2) mesh = trimesh.Trimesh(vertices=verts, faces=faces, process=False) + + face_normals = mesh.face_normals + if len(face_normals) > 0: + dot_products = np.dot(face_normals, normal) + avg_dot = np.mean(dot_products) + if avg_dot < 0: + print("[Info] Flipping face orientation to align with ground normal") + mesh.faces = np.flip(mesh.faces, axis=1) + mesh.visual = trimesh.visual.TextureVisuals(uv=uv, image=img) # flipping mesh normals for correct texturing - mesh.faces = mesh.faces[:, ::-1] + if signed > 0: + mesh.faces = mesh.faces[:, ::-1] thickness = cfg["bg_mesh_thickness"] mesh_thick = add_thickness_below_mesh_preserve_texture(mesh, thickness, direction) @@ -272,6 +296,7 @@ def background_mesh_generation(keys, key_scene_dicts, key_cfgs): pickle.dump(scene_dict, f) print(f"[Info] [{key}] scene_dict updated.") + return key_scene_dicts diff --git a/openreal2sim/reconstruction/modules/background_pixel_inpainting.py b/openreal2sim/reconstruction/modules/background_pixel_inpainting.py index d2601b8..4798a8b 100755 --- a/openreal2sim/reconstruction/modules/background_pixel_inpainting.py +++ b/openreal2sim/reconstruction/modules/background_pixel_inpainting.py @@ -30,10 +30,23 @@ base_dir = Path.cwd() repo_dir = str(base_dir / 'third_party/ObjectClear') sys.path.append(repo_dir) - from objectclear.pipelines import ObjectClearPipeline from objectclear.utils import resize_by_short_side +# ---------------- add sam to path ---------------- +sam_dir = base_dir / "third_party/Grounded-SAM-2" +sys.path.append(str(sam_dir)) + + + +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + +def dilate_mask(mask, kernel_size=3): + return cv2.dilate(mask, np.ones((kernel_size, kernel_size), np.uint8), iterations=1) + 
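+ # dilate_mask() grows the removal mask with a square kernel so ObjectClear also repaints a margin around each object (shadows, boundary pixels); kernel_size=10 is applied below before resizing to the 512-px short side.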
+ def background_pixel_inpainting(keys, key_scene_dicts, key_cfgs): # hyperparameters @@ -120,6 +133,8 @@ def background_pixel_inpainting(keys, key_scene_dicts, key_cfgs): # Model was trained on 512 short side image = resize_by_short_side(image, 512, resample=Image.BICUBIC) + mask = dilate_mask(np.array(mask), kernel_size=10) + mask = Image.fromarray(mask) mask = resize_by_short_side(mask, 512, resample=Image.NEAREST) w, h = image.size @@ -143,14 +158,51 @@ def background_pixel_inpainting(keys, key_scene_dicts, key_cfgs): # Save as background.jpg in the scene folder save_path = os.path.join(result_root, 'background.jpg') fused_img_pil.save(save_path) - - # Save in scene_dict + + # ---------- run Plane Segmentation ---------- + TEXT = "ground. plane. table top. desk top" + CFG = "configs/sam2.1/sam2.1_hiera_l.yaml" + CKPT = base_dir / "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + img_predictor = SAM2ImagePredictor(build_sam2(CFG, CKPT)) + dino_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base") + dino_model = AutoModelForZeroShotObjectDetection.from_pretrained( + "IDEA-Research/grounding-dino-base").to(device) + + batch = dino_proc(images=[fused_img_pil], text = TEXT, return_tensors="pt") + batch={k:v.to(device) for k,v in batch.items()} + with torch.no_grad(): + out=dino_model(pixel_values=batch["pixel_values"], + input_ids=batch["input_ids"], + attention_mask=batch.get("attention_mask")) + boxes = dino_proc.post_process_grounded_object_detection(out,batch["input_ids"], + .25,.3,target_sizes=[np.asarray(fused_img_pil).shape[:2]])[0]["boxes"].cpu().numpy() + plane_masks = [] + for box in boxes: + img_predictor.set_image(fused_img_pil) + m,*_=img_predictor.predict(box=box,multimask_output=False) + m = m[0] >.5 + if np.sum(m) > 0: + plane_masks.append(m) + if plane_masks: + mask_shape = plane_masks[0].shape + plane_mask_img = np.zeros(mask_shape, dtype=np.uint8) + for idx, plane_mask in enumerate(plane_masks): + plane_mask_img[plane_mask > 0] = idx + 1 + max_idx = len(plane_masks) + plane_mask_img_vis = cv2.applyColorMap( + (plane_mask_img * (255 // max(1,max_idx))).astype(np.uint8), cv2.COLORMAP_JET + ) + plane_mask_img_vis[plane_mask_img == 0] = [0,0,0] + out_path = os.path.join(result_root, "plane_mask.jpg") + cv2.imwrite(out_path, plane_mask_img_vis) + if "recon" not in scene_dict: scene_dict["recon"] = {} scene_dict["recon"]["background"] = np.ascontiguousarray(np.array(fused_img_pil, dtype=np.uint8)) scene_dict["recon"]["foreground"] = np.ascontiguousarray(np.array(image_orig, dtype=np.uint8)) scene_dict["recon"]["ground_mask"] = ground_accum if ground_accum is not None else None # H x W, bool scene_dict["recon"]["object_mask"] = mask_accum if mask_accum is not None else None # H x W, bool + scene_dict["recon"]["plane_mask"] = plane_mask_img key_scene_dicts[key] = scene_dict with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: pickle.dump(scene_dict, f) diff --git a/openreal2sim/reconstruction/modules/background_point_inpainting.py b/openreal2sim/reconstruction/modules/background_point_inpainting.py index 70476df..0ed7c2d 100755 --- a/openreal2sim/reconstruction/modules/background_point_inpainting.py +++ b/openreal2sim/reconstruction/modules/background_point_inpainting.py @@ -43,6 +43,21 @@ def dilate_mask(binary_mask: np.ndarray, pixels: int = 4, shape: str = "ellipse" out = cv2.dilate(binary_mask.astype(np.uint8), k, iterations=1) return out.astype(bool) +def fill_mask(mask: np.ndarray) -> np.ndarray: + """ + Fill the mask with 
the flood fill algorithm. + """ + mask = mask.astype(np.uint8) + filled = mask.copy() + inv_mask = 1 - mask + h, w = mask.shape + floodfilled = inv_mask.copy() + floodfilled_pad = np.pad(floodfilled, 1, mode='constant', constant_values=0) + cv2.floodFill(floodfilled_pad, None, (0,0), 255) + floodfilled_nohole = floodfilled_pad[1:-1,1:-1] + out_mask = ((floodfilled_nohole == 0) | (mask == 1)).astype(bool) + return out_mask + # ─────────────────────────── Depth prediction ────────────────────────── def run_moge_depth(img_rgb: np.ndarray, device: torch.device) -> np.ndarray: """MoGe-2 inference (unchanged). Returns float32 depth (H,W).""" @@ -100,23 +115,35 @@ def robust_scale_shift_align( return float(a), float(b) def depth_to_points(depth: np.ndarray, K: np.ndarray) -> np.ndarray: + """ + Convert a depth map to 3D points and exclude pixels with zero or non-finite depth. + Returns an (N,3) array containing only valid points (z>0 and finite). + """ H, W = depth.shape - fx, fy, cx, cy = K[0,0], K[1,1], K[0,2], K[1,2] + fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2] ii, jj = np.meshgrid(np.arange(W), np.arange(H), indexing="xy") - z = depth.reshape(-1) + z = depth.reshape(-1).astype(np.float32) x = (ii.reshape(-1) - cx) * z / fx y = (jj.reshape(-1) - cy) * z / fy - return np.stack([x, y, z], 1) + pts = np.stack([x, y, z], axis=1) + valid = (z > 0) & (z < np.inf) + pts[~valid] = np.nan + return pts + + def plane_fill(depth0, K, ground, obj): H,W=depth0.shape; fx,fy,cx,cy=K[0,0],K[1,1],K[0,2],K[1,2] - # fit plane on ground region pts_ground = depth_to_points(depth0, K)[ground.ravel()] - pc = o3d.geometry.PointCloud(); pc.points = o3d.utility.Vector3dVector(pts_ground) - plane,_ = pc.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000) + + + pc = o3d.geometry.PointCloud() + pc.points = o3d.utility.Vector3dVector(pts_ground) + + np.random.seed(42) + plane , _= pc.segment_plane(distance_threshold=0.02, ransac_n=3, num_iterations=1000) a,b,c,d = plane - # intersect rays with plane inside the object mask ys,xs = np.where(obj) dx = (xs - cx) / fx; dy = (ys - cy) / fy; dz = np.ones_like(dx) @@ -128,9 +155,60 @@ def plane_fill(depth0, K, ground, obj): z_new = t[valid] * dz[valid] depth0_filled = depth0.copy() - depth0_filled[ys[valid], xs[valid]] = np.clip(z_new, 0, depth0.max()) + + depth0_filled[ys[valid], xs[valid]] = np.clip(z_new, 0, 5 * depth0.max()) return depth0_filled +def hybrid_fill(depth0: np.ndarray, fg_img: np.ndarray, bg_img: np.ndarray, K: np.ndarray, + obj_msk: np.ndarray, device: torch.device, + plane_mask: np.ndarray = None) -> np.ndarray: + """ + Generate depth using hybrid approach: + - Ground regions use plane fill + - Other regions use MoGe depth prediction + + Args: + depth0: Original depth map + bg_img: Inpainted background image + K: Camera intrinsic matrix + obj_msk: Object mask + device: PyTorch device + existing_ground_mask: Existing ground mask to sample points from (H, W) + + Returns: + depth_bg: Generated background depth + """ + print(f"[Info] Starting hybrid depth generation...") + depth_bg = depth0.copy() + kernel = np.ones((25, 25), np.uint8) # prevent boundaries problem. 
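+ # Dilate the object mask by ~25 px so the filled region extends slightly past the original object silhouette and no stale foreground depth survives at the mask edge.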
+ obj_msk = cv2.dilate(obj_msk.astype(np.uint8), kernel, iterations=1).astype(bool) + left_obj_msk = obj_msk.copy() + plane_masks = [] + for plane_idx in range(1, plane_mask.max()): + plane_mask = (plane_mask == plane_idx).astype(bool) + plane_masks.append(plane_mask) + for plane_mask in plane_masks: + print(f"[Info] plane mask: {plane_mask.sum()}, plane mask and object mask: {(plane_mask & left_obj_msk).sum()}") + if (plane_mask & left_obj_msk).sum() > 0: + #depth_filled = plane_fill(depth0, K, plane_mask, left_obj_msk & plane_mask) + depth_filled = plane_fill(depth0, K, plane_mask, left_obj_msk & plane_mask) + depth_bg[plane_mask & left_obj_msk] = depth_filled[plane_mask & left_obj_msk] + left_obj_msk = left_obj_msk & ~plane_mask + depth_moge = run_moge_depth(bg_img, device) + + if left_obj_msk.sum() > 0: + a, b = robust_scale_shift_align( + pred_depth=depth_moge, + ref_depth=depth0, + mask= ~obj_msk.astype(np.uint8), + iters=5, + huber_delta=0.02 + ) + depth_align = (a * depth_moge + b).astype(np.float32) + depth_bg[left_obj_msk] = depth_align[left_obj_msk] + + return depth_bg + def export_cloud(pts: np.ndarray, colors: np.ndarray, out_path: Path): pts = pts.reshape(-1,3) @@ -140,6 +218,10 @@ def export_cloud(pts: np.ndarray, colors: np.ndarray, out_path: Path): pcd.colors = o3d.utility.Vector3dVector(colors) o3d.io.write_point_cloud(str(out_path), pcd) +def clear_nan(pts: np.ndarray, colors: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + valid_mask = np.isfinite(pts).all(axis=1) + return pts[valid_mask], colors[valid_mask] + def background_point_inpainting(keys, key_scene_dicts, key_cfgs): for key in tqdm.tqdm(keys): @@ -158,27 +240,28 @@ def background_point_inpainting(keys, key_scene_dicts, key_cfgs): if "ground_mask" not in scene_dict["recon"] or "object_mask" not in scene_dict["recon"]: print(f"[Warning] [{key}] 'recon' key missing 'ground_mask' or 'object_mask'; run background_pixel_inpainting first.") continue - - ground_mask = scene_dict["recon"]["ground_mask"] # H x W, bool + ground_mask = scene_dict["recon"]["ground_mask"] + plane_mask = scene_dict["recon"]["plane_mask"] if scene_dict["recon"]["plane_mask"] is not None else None object_mask = scene_dict["recon"]["object_mask"] # H x W, bool - # dilate the masks a bit more to remove boundary outliers - object_mask = dilate_mask(object_mask, pixels=key_cfg["obj_dilate_pixels"], shape="ellipse") + # dilate the masks a bit more to remove boundary outliers + object_mask = fill_mask(object_mask) ground_mask = dilate_mask(ground_mask, pixels=key_cfg["ground_dilate_pixels"], shape="ellipse") fg_img = scene_dict["recon"]["foreground"] # H x W x 3, uint8 bg_img = scene_dict["recon"]["background"] # H x W x 3, uint8 depth0 = scene_dict["depths"][0] # H x W, float32 K = scene_dict["intrinsics"].astype(np.float32) # 3 x 3, float32 H, W = depth0.shape - + gpu_id = key_cfg["gpu"] + device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") completion_mode = key_cfg["bg_completion_mode"] if completion_mode == "plane": # assume the masked region is the ground, and using ground plane for geometry completion depth_bg = plane_fill(depth0, K, ground_mask, object_mask) + elif completion_mode == "hybrid" and plane_mask is not None: + depth_bg = hybrid_fill(depth0, fg_img, bg_img, K, object_mask, device, plane_mask) else: # use monocular depth prediction for geometry completion print(f"[Info] [{key}] predicting depth with MoGe-2 ...") - gpu_id = key_cfg["gpu"] - device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() 
else "cpu") depth_pd = run_moge_depth(bg_img, device) # align the depth scale of current bg depth prediction and original image depth prediction @@ -197,11 +280,14 @@ def background_point_inpainting(keys, key_scene_dicts, key_cfgs): scene_dict["recon"]["bg_depth"] = depth_bg.astype(np.float32) scene_dict["recon"]["fg_depth"] = depth0.astype(np.float32) bg_pts = depth_to_points(depth_bg, K) + bg_colors = bg_img.reshape(-1,3) / 255. bg_color_pts = np.concatenate([bg_pts.reshape(H,W,3), bg_colors.reshape(H,W,3)], -1) fg_pts = depth_to_points(depth0, K) fg_colors = fg_img.reshape(-1,3) / 255. fg_color_pts = np.concatenate([fg_pts.reshape(H,W,3), fg_colors.reshape(H,W,3)], -1) + bg_pts, bg_colors = clear_nan(bg_pts, bg_colors) + fg_pts, fg_colors = clear_nan(fg_pts, fg_colors) scene_dict["recon"]["bg_pts"] = bg_color_pts.astype(np.float32) # (H, W, 6) xyzrgb rgb in [0,1] scene_dict["recon"]["fg_pts"] = fg_color_pts.astype(np.float32) # (H, W, 6) xyzrgb rgb in [0,1] key_scene_dicts[key] = scene_dict @@ -212,6 +298,7 @@ def background_point_inpainting(keys, key_scene_dicts, key_cfgs): export_cloud(fg_pts, fg_colors, recon_dir / "foreground_points.ply") export_cloud(bg_pts, bg_colors, recon_dir / "background_points.ply") print(f"[Info] [{key}] geometry inpainting done.\n") + return key_scene_dicts if __name__ == "__main__": diff --git a/openreal2sim/reconstruction/modules/object_mesh_generation.py b/openreal2sim/reconstruction/modules/object_mesh_generation.py index fd1d3de..bd336bc 100755 --- a/openreal2sim/reconstruction/modules/object_mesh_generation.py +++ b/openreal2sim/reconstruction/modules/object_mesh_generation.py @@ -27,20 +27,166 @@ import numpy as np from PIL import Image import yaml - +import torch +import cv2 base_dir = Path.cwd() output_dir = base_dir / "outputs" repo_dir = str(base_dir / 'third_party/Hunyuan3D-2') sys.path.append(repo_dir) -# --- Switched from Trellis to Hunyuan3D --- + from hy3dgen.rembg import BackgroundRemover from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline from hy3dgen.texgen import Hunyuan3DPaintPipeline from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover +base_dir = Path.cwd() +repo_dir = str(base_dir / 'third_party/ObjectClear') +sys.path.append(repo_dir) +from objectclear.pipelines import ObjectClearPipeline +from objectclear.utils import resize_by_short_side + + +sys.path.append(str(base_dir / 'third_party/Grounded-SAM-2')) +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor + +# TODO: super-resolution # ------------------------------------------------------------------ +def object_stacking(single_scene_dict): + objects = load_obj_masks(single_scene_dict['mask']) + # objects is a list of dicts, convert it into a dict mapping oid to object dict for fast access. 
+ objects_by_oid = {obj['oid']: obj for obj in objects} + if not objects or len(objects) <= 1: + # Nothing to stack + stacking = {oid: [] for oid in objects_by_oid} + return stacking + + K = single_scene_dict["intrinsics"] + depth = single_scene_dict["depths"][0] + H, W = depth.shape + ground = single_scene_dict["info"]["groundplane_in_cam"] + g_point = np.array(ground["point"]).reshape(3) + g_normal = np.array(ground["normal"]).reshape(3) + g_normal = g_normal/np.linalg.norm(g_normal) + + # Camera center calculation: assume pinhole at [0,0,0] + cam_center = np.zeros(3) + + + stacking_normal = g_normal / np.linalg.norm(g_normal) + + cam_ground_sign = np.sign(g_normal[2]) + + def depth_to_points(mask): + # Project 2D pixel to camera coordinates + ys, xs = np.where(mask) + ds = depth[ys, xs] + fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2] + xs_ = (xs - cx) * ds / fx + ys_ = (ys - cy) * ds / fy + return np.stack([xs_, ys_, ds], axis=1) # (N, 3) + + def dilate_mask(mask, radius=1): + mask = cv2.dilate(mask.astype(np.uint8), np.ones((radius, radius), np.uint8)) + return mask.astype(bool) + + # Compute "height from ground" for each object (median, robust to outliers) + heights = {} + obj_cloud = {} + for idx, obj in enumerate(objects): + oid = obj['oid'] + msk = obj['mask'] + pts = depth_to_points(msk) + obj_cloud[oid] = pts + if pts.shape[0] == 0: + heights[oid] = float("nan") + continue + vecs = pts - g_point # (N,3) + hts = vecs @ g_normal # project to normal + heights[oid] = np.median(hts) + + stacking = {obj['oid']: [] for obj in objects} + + # Compute mask overlap between objects + for idx, obj_a in enumerate(objects): + oid_a = obj_a['oid'] + orig_mask_a = np.array(obj_a["mask"]).astype(bool) + mask_a = dilate_mask(orig_mask_a, radius=10) + if not np.any(mask_a): + continue + for idx_b, obj_b in enumerate(objects): + oid_b = obj_b['oid'] + if oid_a == oid_b: + continue + orig_mask_b = np.array(obj_b["mask"]).astype(bool) + mask_b = dilate_mask(orig_mask_b, radius=10) + if not np.any(mask_b): + continue + overlap_mask = mask_a & mask_b + if not np.any(overlap_mask): + continue + # Heights at overlap + pts_a = depth_to_points(orig_mask_a) + pts_b = depth_to_points(orig_mask_b) + if pts_a.shape[0] == 0 or pts_b.shape[0] == 0: + continue + h_a = pts_a @ g_normal + h_b = pts_b @ g_normal + h_a_val = np.median(h_a) + h_b_val = np.median(h_b) + + # Stacked-on rule: + # If cam_ground_sign < 0 (camera is above), higher objects (along normal) are in front, thus "on top" + # If cam_ground_sign > 0 (camera is below), lower (along normal) are in front, thus "on top" + obj_a_name = obj_a["name"] + obj_b_name = obj_b["name"] + if np.isfinite(h_a_val) and np.isfinite(h_b_val): + if cam_ground_sign < 0: + # camera above ground & objects: higher-along-normal overlays lower + print("Camera is above ground") + if h_a_val > h_b_val + 0.01: + stacking[oid_b].append(oid_a) + print(f"Object {obj_a_name} is stacked on {obj_b_name}") + print(f"Object {obj_a_name} occludes {obj_b_name}") + else: + # camera below: lower-along-normal overlays higher + print("Camera is below ground") + if h_a_val < h_b_val - 0.01: + stacking[oid_b].append(oid_a) + print(f"Object {obj_b_name} is stacked on {obj_a_name}") + print(f"Object {obj_a_name} occludes {obj_b_name}") + + # Remove duplicates, sort + for oid in stacking: + stacking[oid] = list(sorted(set(stacking[oid]))) + + return stacking + + + + + +def clear_object(orig_img, orig_mask, whole_mask): + # Compute bbox from orig_mask + ys, xs = np.where(orig_mask) + if 
ys.size == 0 or xs.size == 0: + raise ValueError("orig_mask contains no foreground pixels") + x1, y1, x2, y2 = xs.min(), ys.min(), xs.max(), ys.max() + margin = int(0.1 * max(x2 - x1, y2 - y1)) + # expand bbox by margin, clamp to image boundaries + h, w = orig_img.shape[0], orig_img.shape[1] + x1 = max(x1 - margin, 0) + y1 = max(y1 - margin, 0) + x2 = min(x2 + margin, w - 1) + y2 = min(y2 + margin, h - 1) + crop_img = orig_img[y1:y2+1, x1:x2+1] + orig_mask = orig_mask[y1:y2+1, x1:x2+1] + new_mask = whole_mask[y1:y2+1, x1:x2+1] + new_mask = ~orig_mask & new_mask + + return crop_img, new_mask, orig_mask def save_object_png(orig_img: Image.Image, mask: np.ndarray, @@ -119,6 +265,32 @@ def object_mesh_generation(keys, key_scene_dicts, key_cfgs): 'tencent/Hunyuan3D-2', subfolder='hunyuan3d-paint-v2-0' ) rembg = BackgroundRemover() + + # hyperparameters + USE_FP16 = True + SEED = 42 + NUM_STEPS = 20 + STRENGTH = 0.99 + GUIDANCE_SCALE = 2.5 + + # Set up ObjectClear pipeline once + torch_dtype = torch.float16 if USE_FP16 else torch.float32 + variant = "fp16" if USE_FP16 else None + gpu_id = key_cfgs[keys[0]]["gpu"] # it has to be running on the same GPU + device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu') + generator = torch.Generator(device=device).manual_seed(SEED) + pipe = ObjectClearPipeline.from_pretrained_with_custom_modules( + "jixin0101/ObjectClear", + torch_dtype=torch_dtype, + apply_attention_guided_fusion=True, + cache_dir=None, + variant=variant, + ).to(device) + + sam_weights_path = "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + sam_cfg_path = "configs/sam2.1/sam2.1_hiera_l.yaml" + sam_predictor = SAM2ImagePredictor(build_sam2(sam_cfg_path, sam_weights_path)) + sam_predictor.model.to(device) for key in keys: print(f"[Info] Processing {key}...\n") @@ -135,16 +307,69 @@ def object_mesh_generation(keys, key_scene_dicts, key_cfgs): # fixed seed kept (not used internally by these calls but preserved for parity) seed = random.randint(0, 99999) # generate object mesh for each object + stacking = object_stacking(scene_dict) for idx, item in enumerate(objs): mask = item['mask'].astype(bool) name = item['name'] stem = f"{item['oid']}_{name}" png_path = out_dir / f"{stem}.png" - + regenerated_path = out_dir / f"{stem}_regenerated.png" + cut_mask_path = out_dir / f"{stem}_cut_mask.png" # 1) save transparent PNG - save_object_png(orig_img, mask, png_path) - print(f"[Info] [{key}] saved crop → {png_path}") - + if len(stacking[item['oid']]) > 0: + print(f"[Info] processing stacking for object {name}") + stack_mask = np.zeros_like(mask) + for oid in stacking[item['oid']]: + for obj in objs: + if obj["oid"] == oid: + break + stack_mask |= np.array(obj["mask"]).astype(bool) + crop_img, new_mask, orig_mask = clear_object(np.asarray(orig_img), mask, stack_mask) + crop_img = Image.fromarray(crop_img,mode="RGB") + crop_img = resize_by_short_side(crop_img, 512, resample=Image.BICUBIC) + new_mask = resize_by_short_side(Image.fromarray(new_mask.astype(np.uint8) * 255), 512, resample=Image.NEAREST) + new_mask.save(cut_mask_path) + orig_mask = resize_by_short_side(Image.fromarray(orig_mask.astype(np.uint8) * 255), 512, resample=Image.NEAREST) + + new_img = pipe( + prompt="remove the instance of object", + image=crop_img, + mask_image=new_mask, + generator=generator, + num_inference_steps=NUM_STEPS, + strength=STRENGTH, + guidance_scale=GUIDANCE_SCALE, + height=crop_img.height, + width=crop_img.width, + ) + crop_img = new_img.images[0] + 
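+ # Re-segment the de-occluded object in the inpainted crop: prompt SAM2 with up to 8 points sampled from the original object mask, then save the resulting RGBA crop for Hunyuan3D.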
+                sam_predictor.set_image(np.asarray(crop_img))
+                # sample prompt points from the original object mask
+                orig_mask_np = np.array(orig_mask).astype(bool)
+                yx_locs = np.argwhere(orig_mask_np)
+                num_points = min(8, len(yx_locs))
+                if num_points > 0:
+                    selected_indices = np.random.choice(len(yx_locs), num_points, replace=False)
+                    sampled_points = yx_locs[selected_indices]
+                    point_coords = np.fliplr(sampled_points).copy()
+                    point_labels = np.ones(num_points, dtype=int)
+                else:
+                    point_coords = None
+                    point_labels = None
+
+                new_mask, scores, logits = sam_predictor.predict(
+                    point_coords=point_coords,
+                    point_labels=point_labels,
+                    box=None,
+                    multimask_output=False
+                )
+                new_mask = new_mask[0].astype(bool)
+                crop_img.save(regenerated_path)
+                save_object_png(crop_img, new_mask, png_path)
+                print(f"[Info] [{key}] saved crop → {png_path}")
+            else:
+                save_object_png(orig_img, mask, png_path)
+                print(f"[Info] [{key}] saved crop → {png_path}")
 
             # 2) Hunyuan3D shape + texture
             img_rgba = Image.open(png_path).convert("RGBA")
             if img_rgba.mode == 'RGB':  # fallback: ensure RGBA
diff --git a/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py b/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py
index accf4ae..634962b 100755
--- a/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py
+++ b/openreal2sim/reconstruction/tools/segmentation_mask_propagation.py
@@ -140,8 +140,7 @@ def propagate_maks(segmented_video: object, output_directory: Path):
 def mask_propagation(keys, key_scene_dicts):
     for key in keys:
         print("propagating for", key)
-        with open(OUT_ROOT/key/"scene/scene.pkl", "rb") as f:
-            scene = pickle.load(f)
+        scene = key_scene_dicts[key]
         propagate_maks(scene, OUT_ROOT/key)
         key_scene_dicts[key] = scene
     return key_scene_dicts
@@ -152,5 +151,10 @@ def mask_propagation(keys, key_scene_dicts):
     cfg_path = base_dir / "config" / "config.yaml"
     cfg = yaml.safe_load(cfg_path.open("r"))
     keys = cfg["keys"]
-    mask_propagation(keys)
+    key_scene_dicts = {}
+    for key in keys:
+        with open(OUT_ROOT/key/"scene/scene.pkl", "rb") as f:
+            scene = pickle.load(f)
+        key_scene_dicts[key] = scene
+    mask_propagation(keys, key_scene_dicts)
\ No newline at end of file
diff --git a/scripts/installation/install.py b/scripts/installation/install.py
index 6ac9f5c..66591a7 100644
--- a/scripts/installation/install.py
+++ b/scripts/installation/install.py
@@ -27,6 +27,20 @@ def download_file(url, destination):
     urlretrieve(url, destination)
     print("Download complete.")
 
+def download_with_gdown(file_id, destination):
+    """Downloads a file from Google Drive using gdown."""
+    dest_dir = os.path.dirname(destination)
+    print(f"Ensuring directory exists: {dest_dir}")
+    os.makedirs(dest_dir, exist_ok=True)
+    file_name = os.path.basename(destination)
+    print(f"Downloading {file_name} from Google Drive (ID: {file_id}) to {destination}...")
+    # Use the gdown CLI to download from Google Drive by file ID
+    run_command(
+        ["gdown", file_id, "-O", destination],
+        cwd=None
+    )
+    print("Download complete.")
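+
+# Usage sketch (hypothetical ID and path, for illustration only):
+#   download_with_gdown("1AbCdEfGhIjKl", "third_party/example/ckpt/model.tar")
+# creates third_party/example/ckpt/ if needed, then runs
+# `gdown 1AbCdEfGhIjKl -O third_party/example/ckpt/model.tar`.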
+
 def main():
     """Main function to set up all dependencies."""
     # Ensure we are in the correct base directory
@@ -34,8 +48,12 @@ def main():
     os.chdir(base_dir)
     print(f"Working directory set to: {os.getcwd()}")
+    # --- Install gdown for Google Drive downloads ---
+    print("\n--- [1/9] Installing gdown for Google Drive downloads ---")
+    run_command([sys.executable, "-m", "pip", "install", "gdown"])
+
     # --- Mega-SAM Dependencies ---
-    print("\n--- [1/6] Setting up Mega-SAM dependencies ---")
+    print("\n--- [2/9] Setting up Mega-SAM dependencies ---")
     download_file(
         "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth",
         "third_party/mega-sam/Depth-Anything/checkpoints/depth_anything_vitl14.pth"
     )
@@ -46,13 +64,13 @@ def main():
     )
 
     # --- Segmentation Dependencies (Grounded-SAM-2) ---
-    print("\n--- [2/6] Downloading Segmentation model ---")
+    print("\n--- [3/9] Downloading Segmentation model ---")
     download_file(
         "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt",
         "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt"
     )
 
-    print("\n--- [3/6] Building Segmentation CUDA extension ---")
+    print("\n--- [4/9] Building Segmentation CUDA extension ---")
     build_cuda_path = os.path.join(base_dir, "third_party/Grounded-SAM-2")
     run_command(
         [sys.executable, "build_cuda.py", "build_ext", "--inplace", "-v"],
@@ -60,7 +78,7 @@ def main():
     )
 
     # --- FoundationPose Dependencies ---
-    print("\n--- [4/6] Downloading FoundationPose weights ---")
+    print("\n--- [5/9] Downloading FoundationPose weights ---")
     fp_weights_dir = os.path.join(base_dir, "third_party/FoundationPose/weights")
     os.makedirs(fp_weights_dir, exist_ok=True)
     snapshot_download(
@@ -69,7 +87,7 @@ def main():
         local_dir=fp_weights_dir
     )
 
-    print("\n--- [5/6] Compiling FoundationPose C++ extension ---")
+    print("\n--- [6/9] Compiling FoundationPose C++ extension ---")
     fp_cpp_build_path = os.path.join(base_dir, "third_party/FoundationPose/mycpp/build")
     run_command(["rm", "-rf", fp_cpp_build_path])
     os.makedirs(fp_cpp_build_path, exist_ok=True)
@@ -79,7 +97,7 @@ def main():
     )
     run_command(["make", "-j11"], cwd=fp_cpp_build_path)
 
-    print("\n--- [6/6] Compiling FoundationPose bundlesdf CUDA extension ---")
+    print("\n--- [7/9] Compiling FoundationPose bundlesdf CUDA extension ---")
     fp_bundlesdf_path = os.path.join(base_dir, "third_party/FoundationPose/bundlesdf/mycuda")
     run_command(["rm", "-rf", "build"], cwd=fp_bundlesdf_path)
     run_command(["rm", "-rf", "*.egg-info"], cwd=fp_bundlesdf_path)
@@ -88,6 +106,32 @@ def main():
         cwd=fp_bundlesdf_path
     )
 
+    # --- WiLoR Dependencies (Hand Extraction) ---
+    print("\n--- [8/9] Downloading WiLoR pretrained models ---")
+    wilor_models_dir = os.path.join(base_dir, "third_party/WiLoR/pretrained_models")
+    os.makedirs(wilor_models_dir, exist_ok=True)
+    download_file(
+        "https://huggingface.co/spaces/rolpotamias/WiLoR/resolve/main/pretrained_models/detector.pt",
+        os.path.join(wilor_models_dir, "detector.pt")
+    )
+    download_file(
+        "https://huggingface.co/spaces/rolpotamias/WiLoR/resolve/main/pretrained_models/wilor_final.ckpt",
+        os.path.join(wilor_models_dir, "wilor_final.ckpt")
+    )
+
+    # --- Grasp Generation Checkpoints ---
+    print("\n--- [9/9] Downloading Grasp Generation checkpoints ---")
+    ckpt_dir = os.path.join(base_dir, "third_party/graspness_unofficial/ckpt")
+    os.makedirs(ckpt_dir, exist_ok=True)
+    download_with_gdown(
+        "10o5fc8LQsbI8H0pIC2RTJMNapW9eczqF",
+        os.path.join(ckpt_dir, "minkuresunet_kinect.tar")
+    )
+    download_with_gdown(
+        "1RfdpEM2y0x98rV28d7B2Dg8LLFKnBkfL",
+        os.path.join(ckpt_dir, "minkuresunet_realsense.tar")
+    )
+
     print("\n\n--- All dependencies set up successfully! ---")
 
 if __name__ == "__main__":