diff --git a/.gitmodules b/.gitmodules index 2277161..adfa870 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,21 @@ [submodule "third_party/ObjectClear"] # for object clearing [reconstruction] path = third_party/ObjectClear url = git@github.com:PointsCoder/ObjectClear.git +[submodule "third_party/graspness_unofficial"] # for grasp generation [motion] + path = third_party/graspness_unofficial + url = git@github.com:PointsCoder/graspness_unofficial.git +[submodule "third_party/graspnetAPI"] # for grasp evaluation and utilities [motion] + path = third_party/graspnetAPI + url = git@github.com:PointsCoder/graspnetAPI.git +[submodule "third_party/MinkowskiEngine"] # for sparse convolution operations [motion] + path = third_party/MinkowskiEngine + url = git@github.com:DynamoDuan/MinkowskiEngine.git +[submodule "third_party/WiLoR"] # for hand extraction [motion] + path = third_party/WiLoR + url = git@github.com:DynamoDuan/WiLoR.git +[submodule "third_party/curobo"] + path = third_party/curobo + url = git@github.com:DynamoDuan/curobo.git +[submodule "third_party/IsaacLab"] + path = third_party/IsaacLab + url = git@github.com:PointsCoder/IsaacLab.git \ No newline at end of file diff --git a/config/config.yaml b/config/config.yaml index 3e4c775..5fd12f5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,7 +1,7 @@ keys: - - demo_image +# - demo_video - - demo_genvideo + # - demo_genvideo # global settings, can be overridden by local settings global: diff --git a/docker/isaaclab/Dockerfile b/docker/isaaclab/Dockerfile index 47eecc4..ba3898e 100644 --- a/docker/isaaclab/Dockerfile +++ b/docker/isaaclab/Dockerfile @@ -25,55 +25,32 @@ ENV CMAKE_BUILD_PARALLEL_LEVEL=2 ENV CUDA_HOME=/usr/local/cuda ENV FORCE_CUDA=1 -RUN useradd -ms /bin/bash isaac && chown -R isaac:isaac /app -ENV HOME=/home/isaac -USER isaac -ENV PATH=$HOME/.local/bin:$PATH - # RUN pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \ # torch==2.5.1+cu118 torchvision==0.20.1+cu118 torchaudio==2.5.1+cu118 -RUN pip install torch==2.7.0 torchvision==0.22.0 --index-url https://download.pytorch.org/whl/cu128 - -# graspnetapi (will install open3d & change numpy version from 1.26.4 to 1.23.4) -RUN git clone https://github.com/PointsCoder/graspnetAPI.git "$HOME/graspnetAPI" -RUN cd "$HOME/graspnetAPI" && pip install -e . - +RUN pip install torch==2.7.0 torchvision==0.22.0 --index-url https://download.pytorch.org/whl/cu128 +RUN pip install transforms3d -ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0;12.0" -# graspness (will compile pointnet2, knn) -RUN git clone https://github.com/PointsCoder/graspness_unofficial.git "$HOME/third_party/graspness_unofficial" -RUN cd "$HOME/third_party/graspness_unofficial/pointnet2" && python -m pip install --user -v --no-build-isolation . -# RUN cd "$HOME/third_party/graspness_unofficial/knn" && python -m pip install --user -v --no-build-isolation . 
# (knn is not necessary)

USER root
# Patch shared_ptr_base.h
RUN sed -i 's|auto __raw = __to_address(__r.get())|auto __raw = std::__to_address(__r.get())|' \
    /usr/include/c++/11/bits/shared_ptr_base.h
-USER isaac
-# MinkowskiEngine
-RUN git clone https://github.com/songlin/MinkowskiEngine.git "$HOME/MinkowskiEngine"
-RUN cd "$HOME/MinkowskiEngine" && git checkout cuda-12.8-cc-12.0 && export CUDA_HOME=/usr/local/cuda; python setup.py install --user --blas=openblas --force_cuda
-
-# checkpoints
-RUN pip install gdown && \
-    mkdir -p "$HOME/third_party/graspness_unofficial/ckpt" && \
-    gdown 10o5fc8LQsbI8H0pIC2RTJMNapW9eczqF -O "$HOME/third_party/graspness_unofficial/ckpt/minkuresunet_kinect.tar" && \
-    gdown 1RfdpEM2y0x98rV28d7B2Dg8LLFKnBkfL -O "$HOME/third_party/graspness_unofficial/ckpt/minkuresunet_realsense.tar"
-
ENV TORCH_CUDA_ARCH_LIST="8.9+PTX"
-# curobo
-RUN git clone https://github.com/NVlabs/curobo.git "$HOME/curobo" && \
-    cd "$HOME/curobo" && pip install -e . --no-build-isolation --user
+# curobo - copied into the image and installed at build time (at /app/third_party/curobo)
+COPY third_party/curobo /app/third_party/curobo
+RUN cd /app/third_party/curobo && \
+    pip install -e . --no-build-isolation

# Isaac Sim 4.5.0
RUN pip install 'isaacsim[all,extscache]==4.5.0' --extra-index-url https://pypi.nvidia.com

# Revert the fixed pytorch version
+# Copy IsaacLab into the image and install it at build time (at /app/third_party/IsaacLab)
ENV TERM=xterm-256color
-RUN git clone https://github.com/PointsCoder/IsaacLab.git "$HOME/IsaacLab" && \
-    cd "$HOME/IsaacLab" && \
+COPY third_party/IsaacLab /app/third_party/IsaacLab
+RUN cd /app/third_party/IsaacLab && \
    sed -i 's|torch==2.5.1|torch==2.7.0|' source/isaaclab/setup.py && \
    sed -i 's|torch==2.5.1|torch==2.7.0|' source/isaaclab_tasks/setup.py && \
    sed -i 's|torch==2.5.1|torch==2.7.0|' source/isaaclab_rl/setup.py && \
@@ -81,6 +58,8 @@ RUN git clone https://github.com/PointsCoder/IsaacLab.git "$HOME/IsaacLab" && \
# usd conversion
# curobo changes the numpy version from 1.23.4 to 2.x, need to get it back
-RUN pip install loguru rich numpy==1.23.4
+RUN pip install loguru rich numpy==1.23.4 transforms3d
-ENV PYTHONPATH=/app:$HOME/third_party:$HOME/third_party/graspness_unofficial/pointnet2
+# Ensure isaaclab can be imported
+# IsaacLab source lives at /app/third_party/IsaacLab (copied into the image above)
+ENV PYTHONPATH=/app:/app/third_party:/app/third_party/IsaacLab/source
diff --git a/docker/real2sim/Dockerfile b/docker/real2sim/Dockerfile
index f36a807..caa5dad 100644
--- a/docker/real2sim/Dockerfile
+++ b/docker/real2sim/Dockerfile
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# -------------------------------------------------------------------------
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-venv python-is-python3 \
-    build-essential cmake ninja-build pkg-config \
+    build-essential cmake ninja-build pkg-config libopenblas-dev \
    git curl wget ffmpeg bash libspatialindex-dev \
    && python3 -m pip install --upgrade --no-cache-dir pip wheel setuptools packaging \
    && rm -rf /var/lib/apt/lists/*
@@ -46,6 +46,9 @@ RUN pip install --no-cache-dir torch-scatter \
COPY docker/real2sim/requirements.docker.txt /tmp/requirements.docker.txt
RUN pip install --no-cache-dir -r /tmp/requirements.docker.txt

+# Install chumpy separately (setup.py needs pip module, ensure build deps are available)
+RUN pip install --no-build-isolation --no-cache-dir "git+https://github.com/mattloper/chumpy"
+
# -------------------------------------------------------------------------
# Third-Party compiled Python deps
@@ -82,7 +85,18 @@ RUN cd / && wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.t
    cmake .. &&\
    make install

+RUN cd /
+COPY third_party/graspnetAPI /app/third_party/graspnetAPI
+RUN cd /app/third_party/graspnetAPI && pip install -e .
+
+COPY third_party/graspness_unofficial /app/third_party/graspness_unofficial
+RUN cd /app/third_party/graspness_unofficial/pointnet2 && python -m pip install --no-cache-dir -v --no-build-isolation .
+RUN cd /app/third_party/graspness_unofficial/knn && python -m pip install --no-cache-dir -v --no-build-isolation .
+
+COPY third_party/MinkowskiEngine /app/third_party/MinkowskiEngine
+RUN cd /app/third_party/MinkowskiEngine && python setup.py install --blas=openblas --force_cuda
+
# -------------------------------------------------------------------------
# Runtime env
# -------------------------------------------------------------------------
-ENV PYTHONPATH=/app
+ENV PYTHONPATH=/app:/app/third_party:/app/third_party/graspness_unofficial/pointnet2
diff --git a/docker/real2sim/requirements.docker.txt b/docker/real2sim/requirements.docker.txt
index 75533f2..c1d4ab5 100644
--- a/docker/real2sim/requirements.docker.txt
+++ b/docker/real2sim/requirements.docker.txt
@@ -20,6 +20,7 @@ scipy==1.15.3
wandb==0.21.3
gdown
+
#__________________________________________________________________________________
# 2) Related deps (torch-scatter is separate)
ninja==1.11.1
@@ -61,4 +62,27 @@ Rtree
# 8) Foundation Pose
transformations==2024.6.1
ruamel.yaml==0.18.6
-h5py==3.10.0
\ No newline at end of file
+h5py==3.10.0
+
+# 9) PromptDA
+pyliblzfse
+lightning==2.1.3
+termcolor
+tyro==0.9.2
+spaces
+trimesh
+
+# 10) WiLoR
+pyrender
+pytorch-lightning
+scikit-image
+smplx==0.1.28
+yacs
+xtcocotools
+pandas
+hydra-submitit-launcher
+hydra-colorlog
+pyrootutils
+webdataset
+ultralytics==8.1.34
+# Note: chumpy installed separately in Dockerfile due to build isolation issues
diff --git a/h5py/demo_video/episode_020.hdf5 b/h5py/demo_video/episode_020.hdf5
new file mode 100644
index 0000000..bf5a261
Binary files /dev/null and b/h5py/demo_video/episode_020.hdf5 differ
diff --git a/openreal2sim/simulation/isaaclab-old/README.md b/openreal2sim/simulation/isaaclab-old/README.md
new file mode 100755
index 0000000..199f68d
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab-old/README.md
@@ -0,0 +1,52 @@
+# IsaacLab
+
+## Installation
+
+**Pull the pre-built docker image**
+
+If you want to use a pre-built image from the GitHub Container Registry, you can pull it directly:
+  ```bash
+  docker pull ghcr.io/pointscoder/isaaclab:dev
+  docker tag ghcr.io/pointscoder/isaaclab:dev isaaclab:dev
+  ```
+
+**On the host machine and before entering the container**, run
+```
+xhost +local:
+```
+This allows the container to access the host's X server for the IsaacSim GUI.
+
+Then, launch the container:
+```
+HOST_UID=$(id -u) HOST_GID=$(id -g) docker compose -p "$USER" -f docker/compose.yml up -d isaaclab
+```
+and enter it:
+```
+HOST_UID=$(id -u) HOST_GID=$(id -g) docker compose -p "$USER" -f docker/compose.yml exec isaaclab bash
+```
+
+## Preprocessing
+
+We need to convert the textured meshes to USD format. This is done by running the following script:
+```
+python openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py
+```
+
+Then, we generate grasp proposals for each object in the scene:
+```
+python openreal2sim/simulation/isaaclab/sim_preprocess/grasp_generation.py
+```
+
+## Running the simulation
+
+We provide heuristic policies using grasping and motion planning:
+```
+python openreal2sim/simulation/isaaclab/sim_heuristic_manip.py
+```
+
+## Replay Robot Trajectories
+
+We can also replay the recorded robot trajectories in IsaacSim:
+```
+python openreal2sim/simulation/isaaclab/sim_replay_trajectories.py --demo_dir <demo_dir>
+```
\ No newline at end of file
diff --git a/openreal2sim/simulation/isaaclab-old/demo/README.md b/openreal2sim/simulation/isaaclab-old/demo/README.md
new file mode 100644
index 0000000..17d04e2
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab-old/demo/README.md
@@ -0,0 +1,137 @@
+### Data Collection and Generation ###
+This folder is used for collecting manipulation data and augmenting it with spatial transformations.
+
+The demo pipeline consists of three main stages:
+1. **USD Conversion** - Converts scene data to USD format for simulation
+2. **Heuristic Manipulation** - Performs object manipulation using heuristic policies with grasping and motion planning. For each key, this creates a new folder in the `task` and `h5py` folders. In the task folder, a task JSON is written and the assets required for the scene are laid out.
+3. **Randomized Rollout** - Generates randomized scene configurations and collects successful manipulation trajectories. This adds demos to the `task` folder, appends trajectory configs to the task config, and writes a new HDF5 file to the `h5py` folder.
+
+### Code Structure ###
+
+```
+demo/
+├── sim_agent.sh               # Main pipeline script that runs all stages
+├── sim_heuristic_manip.py     # Heuristic manipulation policy implementation
+├── sim_randomize_rollout.py   # Randomized rollout for data augmentation
+└── envs/                      # Environment configuration and utilities
+    ├── task_cfg.py            # Task configuration classes (TaskCfg, TrajectoryCfg, etc.)
+    ├── task_construct.py      # Task construction and serialization utilities
+    ├── randomizer.py          # Scene randomization for data augmentation
+    └── running_cfg.py         # Runtime configuration management
+```
+
+### Running Script ###
+Please run `bash openreal2sim/simulation/isaaclab/demo/sim_agent.sh` to collect the demos. This will iterate through the data folder and run the three stages for each input key.
+
+### Running Config Explanation ###
+There are two editable configs for this script:
+
+First, the running device and the headless option are defined in the bash script.
+
+Second, the randomizer config, the number of environments, and the total trajectory count are all defined in the `running_cfg.py` file. Please edit that file for your own needs, as sketched below.
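+
+For reference, a minimal sketch of how these configs feed the randomizer (import paths and the scene key are illustrative; it assumes a `TaskCfg` with a reference trajectory has already been loaded):
+```python
+from envs.randomizer import Randomizer
+from envs.running_cfg import get_randomizer_config
+
+randomizer = Randomizer(task_cfg)  # task_cfg: a previously loaded TaskCfg
+# to_kwargs() yields exactly the keyword arguments of generate_randomized_scene_cfg
+trajectories = randomizer.generate_randomized_scene_cfg(
+    **get_randomizer_config("demo_video").to_kwargs()
+)
+```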
+ + +#### TaskCfg (Main Task Configuration) + +The top-level configuration that defines a complete manipulation task: + +- **task_key** (str): Unique identifier for the task +- **task_id** (int): Sequential task ID +- **task_desc** (List[str]): Description of the task +- **task_type** (TaskType): Type of task (SIMPLE_PICK, SIMPLE_PICK_PLACE, TARGETTED_PICK_PLACE) +- **bg_rgb_path** (str): Path to background RGB image +- **background_path** (str): Path to background mesh (GLB format) +- **background_usd_path** (str): Path to background USD file +- **camera_info** (CameraInfo): Camera intrinsic and extrinsic parameters +- **manipulated_oid** (int): Object ID of the manipulated object +- **start_related** (List[int]): List of object IDs related to the start state +- **end_related** (List[int]): List of object IDs related to the end state +- **objects** (List[ObjectCfg]): List of all objects in the scene +- **reference_trajectory** (TrajectoryCfg, optional): Reference trajectory generated by heuristic manipulation +- **generated_trajectories** (List[TrajectoryCfg]): List of randomized trajectories for data augmentation + +#### TrajectoryCfg (Trajectory Configuration) + +Defines a specific manipulation trajectory: + +- **robot_pose** (List[float]): Robot base pose [x, y, z, qw, qx, qy, qz] in world frame +- **object_poses** (Dict[int, List[float]]): Initial poses of all objects, keyed by object ID +- **object_trajectory** (List[List[float]]): Object trajectory as sequence of poses [x, y, z, qw, qx, qy, qz] +- **final_gripper_close** (bool): Whether gripper should be closed at the end +- **success_metric** (SuccessMetric): Criteria for task success +- **pregrasp_pose** (List[float], optional): Pre-grasp end-effector pose in world frame +- **grasp_pose** (List[float], optional): Grasp end-effector pose in world frame +- **robot_type** (RobotType, optional): Type of robot (FRANKA or UR5) + +#### SuccessMetric (Success Criteria) + +Defines how task success is evaluated: + +- **success_metric_type** (SuccessMetricType): Type of success metric + - `SIMPLE_LIFT`: Object must be lifted by a certain height + - `TARGET_POINT`: Object must reach a specific target pose + - `TARGET_PLANE`: Object must reach a target plane (height) +- **final_gripper_close** (bool): Whether gripper should be closed at success +- **lift_height** (float, optional): Required lift height for SIMPLE_LIFT +- **ground_value** (float, optional): Target height for TARGET_PLANE +- **end_pose** (List[float], optional): Target pose [x, y, z, qw, qx, qy, qz] for TARGET_POINT + +#### ObjectCfg (Object Configuration) + +Defines an object in the scene: + +- **object_id** (int): Unique object identifier +- **object_name** (str): Name of the object +- **mesh_path** (str): Path to object mesh file (GLB format) +- **usd_path** (str): Path to object USD file + +#### CameraInfo (Camera Configuration) + +Camera intrinsic and extrinsic parameters: + +- **width, height** (float): Image dimensions +- **fx, fy** (float): Focal lengths +- **cx, cy** (float): Principal point +- **camera_opencv_to_world** (List[List[float]]): 4x4 transformation matrix from OpenCV to world frame +- **camera_position** (List[float]): Camera position [x, y, z] +- **camera_heading_wxyz** (List[float]): Camera orientation quaternion [w, x, y, z] + +#### Configuration Workflow + +1. **Initial Task Creation**: `construct_task_config()` creates a `TaskCfg` from scene reconstruction data +2. 
**Reference Trajectory**: `add_reference_trajectory()` adds the first successful trajectory from heuristic manipulation +3. **Data Augmentation**: `add_generated_trajectories()` adds randomized trajectories from rollout stage +4. **Serialization**: Task configs are saved as JSON files in `tasks//task.json` +5. **Loading**: `load_task_cfg()` reconstructs `TaskCfg` objects from JSON files + +#### Example Task Configuration Structure + +```json +{ + "task_key": "demo_video", + "task_id": 0, + "task_type": "TARGETTED_PICK_PLACE", + "manipulated_oid": 2, + "objects": [ + { + "object_id": 2, + "object_name": "pen", + "mesh_path": "tasks/demo_video/object_2.glb", + "usd_path": "tasks/demo_video/object_2.usd" + } + ], + "reference_trajectory": { + "robot_pose": [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], + "object_poses": {"2": [0.1, 0.2, 0.05, 1.0, 0.0, 0.0, 0.0]}, + "object_trajectory": [[...], [...], ...], + "pregrasp_pose": [0.1, 0.2, 0.15, 1.0, 0.0, 0.0, 0.0], + "grasp_pose": [0.1, 0.2, 0.1, 1.0, 0.0, 0.0, 0.0], + "success_metric": { + "success_metric_type": "TARGET_POINT", + "end_pose": [0.3, 0.4, 0.1, 1.0, 0.0, 0.0, 0.0], + "final_gripper_close": false + } + }, + "generated_trajectories": [...] +} +``` diff --git a/openreal2sim/simulation/isaaclab-old/demo/envs/randomizer.py b/openreal2sim/simulation/isaaclab-old/demo/envs/randomizer.py new file mode 100755 index 0000000..fde28fa --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/demo/envs/randomizer.py @@ -0,0 +1,348 @@ +import numpy as np +import random +from scipy.spatial.transform import Rotation as R +from scipy.spatial.transform import Slerp +from .task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg + +import torch +import transforms3d + +def pose_to_mat(pos, quat): + if torch.is_tensor(pos): pos = pos.cpu().numpy() + if torch.is_tensor(quat): quat = quat.cpu().numpy() + m = np.eye(4, dtype=np.float32) + m[:3, :3] = transforms3d.quaternions.quat2mat(quat) + m[:3, 3] = pos + return m + +def mat_to_pose(mat): + pos = mat[:3, 3] + quat = transforms3d.quaternions.mat2quat(mat[:3, :3]) + return pos, quat + + + +class Randomizer(TaskCfg): + def __init__(self, task_cfg: TaskCfg): + self.task_cfg = task_cfg + + + def generate_randomized_scene_cfg(self, grid_dist: float, grid_num: int, angle_random_range: float, angle_random_num: int, traj_randomize_num:int, scene_randomize_num: int, robot_pose_randomize_range, robot_pose_randomize_angle, robot_pose_randomize_num): + # Step 1: Generate candidate transforms + + candidate_transforms = [] + for i in range(-grid_num, grid_num): + for j in range(-grid_num, grid_num): + random_angles = np.random.uniform(-angle_random_range, angle_random_range, angle_random_num) + for angle in random_angles: + orig = np.eye(4) + orig[0, 3] = i * grid_dist + orig[1, 3] = j * grid_dist + orig[0, 0] = np.cos(angle) + orig[0, 1] = -np.sin(angle) + orig[1, 0] = np.sin(angle) + orig[1, 1] = np.cos(angle) + candidate_transforms.append(orig) + + # Step 2: Generate traj_randomize_num combinations of (start_pose, end_pose) + traj_pose_pairs = [] + for _ in range(traj_randomize_num): + start_pose = random.choice(candidate_transforms) + if self.task_cfg.task_type == TaskType.SIMPLE_PICK: + end_pose = start_pose.copy() # For SIMPLE_PICK, end same as start + else: + end_pose = random.choice(candidate_transforms) + traj_pose_pairs.append((start_pose, end_pose)) + + # Step 3: Generate scene_randomize_num combinations for other objects + other_object_ids = [obj.object_id for obj in self.task_cfg.objects 
+ if obj.object_id != self.task_cfg.manipulated_oid and obj.object_id not in self.task_cfg.start_related and obj.object_id not in self.task_cfg.end_related] + + scene_poses_combinations = [] + for _ in range(scene_randomize_num): + # Create list of (oid, pose) pairs + other_object_poses = [(oid, random.choice(candidate_transforms)) + for oid in other_object_ids] + scene_poses_combinations.append(other_object_poses) + + # Step 4: Generate robot_pose_randomize_num random robot poses + # robot_pose format: [x, y, z, w, x, y, z] (position + quaternion wxyz) + ref_traj = self.task_cfg.reference_trajectory + assert ref_traj is not None, "Reference trajectory is not found" + ref_robot_pose = np.array(ref_traj.robot_pose) + robot_pose_mat = pose_to_mat(ref_robot_pose[:3], ref_robot_pose[3:7]) + + robot_poses = [] + for _ in range(robot_pose_randomize_num): + # Random translation within range + random_trans = np.random.uniform( + -robot_pose_randomize_range, + robot_pose_randomize_range, + 3 + ) + random_rot = np.random.uniform( + -robot_pose_randomize_angle, + robot_pose_randomize_angle, + ) + + rotate_matrix = np.eye(4) + rotate_matrix[:3, :3] = R.from_euler('z', random_rot).as_matrix() + new_robot_pose = robot_pose_mat @ rotate_matrix + new_robot_pose[:3, 3] += random_trans + # Combine position and quaternion [x, y, z, w, x, y, z] + pos, quat = mat_to_pose(new_robot_pose) + robot_pose_7d = np.concatenate([pos, quat]) + robot_poses.append(robot_pose_7d.tolist()) + + # Step 5: Generate trajectories for all combinations + generated_trajectories = [] + ref_traj = self.task_cfg.reference_trajectory + for start_pose, end_pose in traj_pose_pairs: + for other_object_poses in scene_poses_combinations: + for robot_pose in robot_poses: + # Generate trajectory for manipulated object + if self.task_cfg.task_type == TaskType.SIMPLE_PICK: + new_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + pose_mat = pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7])) + new_traj_mats.append(start_pose @ pose_mat) + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + else: + # Convert reference trajectory to mat format + ref_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + ref_traj_mats.append(pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7]))) + ref_traj_mats = np.array(ref_traj_mats) + + new_traj_mats = self.compute_new_traj(start_pose, end_pose, ref_traj_mats) + + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + #new_traj_7d = lift_traj(new_traj_7d) + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = 
mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + + # Apply scene randomization to other objects (if needed, update object_poses here) + # other_object_poses is now a list of (oid, pose) pairs + + # Create success metric + + ref_end_pose_mat = pose_to_mat( + np.array(ref_traj.success_metric.end_pose[:3]), + np.array(ref_traj.success_metric.end_pose[3:7]) + ) + end_pose_metric_mat = end_pose @ ref_end_pose_mat + pos, quat = mat_to_pose(end_pose_metric_mat) + end_pose_metric_7d = np.concatenate([pos, quat]).tolist() + + success_metric_type = ref_traj.success_metric.success_metric_type + if success_metric_type == SuccessMetricType.SIMPLE_LIFT: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.SIMPLE_LIFT, + lift_height=ref_traj.success_metric.lift_height, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_POINT: + # Convert end_pose from 7D to mat, transform, then back to 7D + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_POINT, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_PLANE: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_PLANE, + ground_value=ref_traj.success_metric.ground_value, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + + # Store object poses in 7D format + object_poses = {} + for oid, pose_mat in other_object_poses: + pos, quat = mat_to_pose(pose_mat) + object_poses[oid] = np.concatenate([pos, quat]).tolist() + + # Add start and end related objects + start_pos, start_quat = mat_to_pose(start_pose) + start_pose_7d = np.concatenate([start_pos, start_quat]).tolist() + for oid in self.task_cfg.start_related: + object_poses[oid] = start_pose_7d + + end_pos, end_quat = mat_to_pose(end_pose) + end_pose_7d = np.concatenate([end_pos, end_quat]).tolist() + for oid in self.task_cfg.end_related: + object_poses[oid] = end_pose_7d + + object_poses[self.task_cfg.manipulated_oid] = start_pose_7d + + robot_type = ref_traj.robot_type + generated_trajectories.append(TrajectoryCfg( + robot_pose=robot_pose, + object_poses=object_poses, + object_trajectory=new_traj_7d, + final_gripper_close=ref_traj.final_gripper_close, + pregrasp_pose=pregrasp_pose, + grasp_pose=grasp_pose, + success_metric=success_metric, + robot_type=robot_type + )) + + # Total trajectories = traj_randomize_num * scene_randomize_num * robot_pose_randomize_num + assert len(generated_trajectories) == traj_randomize_num * scene_randomize_num * robot_pose_randomize_num, \ + f"Expected {traj_randomize_num * scene_randomize_num * robot_pose_randomize_num} trajectories, got {len(generated_trajectories)}" + + random.shuffle(generated_trajectories) + return generated_trajectories + + @staticmethod + def compute_new_traj(start_trans: np.ndarray, end_trans: np.ndarray, reference_traj: np.ndarray, + interp_mode: str = 'slerp') -> np.ndarray: + """ + Compute interpolated trajectory using Real2Render2Real's relative shape preservation method. 
+ + This maintains the relative motion pattern of the reference trajectory by normalizing + the progress along the reference and mapping it to the new start and end positions. + + Args: + start_trans: 4x4 transformation matrix from original start to new start + end_trans: 4x4 transformation matrix from original end to new end + reference_traj: Reference trajectory in SE(3), shape (N, 4, 4) or (N, 16) + interp_mode: 'linear' or 'slerp' for rotation interpolation (default: 'slerp') + + Returns: + New interpolated trajectory with same shape as reference_traj + """ + # Ensure reference_traj is in (N, 4, 4) format + if reference_traj.ndim == 2 and reference_traj.shape[1] == 16: + reference_traj = reference_traj.reshape(-1, 4, 4) + elif reference_traj.ndim == 3 and reference_traj.shape[1] == 4 and reference_traj.shape[2] == 4: + reference_traj = reference_traj.copy() + else: + raise ValueError(f"Invalid reference_traj shape: {reference_traj.shape}, expected (N, 4, 4) or (N, 16)") + + N = len(reference_traj) + if N == 0: + return reference_traj.copy() + + # Get start and end poses from reference trajectory + ref_start = reference_traj[0].copy() + ref_end = reference_traj[-1].copy() + + # Compute new start and end poses + new_start_mat = start_trans @ ref_start + new_end_mat = end_trans @ ref_end + + # Convert to 7D format using mat_to_pose + ref_traj_7d = [] + for pose_mat in reference_traj: + pos, quat = mat_to_pose(pose_mat) + ref_traj_7d.append(np.concatenate([pos, quat])) + ref_traj_7d = np.array(ref_traj_7d) + + pos_start, quat_start = mat_to_pose(new_start_mat) + pos_end, quat_end = mat_to_pose(new_end_mat) + + # Initialize new trajectory + new_traj_7d = np.zeros_like(ref_traj_7d) + + # Normalize time steps + t = np.linspace(0, 1, N) + + # Split into position and rotation components + pos_orig = ref_traj_7d[:, :3] + quat_orig = ref_traj_7d[:, 3:7] # wxyz format (from transforms3d) + + ref_start_pos, ref_start_quat = mat_to_pose(ref_start) + ref_end_pos, ref_end_quat = mat_to_pose(ref_end) + + # Interpolate positions: preserve relative shape from reference trajectory + for axis in range(3): + ref_range = ref_end_pos[axis] - ref_start_pos[axis] + if np.abs(ref_range) > 1e-10: + # Normalize progress along reference trajectory + normalized_progress = (pos_orig[:, axis] - ref_start_pos[axis]) / ref_range + # Map to new range + new_traj_7d[:, axis] = pos_start[axis] + (pos_end[axis] - pos_start[axis]) * normalized_progress + else: + # If no change in reference, use direct transformation + new_traj_7d[:, axis] = pos_orig[:, axis] + (pos_start[axis] - ref_start_pos[axis]) + + # Interpolate rotations using SLERP + if interp_mode == 'slerp': + # Use scipy Slerp for spherical linear interpolation + # Convert wxyz to xyzw for scipy + quat_start_xyzw = np.array([quat_start[1], quat_start[2], quat_start[3], quat_start[0]]) + quat_end_xyzw = np.array([quat_end[1], quat_end[2], quat_end[3], quat_end[0]]) + + # Create Slerp interpolator + key_rots = R.from_quat([quat_start_xyzw, quat_end_xyzw]) + key_times = [0, 1] + slerp = Slerp(key_times, key_rots) + + # Interpolate for all time steps + interp_rots = slerp(t) + quat_interp_xyzw = interp_rots.as_quat() + + # Convert back to wxyz format + new_traj_7d[:, 3] = quat_interp_xyzw[:, 3] # w + new_traj_7d[:, 4] = quat_interp_xyzw[:, 0] # x + new_traj_7d[:, 5] = quat_interp_xyzw[:, 1] # y + new_traj_7d[:, 6] = quat_interp_xyzw[:, 2] # z + else: # linear + # Linear interpolation (needs normalization) + for i in range(N): + new_traj_7d[i, 3:7] = (1 - t[i]) * 
quat_start + t[i] * quat_end + new_traj_7d[i, 3:7] /= np.linalg.norm(new_traj_7d[i, 3:7]) + + # Convert back to SE(3) matrices using pose_to_mat + new_traj = [] + for pose_7d in new_traj_7d: + pose_mat = pose_to_mat(pose_7d[:3], pose_7d[3:7]) + new_traj.append(pose_mat) + new_traj = np.array(new_traj) + + return new_traj + + + def select_randomized_cfg(self, train_set_size: int): + return random.choice(self.task_cfg.generated_trajectories[train_set_size:]) + + + + diff --git a/openreal2sim/simulation/isaaclab-old/demo/envs/running_cfg.py b/openreal2sim/simulation/isaaclab-old/demo/envs/running_cfg.py new file mode 100644 index 0000000..7f89f9d --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/demo/envs/running_cfg.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import math +from dataclasses import asdict, dataclass +from typing import Dict + + +@dataclass(frozen=True) +class RandomizerConfig: + """Configuration bundle for `Randomizer.generate_randomized_scene_cfg`.""" + + grid_dist: float = 0.03 + grid_num: int = 3 + angle_random_range: float = math.pi / 20.0 + angle_random_num: int = 10 + traj_randomize_num: int = 20 + scene_randomize_num: int = 20 + robot_pose_randomize_range: float = 0.03 + robot_pose_randomize_angle: float = math.pi / 180.0 + robot_pose_randomize_num: int = 10 + + def to_kwargs(self) -> Dict[str, float | int]: + """Return a shallow dict representation that can be splatted into the randomizer.""" + return asdict(self) + + +@dataclass(frozen=True) +class HeuristicConfig: + """Configuration for the heuristic manipulation stage.""" + + num_envs: int = 1 + num_trials: int = 10 + + +@dataclass(frozen=True) +class RandomizeRolloutConfig: + """Rollout-related knobs (e.g., required number of successful trajectories).""" + + total_num: int = 50 + num_envs: int = 10 + + +@dataclass(frozen=True) +class RunningConfig: + """Top-level container bundling all per-scene tunables.""" + + randomizer: RandomizerConfig = RandomizerConfig() + rollout: RandomizeRolloutConfig = RandomizeRolloutConfig() + heuristic: HeuristicConfig = HeuristicConfig() + + +DEFAULT_RUNNING_CONFIG = RunningConfig() + +# Users can override the defaults per-scene key by adding entries here. 
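+# For example (hypothetical scene key; a sketch, not a shipped config):
+#   RUNNING_CONFIGS["my_scene"] = RunningConfig(
+#       randomizer=RandomizerConfig(grid_num=2, traj_randomize_num=10),
+#       rollout=RandomizeRolloutConfig(total_num=100, num_envs=20),
+#   )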
+RUNNING_CONFIGS: Dict[str, RunningConfig] = {
+    "default": DEFAULT_RUNNING_CONFIG,
+    "demo_video": DEFAULT_RUNNING_CONFIG,
+}
+
+
+def get_randomizer_config(key: str) -> RandomizerConfig:
+    """Return the randomizer config associated with `key`, falling back to the default."""
+    return RUNNING_CONFIGS.get(key, DEFAULT_RUNNING_CONFIG).randomizer
+
+
+def get_rollout_config(key: str) -> RandomizeRolloutConfig:
+    """Return the rollout config for a scene key."""
+    return RUNNING_CONFIGS.get(key, DEFAULT_RUNNING_CONFIG).rollout
+
+
+def get_heuristic_config(key: str) -> HeuristicConfig:
+    """Return the heuristic manipulation config for a scene key."""
+    return RUNNING_CONFIGS.get(key, DEFAULT_RUNNING_CONFIG).heuristic
diff --git a/openreal2sim/simulation/isaaclab-old/demo/sim_agent.sh b/openreal2sim/simulation/isaaclab-old/demo/sim_agent.sh
new file mode 100644
index 0000000..051c7fb
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab-old/demo/sim_agent.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd /app && pwd)"
+
+# Default parameters
+KEY=""
+KEY_FOLDER="${ROOT_DIR}/outputs"
+DEVICE="cuda:0"
+HEADLESS="--headless"
+
+usage() {
+  cat <<'EOF'
+Usage: sim_agent.sh [options]
+
+Options:
+  --key <key>        Single scene key to process (optional)
+  --folder <dir>     Directory whose immediate subfolders are treated as keys
+  --device <device>  Device for simulations, e.g. cuda:0 (default: cuda:0)
+  --no-headless      Disable headless mode (passes through to AppLauncher)
+  -h, --help         Show this message and exit
+
+For each resolved key, the script runs:
+  1. USD conversion (`isaaclab/sim_preprocess/usd_conversion.py`)
+  2. Heuristic manipulation (`isaaclab/demo/sim_heuristic_manip.py`)
+  3. Randomized rollout (`isaaclab/demo/sim_randomize_rollout.py`)
+EOF
+}
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --key)
+      KEY="${2:?Missing value for --key}"
+      shift 2
+      ;;
+    --folder)
+      KEY_FOLDER="${2:?Missing value for --folder}"
+      shift 2
+      ;;
+    --device)
+      DEVICE="${2:?Missing value for --device}"
+      shift 2
+      ;;
+    --no-headless)
+      HEADLESS=""
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "[ERR] Unknown option: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Pipeline setup
+export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
+cd "${ROOT_DIR}"
+
+declare -a PIPELINE=("usd_conversion" "sim_heuristic_manip" "sim_randomize_rollout")
+
+HEADLESS_ARGS=()
+if [[ -n "${HEADLESS}" ]]; then
+  HEADLESS_ARGS+=("${HEADLESS}")
+fi
+
+run_stage() {
+  local stage="$1"
+  local key="$2"
+  # Derive the GPU index from DEVICE (e.g. "cuda:0" -> "0")
+  export CUDA_VISIBLE_DEVICES="${DEVICE#cuda:}"
+  echo "=============================="
+  echo "[RUN] Stage: ${stage} | Key: ${key}"
+  echo "=============================="
+
+  case "${stage}" in
+    usd_conversion)
+      python openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py "${HEADLESS_ARGS[@]}"
+      ;;
+    sim_heuristic_manip)
+      echo "Setting CUDA_VISIBLE_DEVICES to ${CUDA_VISIBLE_DEVICES}"
+      python openreal2sim/simulation/isaaclab/demo/sim_heuristic_manip.py \
+        --key "${key}" \
+        "${HEADLESS_ARGS[@]}"
+      ;;
+    sim_randomize_rollout)
+      python openreal2sim/simulation/isaaclab/demo/sim_randomize_rollout.py \
+        --key "${key}" \
+        "${HEADLESS_ARGS[@]}"
+      ;;
+    *)
+      echo "[ERR] Unsupported stage '${stage}'" >&2
+      exit 1
+      ;;
+  esac
+}
+
+collect_keys() {
+  local -n _out=$1
+  if [[ -n "${KEY}" ]]; then
+    _out+=("${KEY}")
+  fi
+  if [[ -n "${KEY_FOLDER}" ]]; then
+    local folder_abs
+    folder_abs="$(cd "${KEY_FOLDER}" && pwd)"
+    while IFS= read -r -d '' subdir; do
_out+=("$(basename "${subdir}")") + done < <(find "${folder_abs}" -mindepth 1 -maxdepth 1 -type d -print0) + fi +} + +declare -a KEYS=() +collect_keys KEYS + +if [[ ${#KEYS[@]} -eq 0 ]]; then + echo "[ERR] Please specify --key and/or --folder with at least one subdirectory." >&2 + exit 1 +fi + +for k in "${KEYS[@]}"; do + echo "########## Processing key: ${k} ##########" + for stage in "${PIPELINE[@]}"; do + run_stage "${stage}" "${k}" + done +done + +echo "[DONE] Processed keys: ${KEYS[*]}" + diff --git a/openreal2sim/simulation/isaaclab-old/learning/hybrid_ppo.py b/openreal2sim/simulation/isaaclab-old/learning/hybrid_ppo.py new file mode 100755 index 0000000..a5a8ff0 --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/learning/hybrid_ppo.py @@ -0,0 +1,533 @@ +# learning/ppo_hybrid.py +# -*- coding: utf-8 -*- +""" +Hybrid PPO: Categorical (proposal index at t=0) + Gaussian (offset at t=0) + Gaussian (residual for t>0). +Action sent to env is always concatenated as: + [ residual_6 | proposal_onehot_P | offset_6 ] +Env consumes: + - t == 0: uses (proposal_onehot_P, offset_6); ignores residual_6 + - t > 0 : uses residual_6; ignores the rest +""" + +from __future__ import annotations +from pathlib import Path +from typing import Tuple, Iterable, Dict + +import torch +import torch.nn as nn +import torch.optim as optim + + +# ========================= Hybrid Actor-Critic ========================= +class HybridActorCritic(nn.Module): + """ + Policy head structure: + - proposal_logits: (B, P) [Categorical at t=0] + - offset_mu/std: (B, P, 6) [Gaussian at t=0, choose row by sampled idx] + - residual_mu/std: (B, 6) [Gaussian at t>0] + - value: (B,) + """ + def __init__(self, obs_dim: int, num_proposals: int, hidden_sizes: Tuple[int, ...] = (256, 256)): + super().__init__() + self.P = int(num_proposals) + + # Shared trunk + layers = [] + last = obs_dim + for hs in hidden_sizes: + layers += [nn.Linear(last, hs), nn.ReLU(inplace=True)] + last = hs + self.shared = nn.Sequential(*layers) + + # Heads + self.proposal_head = nn.Linear(last, self.P) # logits for Categorical + self.offset_mu = nn.Linear(last, self.P * 6) # per-proposal 6D mean + self.offset_logstd = nn.Parameter(torch.full((self.P, 6), -0.5)) # learnable, proposal-wise + self.residual_mu = nn.Linear(last, 6) # 6D mean for residual + self.residual_logstd = nn.Parameter(torch.full((6,), -0.5)) # learnable + + self.v_head = nn.Linear(last, 1) + + def forward(self, obs: torch.Tensor) -> Dict[str, torch.Tensor]: + """ + Returns a dict of all distribution params and value. 
+ Shapes: + - proposal_logits: (B,P) + - offset_mu: (B,P,6) + - offset_std: (P,6) (broadcast on batch) + - residual_mu: (B,6) + - residual_std: (6,) (broadcast on batch) + - value: (B,) + """ + x = self.shared(obs) + proposal_logits = self.proposal_head(x) # (B,P) + offset_mu = self.offset_mu(x).view(-1, self.P, 6) # (B,P,6) + offset_std = torch.exp(self.offset_logstd) # (P,6) + residual_mu = self.residual_mu(x) # (B,6) + residual_std = torch.exp(self.residual_logstd) # (6,) + value = self.v_head(x).squeeze(-1) + return { + "proposal_logits": proposal_logits, + "offset_mu": offset_mu, + "offset_std": offset_std, + "residual_mu": residual_mu, + "residual_std": residual_std, + "value": value, + } + + @staticmethod + def _one_hot(indices: torch.Tensor, P: int) -> torch.Tensor: + """indices: (B,), return onehot (B,P) float32.""" + B = indices.shape[0] + onehot = torch.zeros(B, P, dtype=torch.float32, device=indices.device) + onehot.scatter_(1, indices.view(-1, 1), 1.0) + return onehot + + def act_sample(self, obs: torch.Tensor, is_step0: bool) -> Dict[str, torch.Tensor]: + """ + Sample action and return everything needed for PPO rollout at this step. + At t=0: sample idx ~ Categorical, offset ~ N(mu[idx], std[idx]), build env_action. + At t>0: sample residual ~ N(mu, std), build env_action with zeros for the rest. + Returns dict with keys: + - env_action: (B, 6 + P + 6) + - logp: (B,) + - entropy: (B,) + - value: (B,) + - cache: any tensors needed for debugging (optional) + """ + outs = self.forward(obs) + P = outs["proposal_logits"].shape[1] + B = obs.shape[0] + + if is_step0: + # Categorical over proposals + cat = torch.distributions.Categorical(logits=outs["proposal_logits"]) + idx = cat.sample() # (B,) + onehot = self._one_hot(idx, P) # (B,P) + + # Gather offset distribution of the selected proposal + mu_sel = outs["offset_mu"][torch.arange(B, device=obs.device), idx, :] # (B,6) + std_sel = outs["offset_std"][idx, :] if outs["offset_std"].dim() == 2 else outs["offset_std"] # (B?,6) + # outs["offset_std"] shape is (P,6), need to gather then expand to (B,6) + std_sel = outs["offset_std"][idx, :] # (B,6) + + gauss_off = torch.distributions.Normal(mu_sel, std_sel) + u_off = gauss_off.rsample() # (B,6) + u_off_tanh = torch.tanh(u_off) # keep action in [-1,1] space for env’s clamp + # Tanh correction + logp_off = gauss_off.log_prob(u_off) - torch.log(1.0 - u_off_tanh.pow(2) + 1e-6) + logp_off = logp_off.sum(-1) # (B,) + + logp = cat.log_prob(idx) + logp_off + + # Entropy (encourage exploration at t=0): categorical + offset gaussian entropy + ent = cat.entropy() + gauss_off.entropy().sum(-1) + + # Residual part zero at t=0 (env ignores it anyway) + residual = torch.zeros(B, 6, dtype=torch.float32, device=obs.device) + + # Build env action: [residual_6 | onehot_P | offset6] + env_action = torch.cat([residual, onehot, u_off_tanh], dim=-1) # (B, 6+P+6) + + return { + "env_action": env_action, + "logp": logp, + "entropy": ent, + "value": outs["value"], + } + else: + # Only residual Gaussian matters + gauss_res = torch.distributions.Normal(outs["residual_mu"], outs["residual_std"]) + u_res = gauss_res.rsample() # (B,6) + a_res = torch.tanh(u_res) # (B,6) + logp_res = gauss_res.log_prob(u_res) - torch.log(1.0 - a_res.pow(2) + 1e-6) + logp_res = logp_res.sum(-1) # (B,) + ent = gauss_res.entropy().sum(-1) # (B,) + + # proposal onehot & offset set to zero placeholders + P = outs["proposal_logits"].shape[1] + onehot = torch.zeros(B, P, dtype=torch.float32, device=obs.device) + off6 = torch.zeros(B, 6, 
dtype=torch.float32, device=obs.device) + + env_action = torch.cat([a_res, onehot, off6], dim=-1) # (B,6+P+6) + return { + "env_action": env_action, + "logp": logp_res, + "entropy": ent, + "value": outs["value"], + } + + def log_prob_recompute(self, obs: torch.Tensor, env_action: torch.Tensor, is_step0: bool) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Recompute log_prob and entropy under NEW network params for PPO update. + env_action: (B, 6+P+6) = [res6 | onehot_P | off6] + Returns (logp, entropy) both (B,) + """ + outs = self.forward(obs) + B = env_action.shape[0] + P = outs["proposal_logits"].shape[1] + + res6 = env_action[:, :6] + onehot = env_action[:, 6:6+P] + off6 = env_action[:, 6+P:] + + if is_step0: + # Recover idx from onehot + idx = onehot.argmax(dim=-1) # (B,) + + # logp for categorical + cat = torch.distributions.Categorical(logits=outs["proposal_logits"]) + logp_cat = cat.log_prob(idx) # (B,) + + # logp for selected offset + mu_sel = outs["offset_mu"][torch.arange(B, device=obs.device), idx, :] # (B,6) + std_sel = outs["offset_std"][idx, :] # (B,6) + gauss_off = torch.distributions.Normal(mu_sel, std_sel) + # Convert bounded action back to pre-squash space via atanh + a = off6.clamp(-0.999999, 0.999999) + atanh_a = 0.5 * (torch.log1p(a + 1e-6) - torch.log1p(-a + 1e-6)) + logp_off = gauss_off.log_prob(atanh_a) - torch.log(1.0 - a.pow(2) + 1e-6) + logp_off = logp_off.sum(-1) + + logp = logp_cat + logp_off + ent = cat.entropy() + gauss_off.entropy().sum(-1) + return logp, ent + else: + gauss_res = torch.distributions.Normal(outs["residual_mu"], outs["residual_std"]) + a = res6.clamp(-0.999999, 0.999999) + atanh_a = 0.5 * (torch.log1p(a + 1e-6) - torch.log1p(-a + 1e-6)) + logp_res = gauss_res.log_prob(atanh_a) - torch.log(1.0 - a.pow(2) + 1e-6) + logp_res = logp_res.sum(-1) + ent = gauss_res.entropy().sum(-1) + return logp_res, ent + + +# ========================= Rollout Buffer (Hybrid) ========================= +class HybridRolloutBuffer: + """ + Stores per-timestep data for PPO with fixed horizon. + We keep the merged env_action (for env stepping and PPO recompute), + together with scalar logp/value/reward/done/timeout. 
+ """ + def __init__(self, horizon: int, num_envs: int, obs_dim: int, act_dim: int, device: torch.device): + self.h, self.n, self.d, self.k = horizon, num_envs, obs_dim, act_dim + self.device = device + self.reset() + + def reset(self): + H, N, D, K, dev = self.h, self.n, self.d, self.k, self.device + self.obs = torch.zeros(H, N, D, device=dev, dtype=torch.float32) + self.actions = torch.zeros(H, N, K, device=dev, dtype=torch.float32) # [res6|1hotP|off6] + self.logp = torch.zeros(H, N, device=dev, dtype=torch.float32) + self.rewards = torch.zeros(H, N, device=dev, dtype=torch.float32) + self.dones = torch.zeros(H, N, device=dev, dtype=torch.bool) + self.values = torch.zeros(H, N, device=dev, dtype=torch.float32) + self.timeouts = torch.zeros(H, N, device=dev, dtype=torch.bool) + self.ptr = 0 + + def add(self, *, obs, env_action, logp, rew, done, val, t_out): + i = self.ptr + self.obs[i] = obs.detach() + self.actions[i] = env_action.detach() + self.logp[i] = logp.detach() + self.rewards[i] = rew.detach().squeeze(-1) + self.dones[i] = done.detach().bool() + self.values[i] = val.detach() + self.timeouts[i] = t_out.detach().bool() + self.ptr += 1 + + @torch.no_grad() + def compute_gae(self, last_value: torch.Tensor, gamma=0.99, lam=0.95): + H, N = self.h, self.n + adv = torch.zeros(H, N, device=self.device, dtype=torch.float32) + last_gae = torch.zeros(N, device=self.device, dtype=torch.float32) + for t in reversed(range(H)): + if t == H - 1: + next_nonterminal = (~self.dones[t] | self.timeouts[t]).float() + next_values = last_value + else: + next_nonterminal = (~self.dones[t + 1] | self.timeouts[t + 1]).float() + next_values = self.values[t + 1] + delta = self.rewards[t] + gamma * next_values * next_nonterminal - self.values[t] + last_gae = delta + gamma * lam * next_nonterminal * last_gae + adv[t] = last_gae + ret = adv + self.values + return adv, ret + + def iterate_minibatches(self, batch_size: int, num_epochs: int) -> Iterable: + H, N = self.h, self.n + total = H * N + obs = self.obs.view(total, -1) + acts = self.actions.view(total, -1) + logp = self.logp.view(total) + values= self.values.view(total) + + def idx_batches(): + idx = torch.randperm(total, device=self.device) + for i in range(0, total, batch_size): + j = idx[i: i + batch_size] + yield j + + for _ in range(num_epochs): + for j in idx_batches(): + yield j, obs[j], acts[j], logp[j], values[j] + + +# ========================= Helpers ========================= +def _as_tensor(x, device: torch.device, dtype: torch.dtype | None = None) -> torch.Tensor: + if isinstance(x, torch.Tensor): + if dtype is not None and x.dtype != dtype: + x = x.to(dtype) + return x.to(device) + return torch.as_tensor(x, device=device, dtype=dtype) + +@torch.no_grad() +def _reset_env(env, device: torch.device): + rst = env.reset() + obs = rst["obs"] + return _as_tensor(obs, device=device, dtype=torch.float32) + +def _choose_minibatch_size(total_batch: int, num_envs: int) -> int: + mb = max(num_envs * 2, 32) + while total_batch % mb != 0 and mb > 8: + mb //= 2 + if total_batch % mb != 0: + mb = num_envs + return mb + + +@torch.no_grad() +def _partial_reset_and_merge_obs(env, next_obs: torch.Tensor, done_t: torch.Tensor, device: torch.device): + """ + If any env is done: + 1) Try env.reset_done(done_mask) which returns {"obs": (M,D)} or (N,D) + 2) Else try env.reset(env_ids=idxs) + 3) Else fallback to global reset() + Then merge the reset obs into next_obs only for those done envs and return merged tensor. 
+ """ + if not bool(done_t.any()): + return next_obs # nothing to do + + idxs = torch.nonzero(done_t, as_tuple=False).squeeze(-1) # (M,) + M = int(idxs.numel()) + if M == 0: + return next_obs + + rst = env.reset(env_ids=idxs) + new_obs = _as_tensor(rst["obs"], device=device, dtype=torch.float32) + if new_obs.shape[0] == next_obs.shape[0]: + return new_obs + merged = next_obs.clone() + merged[idxs] = new_obs + return merged + +# ========================= PPO Train (Hybrid) ========================= +def ppo_train_hybrid( + *, + env, + net: HybridActorCritic, + optimizer: optim.Optimizer, + device: torch.device, + total_epochs: int, + horizon: int, + mini_epochs: int, + batch_size_hint: int, + clip_eps: float, + value_coef: float, + entropy_coef: float, + gamma: float, + gae_lambda: float, + run_dir: Path, + save_every: int = 50, +): + """ + Hybrid PPO training loop: + - t=0 uses Categorical(index) + Gaussian(offset) + - t>0 uses Gaussian(residual) + Assumptions about env: + - action_space.shape[0] == (6 + P + 6) + - step() consumes the onehot & offset at t=0, and residual at t>0 + """ + from torch.utils.tensorboard import SummaryWriter + + print("#####################################horizon =", horizon) + + run_dir.mkdir(parents=True, exist_ok=True) + ckpt_best = run_dir / "best.pt" + + num_envs = getattr(env, "num_envs", None) + if num_envs is None: + raise ValueError("env.num_envs is required for batch sizing.") + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] # 6 + P + 6 + + writer = SummaryWriter(log_dir=run_dir) + + print(f"[INFO][Hybrid PPO] epochs={total_epochs}, horizon={horizon}, num_envs={num_envs}, " + f"obs_dim={obs_dim}, act_dim={act_dim}") + buffer = HybridRolloutBuffer(horizon=horizon, num_envs=num_envs, obs_dim=obs_dim, act_dim=act_dim, device=device) + + total_batch = horizon * num_envs if batch_size_hint <= 0 else int(batch_size_hint) + minibatch_size = _choose_minibatch_size(total_batch, num_envs) + + # initial reset + obs = _reset_env(env, device) + best_mean_return = -1e9 + + for epoch in range(1, total_epochs + 1): + buffer.reset() + + ep_return_per_env = torch.zeros(num_envs, device=device, dtype=torch.float32) + finished_episode_returns = [] # Python list[float] + ep_track_return_per_env = torch.zeros(num_envs, device=device, dtype=torch.float32) + finished_episode_track_returns = [] # Python list[float] + + # ===== rollout ===== + for t in range(horizon): + is_step0 = (t == 0) # because your env uses fixed horizon episodes + out = net.act_sample(obs, is_step0=is_step0) + env_action = out["env_action"].detach() + logp = out["logp"].detach() + val = out["value"].detach() + + step_out = env.step(env_action) # action shape (N, 6+P+6) + next_obs_any, reward_any, done_any, infos = step_out[0]["obs"], step_out[1], step_out[2], step_out[3] + + next_obs = _as_tensor(next_obs_any, device=device, dtype=torch.float32) + rew_t = _as_tensor(reward_any, device=device, dtype=torch.float32) + done_t = _as_tensor(done_any, device=device, dtype=torch.bool) + t_out = _as_tensor(infos.get("time_outs", torch.zeros_like(done_t)), device=device, dtype=torch.bool) + + buffer.add(obs=obs, env_action=env_action, logp=logp, rew=rew_t, done=done_t, val=val, t_out=t_out) + + # episode return bookkeeping + r_track_t = _as_tensor(infos["r_track"], device=device, dtype=torch.float32).view_as(rew_t) + ep_return_per_env += rew_t.squeeze(1) # (N,) + ep_track_return_per_env += r_track_t.squeeze(1) # (N,) + + done_or_timeout = (done_t | t_out) # (N,) + if 
bool(done_or_timeout.any()): + finished_episode_returns.extend( + ep_return_per_env[done_or_timeout].detach().cpu().tolist() + ) + finished_episode_track_returns.extend( + ep_track_return_per_env[done_or_timeout].detach().cpu().tolist() + ) + ep_return_per_env[done_or_timeout] = 0.0 + ep_track_return_per_env[done_or_timeout] = 0.0 + + # reset env if done or timeout + next_obs = _partial_reset_and_merge_obs(env, next_obs, done_t, device) + obs = next_obs + + # ===== bootstrap value ===== + with torch.no_grad(): + last_v = net.forward(obs)["value"] # (N,) + + # ===== GAE / returns ===== + adv, ret = buffer.compute_gae(last_value=last_v.detach(), gamma=gamma, lam=gae_lambda) + + # normalize advantage + adv_flat = adv.view(-1) + ret_flat = ret.view(-1) + adv_flat = (adv_flat - torch.mean(adv_flat)) / (torch.std(adv_flat) + 1e-8) + + # ===== PPO updates ===== + policy_loss_epoch = 0.0 + value_loss_epoch = 0.0 + entropy_epoch = 0.0 + num_updates = 0 + + for idx, obs_b, act_b, old_logp_b, old_v_b in buffer.iterate_minibatches(minibatch_size, mini_epochs): + # Figure out which rows are step0 in this flattened view: + # Original layout is (H,N) -> flattened to (H*N,) + # Rows [0*N : 1*N) correspond to t=0 + H, N = buffer.h, buffer.n + is_step0_mask = (idx // N) == 0 # boolean per-sample + + # Recompute logp & entropy under NEW params (mixture by timestep type) + logp_new = torch.empty_like(old_logp_b) + ent_new = torch.empty_like(old_logp_b) + + if bool(is_step0_mask.any()): + mask = is_step0_mask + lp0, en0 = net.log_prob_recompute(obs_b[mask], act_b[mask], is_step0=True) + logp_new[mask] = lp0 + ent_new[mask] = en0 + if bool((~is_step0_mask).any()): + mask = ~is_step0_mask + lp1, en1 = net.log_prob_recompute(obs_b[mask], act_b[mask], is_step0=False) + logp_new[mask] = lp1 + ent_new[mask] = en1 + + ratio = torch.exp(logp_new - old_logp_b) + + adv_b = adv_flat[idx] + ret_b = ret_flat[idx] + + surrogate1 = ratio * adv_b + surrogate2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv_b + policy_loss = -torch.min(surrogate1, surrogate2).mean() + + # Clipped value loss (same as your original) + # Re-run value head: + v_b = net.forward(obs_b)["value"] + value_clipped = old_v_b + (v_b - old_v_b).clamp(-clip_eps, clip_eps) + value_losses = (v_b - ret_b).pow(2) + value_losses_clipped = (value_clipped - ret_b).pow(2) + value_loss = 0.5 * torch.max(value_losses, value_losses_clipped).mean() + + loss = policy_loss + value_coef * value_loss - entropy_coef * ent_new.mean() + + optimizer.zero_grad(set_to_none=True) + loss.backward() + nn.utils.clip_grad_norm_(net.parameters(), 0.5) + optimizer.step() + + policy_loss_epoch += float(policy_loss.detach()) + value_loss_epoch += float(value_loss.detach()) + entropy_epoch += float(ent_new.mean().detach()) + num_updates += 1 + + mean_return = float(ret.mean().detach()) + print(f"[E{epoch:04d}] return={mean_return:7.3f} " + f"pi={policy_loss_epoch/max(1,num_updates):7.4f} " + f"v={value_loss_epoch/max(1,num_updates):7.4f} " + f"H={entropy_epoch/max(1,num_updates):7.4f}") + + # Explained variance + ev = float(1.0 - torch.var(ret_flat - buffer.values.view(-1)) / (torch.var(ret_flat) + 1e-12)) + + # TB logs + writer.add_scalar("return_mean", mean_return, epoch) + writer.add_scalar("policy_loss", policy_loss_epoch / max(1, num_updates), epoch) + writer.add_scalar("value_loss", value_loss_epoch / max(1, num_updates), epoch) + writer.add_scalar("entropy", entropy_epoch / max(1, num_updates), epoch) + writer.add_scalar("explained_variance", ev, epoch) + if 
len(finished_episode_returns) > 0: + mean_ep_return = float(sum(finished_episode_returns) / len(finished_episode_returns)) + writer.add_scalar("episode_return_mean", mean_ep_return, epoch) + if len(finished_episode_track_returns) > 0: + mean_ep_track_return = float(sum(finished_episode_track_returns) / len(finished_episode_track_returns)) + writer.add_scalar("episode_tracking_return_mean", mean_ep_track_return, epoch) + + # Save best & periodic + if mean_return > best_mean_return: + best_mean_return = mean_return + torch.save({ + "model": net.state_dict(), + "optimizer": optimizer.state_dict(), + "epoch": epoch, + "best_return": best_mean_return + }, ckpt_best) + + if (save_every > 0) and (epoch % save_every == 0): + ckpt_path = run_dir / f"epoch_{epoch:04d}_ret_{mean_return:.2f}.pt" + torch.save({ + "model": net.state_dict(), + "optimizer": optimizer.state_dict(), + "epoch": epoch, + "return": mean_return + }, ckpt_path) + + return {"best_mean_return": best_mean_return, "ckpt_best": ckpt_best} diff --git a/openreal2sim/simulation/isaaclab-old/learning/simple_ppo.py b/openreal2sim/simulation/isaaclab-old/learning/simple_ppo.py new file mode 100755 index 0000000..0cfbf76 --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/learning/simple_ppo.py @@ -0,0 +1,331 @@ +# ppo_utils.py +from __future__ import annotations +from pathlib import Path +from typing import Tuple, Iterable +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.tensorboard import SummaryWriter + +# ========================= Actor-Critic ========================= +class ActorCritic(nn.Module): + """Simple MLP actor-critic. Actor outputs mean in pre-squash space; action = tanh(N(mu, sigma)).""" + def __init__(self, obs_dim: int, act_dim: int, hidden_sizes: Tuple[int, ...] 
= (256, 256)): + super().__init__() + layers = [] + last = obs_dim + for hs in hidden_sizes: + layers += [nn.Linear(last, hs), nn.ReLU(inplace=True)] + last = hs + self.shared = nn.Sequential(*layers) + self.mu_head = nn.Linear(last, act_dim) # pre-squash mean + self.log_std = nn.Parameter(torch.full((act_dim,), -0.5)) # trainable log_std + self.v_head = nn.Linear(last, 1) + + def forward(self, obs: torch.Tensor): + """obs: (B, obs_dim) -> mu: (B, act_dim), std: (act_dim,), v: (B,)""" + x = self.shared(obs) + mu = self.mu_head(x) + v = self.v_head(x).squeeze(-1) + std = torch.exp(self.log_std) + return mu, std, v + + @staticmethod + def sample_tanh_gaussian(mu: torch.Tensor, std: torch.Tensor): + """Return action in [-1,1], log_prob with tanh correction, and pre-squash sample u.""" + dist = torch.distributions.Normal(mu, std) + u = dist.rsample() # reparameterized + a = torch.tanh(u) + # log|det(Jacobian of tanh)| = sum log(1 - tanh(u)^2) + log_prob = dist.log_prob(u) - torch.log(1.0 - a.pow(2) + 1e-6) + return a, log_prob.sum(-1), u + + @staticmethod + def log_prob_tanh_gaussian(mu: torch.Tensor, std: torch.Tensor, a: torch.Tensor): + """Compute log_prob(a) for tanh(N(mu,std)) using atanh(a).""" + a = a.clamp(min=-0.999999, max=0.999999) + atanh_a = 0.5 * (torch.log1p(a + 1e-6) - torch.log1p(-a + 1e-6)) + dist = torch.distributions.Normal(mu, std) + log_prob = dist.log_prob(atanh_a) - torch.log(1.0 - a.pow(2) + 1e-6) + return log_prob.sum(-1) + + +# ========================= Rollout Buffer ========================= +class RolloutBuffer: + """On-policy PPO rollout buffer with fixed horizon (all tensors on a single device).""" + def __init__(self, horizon: int, num_envs: int, obs_dim: int, act_dim: int, device: torch.device): + self.h, self.n, self.d, self.k = horizon, num_envs, obs_dim, act_dim + self.device = device + self.reset() + + def reset(self): + H, N, D, K, dev = self.h, self.n, self.d, self.k, self.device + self.obs = torch.zeros(H, N, D, device=dev, dtype=torch.float32) + self.actions = torch.zeros(H, N, K, device=dev, dtype=torch.float32) + self.logp = torch.zeros(H, N, device=dev, dtype=torch.float32) + self.rewards = torch.zeros(H, N, device=dev, dtype=torch.float32) + self.dones = torch.zeros(H, N, device=dev, dtype=torch.bool) + self.values = torch.zeros(H, N, device=dev, dtype=torch.float32) + self.timeouts = torch.zeros(H, N, device=dev, dtype=torch.bool) + self.ptr = 0 + + def add(self, *, obs, act, logp, rew, done, val, t_out): + """Store DETACHED copies to keep env and rollout outside the grad graph.""" + i = self.ptr + self.obs[i] = obs.detach() + self.actions[i] = act.detach() + self.logp[i] = logp.detach() + self.rewards[i] = rew.detach().squeeze(-1) # allow (B,1) or (B,) + self.dones[i] = done.detach().bool() + self.values[i] = val.detach() + self.timeouts[i] = t_out.detach().bool() + self.ptr += 1 + + @torch.no_grad() # make GAE graph-free + def compute_gae(self, last_value: torch.Tensor, gamma=0.99, lam=0.95): + """last_value: (N,)""" + H, N = self.h, self.n + adv = torch.zeros(H, N, device=self.device, dtype=torch.float32) + last_gae = torch.zeros(N, device=self.device, dtype=torch.float32) + for t in reversed(range(H)): + if t == H - 1: + next_nonterminal = (~self.dones[t] | self.timeouts[t]).float() + next_values = last_value + else: + next_nonterminal = (~self.dones[t + 1] | self.timeouts[t + 1]).float() + next_values = self.values[t + 1] + delta = self.rewards[t] + gamma * next_values * next_nonterminal - self.values[t] + last_gae = delta + gamma * lam * 
next_nonterminal * last_gae + adv[t] = last_gae + ret = adv + self.values + return adv, ret + + def iterate_minibatches(self, batch_size: int, num_epochs: int) -> Iterable: + H, N = self.h, self.n + total = H * N + obs = self.obs.view(total, -1) + acts = self.actions.view(total, -1) + logp = self.logp.view(total) + values = self.values.view(total) + + def idx_batches(): + idx = torch.randperm(total, device=self.device) + for i in range(0, total, batch_size): + j = idx[i : i + batch_size] + yield j + + for _ in range(num_epochs): + for j in idx_batches(): + yield j, obs[j], acts[j], logp[j], values[j] + + +# ========================= Helpers ========================= +def _as_tensor(x, device: torch.device, dtype: torch.dtype | None = None) -> torch.Tensor: + """Convert numpy/torch to torch on device with optional dtype, avoiding copies when possible.""" + if isinstance(x, torch.Tensor): + if dtype is not None and x.dtype != dtype: + x = x.to(dtype) + return x.to(device) + return torch.as_tensor(x, device=device, dtype=dtype) + + +@torch.no_grad() +def _reset_env(env, device: torch.device): + """Support env.reset() that returns torch or numpy.""" + rst = env.reset() + obs = rst["obs"] + return _as_tensor(obs, device=device, dtype=torch.float32) + + +def _choose_minibatch_size(total_batch: int, num_envs: int) -> int: + """Heuristic minibatch sizing that divides total_batch.""" + mb = max(num_envs * 2, 32) + while total_batch % mb != 0 and mb > 8: + mb //= 2 + if total_batch % mb != 0: + mb = num_envs + return mb + + +@torch.no_grad() +def _partial_reset_and_merge_obs(env, next_obs: torch.Tensor, done_t: torch.Tensor, device: torch.device): + """ + If any env is done, call env.reset(env_ids=idxs) for the done envs and merge the + returned observations into next_obs at those indices only. If the env returns a + full (N,D) observation batch instead of (M,D), it replaces next_obs outright. + """ + if not bool(done_t.any()): + return next_obs # nothing to do + + idxs = torch.nonzero(done_t, as_tuple=False).squeeze(-1) # (M,) + M = int(idxs.numel()) + if M == 0: + return next_obs + + rst = env.reset(env_ids=idxs) + new_obs = _as_tensor(rst["obs"], device=device, dtype=torch.float32) + if new_obs.shape[0] == next_obs.shape[0]: + return new_obs + merged = next_obs.clone() + merged[idxs] = new_obs + return merged + +# ========================= PPO Train ========================= +def ppo_train( + *, + env, + net: ActorCritic, + optimizer: optim.Optimizer, + device: torch.device, + total_epochs: int, + horizon: int, + mini_epochs: int, + batch_size_hint: int, + clip_eps: float, + value_coef: float, + entropy_coef: float, + gamma: float, + gae_lambda: float, + run_dir: Path, + save_every: int = 50, +): + """ + Generic PPO training loop.
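+ + Update rule per minibatch (clipped surrogate): with ratio r = exp(logp_new - logp_old), + loss = -E[min(r*A, clip(r, 1-clip_eps, 1+clip_eps)*A)] + value_coef*value_loss - entropy_coef*H. + + Example call (hypothetical values; any env matching the contract below works): + net = ActorCritic(obs_dim, act_dim).to(device) + opt = optim.Adam(net.parameters(), lr=3e-4) + ppo_train(env=env, net=net, optimizer=opt, device=device, total_epochs=100, + horizon=64, mini_epochs=4, batch_size_hint=0, clip_eps=0.2, value_coef=0.5, + entropy_coef=0.01, gamma=0.99, gae_lambda=0.95, run_dir=Path("runs/ppo")) +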
+ Assumes: + - env.reset() -> {"obs": torch or numpy, shape (N,D)} + - env.step(action) -> + ({"obs": torch/numpy (N,D)}, reward (N,1)/(N,), done (N,), {"time_outs": (N,)}) + - Partial reset: + env.reset(env_ids=idxs) resets only the done envs (see _partial_reset_and_merge_obs) + """ + run_dir.mkdir(parents=True, exist_ok=True) + ckpt_best = run_dir / "best.pt" + + num_envs = getattr(env, "num_envs", None) + if num_envs is None: + raise ValueError("env.num_envs is required for batch sizing.") + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] + + writer = SummaryWriter(log_dir=run_dir) + + print(f"[INFO] PPO training: epochs={total_epochs}, horizon={horizon}, num_envs={num_envs}, " + f"obs_dim={obs_dim}, act_dim={act_dim}") + buffer = RolloutBuffer(horizon=horizon, num_envs=num_envs, obs_dim=obs_dim, act_dim=act_dim, device=device) + + total_batch = horizon * num_envs if batch_size_hint <= 0 else int(batch_size_hint) + minibatch_size = _choose_minibatch_size(total_batch, num_envs) + + # initial reset + obs = _reset_env(env, device) + best_mean_return = -1e9 + + for epoch in range(1, total_epochs + 1): + buffer.reset() + + # ===== rollout ===== + for t in range(horizon): + mu, std, v = net(obs) + act, logp, _ = net.sample_tanh_gaussian(mu, std) # act in [-1,1], (N,K) + + # IMPORTANT: do not let env/sim see gradients + act_no_grad = act.detach() + + step_out = env.step(act_no_grad) + next_obs_any, reward_any, done_any, infos = step_out[0]["obs"], step_out[1], step_out[2], step_out[3] + + next_obs = _as_tensor(next_obs_any, device=device, dtype=torch.float32) + rew_t = _as_tensor(reward_any, device=device, dtype=torch.float32) # (N,1) or (N,) + done_t = _as_tensor(done_any, device=device, dtype=torch.bool) # (N,) + t_out = _as_tensor(infos.get("time_outs", torch.zeros_like(done_t)), device=device, dtype=torch.bool) + + buffer.add(obs=obs, act=act_no_grad, logp=logp, rew=rew_t, done=done_t, val=v, t_out=t_out) + + # partial reset (merge only done envs' obs) + next_obs = _partial_reset_and_merge_obs(env, next_obs, done_t, device) + + obs = next_obs + + # ===== bootstrap value ===== + _, _, last_v = net(obs) # (N,) + + # ===== GAE / returns (graph-free) ===== + adv, ret = buffer.compute_gae(last_value=last_v.detach(), gamma=gamma, lam=gae_lambda) + + # normalize advantage + adv_flat = adv.view(-1) + ret_flat = ret.view(-1) + adv_flat = (adv_flat - torch.mean(adv_flat)) / (torch.std(adv_flat) + 1e-8) + + # ===== PPO updates ===== + policy_loss_epoch = 0.0 + value_loss_epoch = 0.0 + entropy_epoch = 0.0 + num_updates = 0 + + for idx, obs_b, act_b, old_logp_b, old_v_b in buffer.iterate_minibatches(minibatch_size, mini_epochs): + adv_b = adv_flat[idx] + ret_b = ret_flat[idx] + + mu_b, std_b, v_b = net(obs_b) + new_logp_b = net.log_prob_tanh_gaussian(mu_b, std_b, act_b) + ratio = torch.exp(new_logp_b - old_logp_b) + + surrogate1 = ratio * adv_b + surrogate2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv_b + policy_loss = -torch.min(surrogate1, surrogate2).mean() + + value_clipped = old_v_b + (v_b - old_v_b).clamp(-clip_eps, clip_eps) + value_losses = (v_b - ret_b).pow(2) + value_losses_clipped = (value_clipped - ret_b).pow(2) + value_loss = 0.5 * torch.max(value_losses, value_losses_clipped).mean() + + base_dist = torch.distributions.Normal(mu_b, std_b) + entropy = base_dist.entropy().sum(-1).mean() + + loss = policy_loss + value_coef * value_loss - entropy_coef * entropy + + optimizer.zero_grad(set_to_none=True) + loss.backward() +
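# Clip the global gradient norm at 0.5 (a common PPO default) to keep updates stable. +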
nn.utils.clip_grad_norm_(net.parameters(), 0.5) + optimizer.step() + + policy_loss_epoch += float(policy_loss.detach()) + value_loss_epoch += float(value_loss.detach()) + entropy_epoch += float(entropy.detach()) + num_updates += 1 + + mean_return = float(ret.mean().detach()) + print(f"[E{epoch:04d}] return={mean_return:7.3f} " + f"pi={policy_loss_epoch/max(1,num_updates):7.4f} " + f"v={value_loss_epoch/max(1,num_updates):7.4f} " + f"H={entropy_epoch/max(1,num_updates):7.4f}") + + # Value head quality: explained variance on flattened batch + ev = float(1.0 - torch.var(ret_flat - buffer.values.view(-1)) / (torch.var(ret_flat) + 1e-12)) + + writer.add_scalar("return_mean", mean_return, epoch) + writer.add_scalar("policy_loss", policy_loss_epoch / max(1,num_updates), epoch) + writer.add_scalar("value_loss", value_loss_epoch / max(1,num_updates), epoch) + writer.add_scalar("entropy", entropy_epoch / max(1,num_updates), epoch) + writer.add_scalar("explained_variance", ev, epoch) + + # save best & periodic + if mean_return > best_mean_return: + best_mean_return = mean_return + torch.save({"model": net.state_dict(), + "optimizer": optimizer.state_dict(), + "epoch": epoch, + "best_return": best_mean_return}, + ckpt_best) + if (save_every > 0) and (epoch % save_every == 0): + ckpt_path = run_dir / f"epoch_{epoch:04d}_ret_{mean_return:.2f}.pt" + torch.save({"model": net.state_dict(), + "optimizer": optimizer.state_dict(), + "epoch": epoch, + "return": mean_return}, + ckpt_path) + + return {"best_mean_return": best_mean_return, "ckpt_best": ckpt_best} diff --git a/openreal2sim/simulation/isaaclab-old/sim_base.py b/openreal2sim/simulation/isaaclab-old/sim_base.py new file mode 100755 index 0000000..987a0f2 --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/sim_base.py @@ -0,0 +1,951 @@ +from __future__ import annotations + +import json +import os +import random +from pathlib import Path +from typing import Any, Dict, List, Optional + + +import curobo +import imageio + +# Isaac Lab +import isaaclab.sim as sim_utils +import numpy as np +import torch +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import get_robot_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import ( + MotionGen, + MotionGenConfig, + MotionGenPlanConfig, +) +from isaaclab.controllers import DifferentialIKController, DifferentialIKControllerCfg +from isaaclab.devices import Se3Gamepad, Se3Keyboard, Se3SpaceMouse +from isaaclab.managers import SceneEntityCfg +from isaaclab.markers import VisualizationMarkers, VisualizationMarkersCfg +from isaaclab.sensors.camera import Camera +from isaaclab.utils.assets import ISAAC_NUCLEUS_DIR +from isaaclab.utils.math import ( + subtract_frame_transforms, + transform_points, + unproject_depth, +) + +##-- for sim background to real background video composition-- +from PIL import Image +import cv2 + + + +def get_next_demo_id(demo_root: Path) -> int: + if not demo_root.exists(): + return 0 + demo_ids = [] + for name in os.listdir(demo_root): + if name.startswith("demo_"): + try: + demo_ids.append(int(name.split("_")[1])) + except Exception: + pass + return max(demo_ids) + 1 if demo_ids else 0 + + +class BaseSimulator: + """ + Base class for robot simulation. 
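+ Provides joint-space and IK end-effector control, curobo motion planning, partial env resets, camera observation capture, and demo recording/saving.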
+ + Attributes: + self.sim, self.sim_cfgs, self.scene, self.sim_dt, self.num_envs + self.robot, self.robot_pose, self.object_prim, self.other_object_prims, self.background_prim + self.diff_ik_cfg, self.diff_ik_controller + self.robot_entity_cfg, self.robot_gripper_cfg + self.gripper_open_tensor, self.gripper_close_tensor + self.ee_jacobi_idx, self.count, self.demo_id + self.camera, self.cam_dict, self.save_dict + self.selected_object_id, self.debug_level, self.goal_vis_list + self.enable_motion_planning, self.motion_gen (when planning is enabled) + """ + + def __init__( + self, + sim: sim_utils.SimulationContext, + scene: Any, # InteractiveScene + *, + args, + sim_cfgs: Dict, + robot_pose: torch.Tensor, + cam_dict: Dict, + out_dir: Path, + img_folder: str, + set_physics_props: bool = True, + enable_motion_planning: bool = True, + debug_level: int = 1, + ) -> None: + # basic simulation setup + self.sim: sim_utils.SimulationContext = sim + self.sim_cfgs = sim_cfgs + self.scene = scene + self.sim_dt = sim.get_physics_dt() + + self.num_envs: int = int(scene.num_envs) + self._all_env_ids = torch.arange( + self.num_envs, device=sim.device, dtype=torch.long + ) + + self.cam_dict = cam_dict + self.out_dir: Path = out_dir + self.img_folder: str = img_folder + + # scene entities + self.robot = scene["robot"] + if robot_pose.ndim == 1: + self.robot_pose = ( + robot_pose.view(1, -1).repeat(self.num_envs, 1).to(self.robot.device) + ) + else: + assert robot_pose.shape[0] == self.num_envs and robot_pose.shape[1] == 7, ( + f"robot_pose must be [B,7], got {robot_pose.shape}" + ) + self.robot_pose = robot_pose.to(self.robot.device).contiguous() + + self.object_prim = scene["object_00"] + self.other_object_prims = [ + scene[key] + for key in scene.keys() + if "object_" in key and key != "object_00" + ] + self.background_prim = scene["background"] + self.camera: Camera = scene["camera"] + + # physics properties + if set_physics_props: + static_friction = 5.0 + dynamic_friction = 5.0 + restitution = 0.0 + + # object: rigid prim -> has root_physx_view + if ( + hasattr(self.object_prim, "root_physx_view") + and self.object_prim.root_physx_view is not None + ): + obj_view = self.object_prim.root_physx_view + obj_mats = obj_view.get_material_properties() + vals_obj = torch.tensor( + [static_friction, dynamic_friction, restitution], + device=obj_mats.device, + dtype=obj_mats.dtype, + ) + obj_mats[:] = vals_obj + obj_view.set_material_properties( + obj_mats, self._all_env_ids.to(obj_mats.device) + ) + + # background: GroundPlaneCfg -> XFormPrim (no root_physx_view); skip if unavailable + if ( + hasattr(self.background_prim, "root_physx_view") + and self.background_prim.root_physx_view is not None + ): + bg_view = self.background_prim.root_physx_view + bg_mats = bg_view.get_material_properties() + vals_bg = torch.tensor( + [static_friction, dynamic_friction, restitution], + device=bg_mats.device, + dtype=bg_mats.dtype, + ) + bg_mats[:] = vals_bg + bg_view.set_material_properties( + bg_mats, self._all_env_ids.to(bg_mats.device) + ) + + # ik controller + self.diff_ik_cfg = DifferentialIKControllerCfg( + command_type="pose", use_relative_mode=False, ik_method="dls" + ) + self.diff_ik_controller = DifferentialIKController( + self.diff_ik_cfg, num_envs=self.num_envs, device=sim.device + ) + + # robot: joints / gripper / jacobian indices + self.robot_entity_cfg = SceneEntityCfg( + "robot", joint_names=["panda_joint.*"], body_names=["panda_hand"] + ) + self.robot_gripper_cfg = SceneEntityCfg( + "robot", joint_names=["panda_finger_joint.*"],
body_names=["panda_hand"] + ) + self.robot_entity_cfg.resolve(scene) + self.robot_gripper_cfg.resolve(scene) + self.gripper_open_tensor = 0.04 * torch.ones( + (self.num_envs, len(self.robot_gripper_cfg.joint_ids)), + device=self.robot.device, + ) + self.gripper_close_tensor = torch.zeros( + (self.num_envs, len(self.robot_gripper_cfg.joint_ids)), + device=self.robot.device, + ) + if self.robot.is_fixed_base: + self.ee_jacobi_idx = self.robot_entity_cfg.body_ids[0] - 1 + else: + self.ee_jacobi_idx = self.robot_entity_cfg.body_ids[0] + + # demo count and data saving + self.count = 0 + self.demo_id = 0 + self.save_dict = { + "rgb": [], + "depth": [], + "segmask": [], + "joint_pos": [], + "joint_vel": [], + "actions": [], + "gripper_pos": [], + "gripper_cmd": [], + } + + # visualization + self.selected_object_id = 0 + self.debug_level = debug_level + + self.goal_vis_list = [] + if self.debug_level > 0: + for b in range(self.num_envs): + cfg = VisualizationMarkersCfg( + prim_path=f"/Visuals/ee_goal/env_{b:03d}", + markers={ + "frame": sim_utils.UsdFileCfg( + usd_path=f"{ISAAC_NUCLEUS_DIR}/Props/UIElements/frame_prim.usd", + scale=(0.06, 0.06, 0.06), + visual_material=sim_utils.PreviewSurfaceCfg( + diffuse_color=(1.0, 0.0, 0.0) + ), + ), + }, + ) + self.goal_vis_list.append(VisualizationMarkers(cfg)) + + # curobo motion planning + self.enable_motion_planning = enable_motion_planning + if self.enable_motion_planning: + print(f"prepare curobo motion planning: {enable_motion_planning}") + self.prepare_curobo() + print("curobo motion planning ready.") + + # -------- Curobo Motion Planning ---------- + def prepare_curobo(self): + setup_curobo_logger("error") + # tensor_args = TensorDeviceType() + tensor_args = TensorDeviceType(device=self.sim.device, dtype=torch.float32) + curobo_path = curobo.__file__.split("/__init__")[0] + robot_file = f"{curobo_path}/content/configs/robot/franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg=robot_file, + world_model=None, + tensor_args=tensor_args, + interpolation_dt=self.sim_dt, + use_cuda_graph=True if self.num_envs == 1 else False, + ) + self.motion_gen = MotionGen(motion_gen_config) + if self.num_envs == 1: + self.motion_gen.warmup(enable_graph=True) + _ = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"], + tensor_args, + ) + + # ---------- Helpers ---------- + def _ensure_batch_pose(self, p, q): + """Ensure position [B,3], quaternion [B,4] on device.""" + B = self.scene.num_envs + p = torch.as_tensor(p, dtype=torch.float32, device=self.sim.device) + q = torch.as_tensor(q, dtype=torch.float32, device=self.sim.device) + if p.ndim == 1: + p = p.view(1, -1).repeat(B, 1) + if q.ndim == 1: + q = q.view(1, -1).repeat(B, 1) + return p.contiguous(), q.contiguous() + + def _traj_to_BT7(self, traj): + """Normalize various curobo traj.position shapes to [B, T, 7].""" + B = self.scene.num_envs + pos = traj.position # torch or numpy + pos = torch.as_tensor(pos, device=self.sim.device, dtype=torch.float32) + + if pos.ndim == 3: + # candidate shapes: [B,T,7] or [T,B,7] + if pos.shape[0] == B and pos.shape[-1] == 7: + return pos # [B,T,7] + if pos.shape[1] == B and pos.shape[-1] == 7: + return pos.permute(1, 0, 2).contiguous() # [B,T,7] + elif pos.ndim == 2 and pos.shape[-1] == 7: + # [T,7] → broadcast to all envs + return pos.unsqueeze(0).repeat(B, 1, 1) + # Fallback: flatten and infer + flat = pos.reshape(-1, 7) # [B*T,7] + T = flat.shape[0] // B + return flat.view(B, T, 7).contiguous() + + # 
---------- Planning / Execution (Single) ---------- + def motion_planning_single( + self, position, quaternion, max_attempts=1, use_graph=True + ): + """ + Single-environment planning: prefer plan_single (better graph / CUDA-graph warmup support). + Returns ([1, T, 7], success); on failure the current joint position is returned as a [1, 1, 7] trajectory. + """ + # current joint position + joint_pos0 = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids][ + 0:1 + ].contiguous() # [1,7] + start_state = JointState.from_position(joint_pos0) + + # goal (ensure [1,3]/[1,4]) + pos_b, quat_b = self._ensure_batch_pose(position, quaternion) + pos_b = pos_b[0:1] + quat_b = quat_b[0:1] + goal_pose = Pose(position=pos_b, quaternion=quat_b) + + plan_cfg = MotionGenPlanConfig( + max_attempts=max_attempts, enable_graph=use_graph + ) + + result = self.motion_gen.plan_single(start_state, goal_pose, plan_cfg) + + traj = result.get_interpolated_plan() # JointState + + if bool(result.success[0]): + T = traj.position.shape[-2] + BT7 = ( + traj.position.to(self.sim.device).to(torch.float32).unsqueeze(0) + ) # [1,T,7] + else: + print("[WARN] motion planning failed.") + BT7 = joint_pos0.unsqueeze(1) # [1,1,7] + + return BT7, result.success + + # ---------- Planning / Execution (Batched) ---------- + def motion_planning_batch( + self, position, quaternion, max_attempts=1, allow_graph=False + ): + """ + Multi-environment planning via plan_batch. + Envs that fail keep their current joint position as a one-step trajectory; successful + envs are padded to the longest horizon by repeating their last waypoint. + Returns ([B, T, 7], success). + """ + B = self.scene.num_envs + joint_pos = self.robot.data.joint_pos[ + :, self.robot_entity_cfg.joint_ids + ].contiguous() # [B,7] + start_state = JointState.from_position(joint_pos) + + pos_b, quat_b = self._ensure_batch_pose(position, quaternion) # [B,3], [B,4] + goal_pose = Pose(position=pos_b, quaternion=quat_b) + + plan_cfg = MotionGenPlanConfig( + max_attempts=max_attempts, enable_graph=allow_graph + ) + + result = self.motion_gen.plan_batch(start_state, goal_pose, plan_cfg) + + try: + paths = result.get_paths() # List[JointState] + T_max = 1 + for i, p in enumerate(paths): + if not result.success[i]: + print(f"[WARN] motion planning failed for env {i}.") + else: + T_max = max(T_max, int(p.position.shape[-2])) + dof = joint_pos.shape[-1] + BT7 = torch.zeros( + (B, T_max, dof), device=self.sim.device, dtype=torch.float32 + ) + for i, p in enumerate(paths): + if not result.success[i]: + BT7[i, :, :] = ( + joint_pos[i : i + 1, :].unsqueeze(1).repeat(1, T_max, 1) + ) + else: + Ti = p.position.shape[-2] + BT7[i, :Ti, :] = p.position.to(self.sim.device).to(torch.float32) + if Ti < T_max: + BT7[i, Ti:, :] = BT7[i, Ti - 1 : Ti, :] + except Exception as e: + print(f"[WARN] motion planning all failed with exception: {e}") + success = torch.zeros( + B, dtype=torch.bool, device=self.sim.device + ) # set to all false + BT7 = joint_pos.unsqueeze(1) # [B,1,7] + return BT7, success # bail out early; result.success is not trustworthy here + + # sanity-check the planner output + if result.success is None or result.success.shape[0] != B: + print(f"[WARN] motion planning success errors: {result.success}") + success = torch.zeros( + B, dtype=torch.bool, device=self.sim.device + ) # set to all false + BT7 = joint_pos.unsqueeze(1) # [B,1,7] + else: + success = result.success + if BT7.shape[0] != B or BT7.shape[2] != joint_pos.shape[1]: + print( + f"[WARN] motion planning traj dim mismatch: {BT7.shape} vs {[B, 'T', joint_pos.shape[1]]}" + ) + BT7 = joint_pos.unsqueeze(1) # [B,1,7] + + return BT7, success + + def motion_planning(self, position, quaternion, max_attempts=1): + if self.scene.num_envs == 1: + return
self.motion_planning_single( + position, quaternion, max_attempts=max_attempts, use_graph=True + ) + else: + return self.motion_planning_batch( + position, quaternion, max_attempts=max_attempts, allow_graph=False + ) + + def move_to_motion_planning( + self, + position: torch.Tensor, + quaternion: torch.Tensor, + gripper_open: bool = True, + record: bool = True, + ) -> torch.Tensor: + """ + Cartesian space control: Move the end effector to the desired position and orientation using motion planning. + Works with batched envs. If inputs are 1D, they will be broadcast to all envs. + """ + traj, success = self.motion_planning(position, quaternion) + BT7 = traj + T = BT7.shape[1] + last = None + for i in range(T): + joint_pos_des = BT7[:, i, :] # [B,7] + self.apply_actions(joint_pos_des, gripper_open=gripper_open) + obs = self.get_observation(gripper_open=gripper_open) + if record: + self.record_data(obs) + last = joint_pos_des + return last, success + + def compose_real_video(self, env_id: int = 0): + """ + Composite simulated video onto real background using mask-based rendering. + + Args: + env_id: Environment ID to process + """ + + def pad_to_even(frame): + """Pad frame to even dimensions for video encoding.""" + H, W = frame.shape[:2] + pad_h = H % 2 + pad_w = W % 2 + if pad_h or pad_w: + pad = ((0, pad_h), (0, pad_w)) if frame.ndim == 2 else ((0, pad_h), (0, pad_w), (0, 0)) + frame = np.pad(frame, pad, mode='edge') + return frame + + # Construct paths + base_path = self.out_dir / self.img_folder + demo_path = self._demo_dir() / f"env_{env_id:03d}" + + SIM_VIDEO_PATH = demo_path / "sim_video.mp4" + MASK_VIDEO_PATH = demo_path / "mask_video.mp4" + REAL_BACKGROUND_PATH = base_path / "simulation" / "background.jpg" + OUTPUT_PATH = demo_path / "real_video.mp4" + + # Check if required files exist + if not SIM_VIDEO_PATH.exists(): + print(f"[WARN] Simulated video not found: {SIM_VIDEO_PATH}") + return False + if not MASK_VIDEO_PATH.exists(): + print(f"[WARN] Mask video not found: {MASK_VIDEO_PATH}") + return False + if not REAL_BACKGROUND_PATH.exists(): + print(f"[WARN] Real background not found: {REAL_BACKGROUND_PATH}") + return False + + # Load real background image + print(f"[INFO] Loading real background: {REAL_BACKGROUND_PATH}") + real_img = np.array(Image.open(REAL_BACKGROUND_PATH)) + + H, W, _ = real_img.shape + print(f"[INFO] Background size: {W}x{H}") + + # Open videos + print(f"[INFO] Loading simulated video: {SIM_VIDEO_PATH}") + rgb_reader = imageio.get_reader(SIM_VIDEO_PATH) + print(f"[INFO] Loading mask video: {MASK_VIDEO_PATH}") + mask_reader = imageio.get_reader(MASK_VIDEO_PATH) + + # Get metadata from original video + N = rgb_reader.count_frames() + fps = rgb_reader.get_meta_data()['fps'] + print(f"[INFO] Processing {N} frames at {fps} FPS...") + + composed_images = [] + + for i in range(N): + # Read frames + sim_rgb = rgb_reader.get_data(i) + sim_mask = mask_reader.get_data(i) + + # Convert mask to binary (grayscale > 127 = foreground) + if sim_mask.ndim == 3: + sim_mask = cv2.cvtColor(sim_mask, cv2.COLOR_RGB2GRAY) + sim_mask = sim_mask > 127 + + # Resize sim frames to match real background if needed + if sim_rgb.shape[:2] != (H, W): + sim_rgb = cv2.resize(sim_rgb, (W, H)) + sim_mask = cv2.resize(sim_mask.astype(np.uint8), (W, H)) > 0 + + # Compose: real background + simulated foreground (where mask is True) + composed = real_img.copy() + composed[sim_mask] = sim_rgb[sim_mask] + # Pad only after compositing so the boolean mask and frame shapes stay aligned + composed = pad_to_even(composed) + + composed_images.append(composed) + + if (i + 1) % 10 == 0: +
print(f"[INFO] Processed {i + 1}/{N} frames") + + # Save composed video + print(f"[INFO] Saving composed video to: {OUTPUT_PATH}") + writer = imageio.get_writer(OUTPUT_PATH, fps=fps, macro_block_size=None) + for frame in composed_images: + writer.append_data(frame) + writer.close() + + print(f"[INFO] Done! Saved {len(composed_images)} frames to {OUTPUT_PATH} at {fps} FPS") + return True + # ---------- Visualization ---------- + def show_goal(self, pos, quat): + """ + show a pose with visual marker(s). + - if [B,3]/[B,4], update all envs; + - if [3]/[4] or [1,3]/[1,4], default to update env 0; + - optional env_ids specify a subset of envs to update; when a single pose is input, it will be broadcast to these envs. + """ + if self.debug_level == 0: + print("debug_level=0, skip visualization.") + return + + if not isinstance(pos, torch.Tensor): + pos_t = torch.tensor(pos, dtype=torch.float32, device=self.sim.device) + quat_t = torch.tensor(quat, dtype=torch.float32, device=self.sim.device) + else: + pos_t = pos.to(self.sim.device) + quat_t = quat.to(self.sim.device) + + if pos_t.ndim == 1: + pos_t = pos_t.view(1, -1) + if quat_t.ndim == 1: + quat_t = quat_t.view(1, -1) + + B = self.num_envs + + if pos_t.shape[0] == B: + for b in range(B): + self.goal_vis_list[b].visualize(pos_t[b : b + 1], quat_t[b : b + 1]) + else: + self.goal_vis_list[0].visualize(pos_t, quat_t) + + def set_robot_pose(self, robot_pose: torch.Tensor): + if robot_pose.ndim == 1: + self.robot_pose = ( + robot_pose.view(1, -1).repeat(self.num_envs, 1).to(self.robot.device) + ) + else: + assert robot_pose.shape[0] == self.num_envs and robot_pose.shape[1] == 7, ( + f"robot_pose must be [B,7], got {robot_pose.shape}" + ) + self.robot_pose = robot_pose.to(self.robot.device).contiguous() + + + # ---------- Environment Step ---------- + def step(self): + self.scene.write_data_to_sim() + self.sim.step() + self.camera.update(dt=self.sim_dt) + self.count += 1 + self.scene.update(self.sim_dt) + + # ---------- Apply actions to robot joints ---------- + def apply_actions(self, joint_pos_des, gripper_open: bool = True): + # joint_pos_des: [B, n_joints] + self.robot.set_joint_position_target( + joint_pos_des, joint_ids=self.robot_entity_cfg.joint_ids + ) + if gripper_open: + self.robot.set_joint_position_target( + self.gripper_open_tensor, joint_ids=self.robot_gripper_cfg.joint_ids + ) + else: + self.robot.set_joint_position_target( + self.gripper_close_tensor, joint_ids=self.robot_gripper_cfg.joint_ids + ) + self.step() + + # ---------- EE control ---------- + def move_to( + self, + position: torch.Tensor, + quaternion: torch.Tensor, + gripper_open: bool = True, + record: bool = True, + ) -> torch.Tensor: + if self.enable_motion_planning: + return self.move_to_motion_planning( + position, quaternion, gripper_open=gripper_open, record=record + ) + else: + return self.move_to_ik( + position, quaternion, gripper_open=gripper_open, record=record + ) + + def move_to_ik( + self, + position: torch.Tensor, + quaternion: torch.Tensor, + steps: int = 50, + gripper_open: bool = True, + record: bool = True, + ) -> torch.Tensor: + """ + Cartesian space control: Move the end effector to the desired position and orientation using inverse kinematics. + Works with batched envs. If inputs are 1D, they will be broadcast to all envs. + + Early-stop when both position and orientation errors are within tolerances. + 'steps' now acts as a max-iteration cap; the loop breaks earlier on convergence. 
+ """ + # Ensure [B,3]/[B,4] tensors on device + position, quaternion = self._ensure_batch_pose(position, quaternion) + + # IK command (world frame goals) + ee_goals = torch.cat([position, quaternion], dim=1).to(self.sim.device).float() + self.diff_ik_controller.reset() + self.diff_ik_controller.set_command(ee_goals) + + # Tolerances (you can tune if needed) + pos_tol = 3e-3 # meters + ori_tol = 3.0 * np.pi / 180.0 # radians (~3 degrees) + + # Interpret 'steps' as max iterations; early-stop on convergence + max_steps = int(steps) if steps is not None and steps > 0 else 10_000 + + joint_pos_des = None + for _ in range(max_steps): + # Current EE pose (world) and Jacobian + jacobian = self.robot.root_physx_view.get_jacobians()[ + :, self.ee_jacobi_idx, :, self.robot_entity_cfg.joint_ids + ] + ee_pose_w = self.robot.data.body_state_w[ + :, self.robot_entity_cfg.body_ids[0], 0:7 + ] + root_pose_w = self.robot.data.root_state_w[:, 0:7] + joint_pos = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + + # Current EE pose expressed in robot base + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_pose_w[:, 0:3], + root_pose_w[:, 3:7], + ee_pose_w[:, 0:3], + ee_pose_w[:, 3:7], + ) + + # Compute next joint command + joint_pos_des = self.diff_ik_controller.compute( + ee_pos_b, ee_quat_b, jacobian, joint_pos + ) + + # Apply + self.apply_actions(joint_pos_des, gripper_open=gripper_open) + + # Optional recording + if record: + obs = self.get_observation(gripper_open=gripper_open) + self.record_data(obs) + + # --- Early-stop check --- + # Desired EE pose in base frame (convert world goal -> base) + des_pos_b, des_quat_b = subtract_frame_transforms( + root_pose_w[:, 0:3], root_pose_w[:, 3:7], position, quaternion + ) + # Position error [B] + pos_err = torch.norm(des_pos_b - ee_pos_b, dim=1) + # Orientation error [B]: angle between quaternions + # Note: q and -q are equivalent -> take |dot| + dot = torch.sum(des_quat_b * ee_quat_b, dim=1).abs().clamp(-1.0, 1.0) + ori_err = 2.0 * torch.acos(dot) + + done = (pos_err <= pos_tol) & (ori_err <= ori_tol) + if bool(torch.all(done)): + break + + return joint_pos_des + + # ---------- Robot Waiting ---------- + def wait(self, gripper_open, steps: int, record: bool = True): + joint_pos_des = self.robot.data.joint_pos[ + :, self.robot_entity_cfg.joint_ids + ].clone() + for _ in range(steps): + self.apply_actions(joint_pos_des, gripper_open=gripper_open) + obs = self.get_observation(gripper_open=gripper_open) + if record: + self.record_data(obs) + return joint_pos_des + + # ---------- Reset Envs ---------- + def reset(self, env_ids=None): + """ + Reset all envs or only those in env_ids. 
+ Assumptions: + - self.robot_pose.shape == (B, 7) # base pose per env (wxyz in [:,3:]) + - self.robot.data.default_joint_pos == (B, 7) + - self.robot.data.default_joint_vel == (B, 7) + """ + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) # (B,) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view( + -1 + ) # (M,) + M = int(env_ids_t.shape[0]) + + # --- object pose/vel: set object at env origins with identity quat --- + env_origins = self.scene.env_origins.to(device)[env_ids_t] # (M,3) + object_pose = torch.zeros((M, 7), device=device, dtype=torch.float32) + object_pose[:, :3] = env_origins + object_pose[:, 3] = 1.0 # wxyz = [1,0,0,0] + self.object_prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t) + self.object_prim.write_root_velocity_to_sim( + torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t + ) + self.object_prim.write_data_to_sim() + for prim in self.other_object_prims: + prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t) + prim.write_root_velocity_to_sim( + torch.zeros((M, 6), device=device, dtype=torch.float32), + env_ids=env_ids_t, + ) + prim.write_data_to_sim() + + # --- robot base pose/vel --- + # robot_pose is (B,7) in *local* base frame; add env origin offset per env + rp_local = self.robot_pose.to(self.robot.device)[env_ids_t] # (M,7) + env_origins_robot = env_origins.to(self.robot.device) # (M,3) + robot_pose_world = rp_local.clone() + robot_pose_world[:, :3] = env_origins_robot + robot_pose_world[:, :3] + self.robot.write_root_pose_to_sim(robot_pose_world, env_ids=env_ids_t) + self.robot.write_root_velocity_to_sim( + torch.zeros((M, 6), device=self.robot.device, dtype=torch.float32), + env_ids=env_ids_t, + ) + + # --- joints (B,7) -> select ids (M,7) --- + joint_pos = self.robot.data.default_joint_pos.to(self.robot.device)[ + env_ids_t + ] # (M,7) + joint_vel = self.robot.data.default_joint_vel.to(self.robot.device)[ + env_ids_t + ] # (M,7) + self.robot.write_joint_state_to_sim(joint_pos, joint_vel, env_ids=env_ids_t) + self.robot.write_data_to_sim() + + # housekeeping + self.clear_data() + + # ---------- Get Observations ---------- + def get_observation(self, gripper_open) -> Dict[str, torch.Tensor]: + # camera outputs (already batched) + rgb = self.camera.data.output["rgb"] # [B,H,W,3] + depth = self.camera.data.output["distance_to_image_plane"] # [B,H,W] + ins_all = self.camera.data.output["instance_id_segmentation_fast"] # [B,H,W] + + B, H, W, _ = ins_all.shape + fg_mask_list = [] + obj_mask_list = [] + for b in range(B): + ins_id_seg = ins_all[b] + id_mapping = self.camera.data.info[b]["instance_id_segmentation_fast"][ + "idToLabels" + ] + fg_mask_b = torch.zeros_like( + ins_id_seg, dtype=torch.bool, device=ins_id_seg.device + ) + obj_mask_b = torch.zeros_like( + ins_id_seg, dtype=torch.bool, device=ins_id_seg.device + ) + for key, value in id_mapping.items(): + if "object" in value: + fg_mask_b |= ins_id_seg == key + obj_mask_b |= ins_id_seg == key + if "Robot" in value: + fg_mask_b |= ins_id_seg == key + fg_mask_list.append(fg_mask_b) + obj_mask_list.append(obj_mask_b) + fg_mask = torch.stack(fg_mask_list, dim=0) # [B,H,W] + obj_mask = torch.stack(obj_mask_list, dim=0) # [B,H,W] + + ee_pose_w = self.robot.data.body_state_w[ + :, self.robot_entity_cfg.body_ids[0], 0:7 + ] + joint_pos = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + joint_vel = self.robot.data.joint_vel[:, self.robot_entity_cfg.joint_ids] + gripper_pos = 
self.robot.data.joint_pos[:, self.robot_gripper_cfg.joint_ids] + gripper_cmd = ( + self.gripper_open_tensor if gripper_open else self.gripper_close_tensor + ) + + cam_pos_w = self.camera.data.pos_w + cam_quat_w = self.camera.data.quat_w_ros + ee_pos_cam, ee_quat_cam = subtract_frame_transforms( + cam_pos_w, cam_quat_w, ee_pose_w[:, 0:3], ee_pose_w[:, 3:7] + ) + ee_pose_cam = torch.cat([ee_pos_cam, ee_quat_cam], dim=1) + + points_3d_cam = unproject_depth( + self.camera.data.output["distance_to_image_plane"], + self.camera.data.intrinsic_matrices, + ) + points_3d_world = transform_points( + points_3d_cam, self.camera.data.pos_w, self.camera.data.quat_w_ros + ) + + object_center = self.object_prim.data.root_com_pos_w[:, :3] + + return { + "rgb": rgb, + "depth": depth, + "fg_mask": fg_mask, + "joint_pos": joint_pos, + "gripper_pos": gripper_pos, + "gripper_cmd": gripper_cmd, + "joint_vel": joint_vel, + "ee_pose_cam": ee_pose_cam, + "ee_pose_w": ee_pose_w, + "object_mask": obj_mask, + "points_cam": points_3d_cam, + "points_world": points_3d_world, + "object_center": object_center, + } + + # ---------- Task Completion Verifier ---------- + def is_success(self) -> bool: + raise NotImplementedError( + "BaseSimulator.is_success() should be implemented in subclass." + ) + + # ---------- Data Recording & Saving & Clearing ---------- + def record_data(self, obs: Dict[str, torch.Tensor]): + self.save_dict["rgb"].append(obs["rgb"].cpu().numpy()) # [B,H,W,3] + self.save_dict["depth"].append(obs["depth"].cpu().numpy()) # [B,H,W] + self.save_dict["segmask"].append(obs["fg_mask"].cpu().numpy()) # [B,H,W] + self.save_dict["joint_pos"].append(obs["joint_pos"].cpu().numpy()) # [B,nJ] + self.save_dict["gripper_pos"].append(obs["gripper_pos"].cpu().numpy()) # [B,3] + self.save_dict["gripper_cmd"].append(obs["gripper_cmd"].cpu().numpy()) # [B,1] + self.save_dict["joint_vel"].append(obs["joint_vel"].cpu().numpy()) + + def clear_data(self): + for key in self.save_dict.keys(): + self.save_dict[key] = [] + + def _demo_dir(self) -> Path: + return self.out_dir / self.img_folder / "demos" / f"demo_{self.demo_id}" + + def _env_dir(self, base: Path, b: int) -> Path: + d = base / f"env_{b:03d}" + d.mkdir(parents=True, exist_ok=True) + return d + + def save_data(self, ignore_keys: List[str] = []): + save_root = self._demo_dir() + save_root.mkdir(parents=True, exist_ok=True) + + stacked = {k: np.array(v) for k, v in self.save_dict.items()} + + for b in range(self.num_envs): + env_dir = self._env_dir(save_root, b) + for key, arr in stacked.items(): + if key in ignore_keys: # skip the keys for storage + continue + if key == "rgb": + video_path = env_dir / "sim_video.mp4" + writer = imageio.get_writer( + video_path, fps=50, macro_block_size=None + ) + for t in range(arr.shape[0]): + writer.append_data(arr[t, b]) + writer.close() + elif key == "segmask": + video_path = env_dir / "mask_video.mp4" + writer = imageio.get_writer( + video_path, fps=50, macro_block_size=None + ) + for t in range(arr.shape[0]): + writer.append_data((arr[t, b].astype(np.uint8) * 255)) + writer.close() + elif key == "depth": + depth_seq = arr[:, b] + flat = depth_seq[depth_seq > 0] + max_depth = np.percentile(flat, 99) if flat.size > 0 else 1.0 + depth_norm = np.clip(depth_seq / max_depth * 255.0, 0, 255).astype( + np.uint8 + ) + video_path = env_dir / "depth_video.mp4" + writer = imageio.get_writer( + video_path, fps=50, macro_block_size=None + ) + for t in range(depth_norm.shape[0]): + writer.append_data(depth_norm[t]) + writer.close() + 
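# Also keep raw metric depth; the mp4 above is only an 8-bit preview scaled by the 99th-percentile depth. +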
np.save(env_dir / f"{key}.npy", depth_seq) + else: + np.save(env_dir / f"{key}.npy", arr[:, b]) + json.dump(self.sim_cfgs, open(env_dir / "config.json", "w"), indent=2) + + print("[INFO]: Demonstration is saved at: ", save_root) + + # Compose real videos for all environments + print("\n[INFO] Composing real videos with background...") + for b in range(self.num_envs): + print(f"[INFO] Processing environment {b}/{self.num_envs}...") + success = self.compose_real_video(env_id=b) + if success: + print(f"[INFO] Real video composed successfully for env {b}") + else: + print(f"[WARN] Failed to compose real video for env {b}") + + demo_root = self.out_dir / "all_demos" + demo_root.mkdir(parents=True, exist_ok=True) + total_demo_id = get_next_demo_id(demo_root) + demo_save_path = demo_root / f"demo_{total_demo_id}" + demo_save_path.mkdir(parents=True, exist_ok=True) + meta_info = { + "path": str(save_root), + "fps": 50, + } + with open(demo_save_path / "meta_info.json", "w") as f: + json.dump(meta_info, f) + os.system(f"cp -r {save_root}/* {demo_save_path}") + print("[INFO]: Demonstration is saved at: ", demo_save_path) + + def delete_data(self): + save_path = self._demo_dir() + failure_root = self.out_dir / self.img_folder / "demos_failures" + failure_root.mkdir(parents=True, exist_ok=True) + fail_demo_id = get_next_demo_id(failure_root) + failure_path = failure_root / f"demo_{fail_demo_id}" + os.system(f"mv {save_path} {failure_path}") + for key in self.save_dict.keys(): + self.save_dict[key] = [] + print("[INFO]: Clear up the folder: ", save_path) diff --git a/openreal2sim/simulation/isaaclab-old/sim_env_factory.py b/openreal2sim/simulation/isaaclab-old/sim_env_factory.py new file mode 100755 index 0000000..073668b --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/sim_env_factory.py @@ -0,0 +1,573 @@ +# -*- coding: utf-8 -*- +""" +Isaac Lab-based simulation environment factory. 
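+ + Flow: init_scene_from_scene_dict() fills the module-global SceneCtx, build_tabletop_scene_cfg() + generates an InteractiveSceneCfg from it, and make_env() stitches everything into a ManagerBasedRLEnv.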
+""" + +from __future__ import annotations +import copy +import json +import random +from pathlib import Path +from typing import Tuple, Optional, List, Dict + +import numpy as np +import transforms3d + +# Isaac Lab core +from isaaclab.utils import configclass +from isaaclab.scene import InteractiveSceneCfg +from isaaclab.assets import AssetBaseCfg, RigidObjectCfg +from isaaclab.sim.schemas import schemas_cfg +import isaaclab.sim as sim_utils +from isaaclab.sensors.camera import CameraCfg +from isaaclab_assets import FRANKA_PANDA_HIGH_PD_CFG + +# Manager-based API (terms/configs) +from isaaclab.managers import ( + TerminationTermCfg as DoneTerm, + EventTermCfg as EventTerm, + ObservationGroupCfg as ObsGroup, + ObservationTermCfg as ObsTerm, + RewardTermCfg as RewTerm, + CurriculumTermCfg as CurrTerm, + SceneEntityCfg, +) + +# Task-specific MDP helpers (adjust path if needed) +import isaaclab_tasks.manager_based.manipulation.lift.mdp as mdp + +from dataclasses import dataclass, MISSING + +from dataclasses import dataclass +from typing import Optional, List, Dict + + +@dataclass +class SceneCtx: + cam_dict: Dict + obj_paths: List[str] + background_path: str + robot_pos: List[float] + robot_rot: List[float] + bg_physics: Optional[Dict] = None + obj_physics: List[Dict] = None + use_ground_plane: bool = False + ground_z: Optional[float] = None + + +_SCENE_CTX: Optional[SceneCtx] = None + +# ---- default physx presets ---- +DEFAULT_BG_PHYSICS = { + "mass_props": {"mass": 100.0}, + "rigid_props": {"disable_gravity": True, "kinematic_enabled": True}, + "collision_props": { + "collision_enabled": True, + "contact_offset": 0.0015, + "rest_offset": 0.0003, + "torsional_patch_radius": 0.02, + "min_torsional_patch_radius": 0.005, + }, +} +DEFAULT_OBJ_PHYSICS = { + "mass_props": {"mass": 0.5}, + "rigid_props": {"disable_gravity": False, "kinematic_enabled": False}, + "collision_props": { + "collision_enabled": True, + "contact_offset": 0.0015, + "rest_offset": 0.0003, + "torsional_patch_radius": 0.02, + "min_torsional_patch_radius": 0.005, + }, +} + + +def _deep_update(dst: dict, src: dict | None) -> dict: + """Recursive dict update without touching the original.""" + out = copy.deepcopy(dst) + if not src: + return out + for k, v in src.items(): + if isinstance(v, dict) and isinstance(out.get(k), dict): + out[k] = _deep_update(out[k], v) + else: + out[k] = v + return out + + +# -------------------------------------------------------------------------------------- +# Camera / Robot builders (use _SCENE_CTX) +# -------------------------------------------------------------------------------------- +def create_camera(): + """Return a CameraCfg using the global SceneCtx.""" + assert _SCENE_CTX is not None, ( + "init_scene_configs/init_scene_from_scene_dict must be called first." 
) + C = _SCENE_CTX + width = int(C["width"]) + height = int(C["height"]) + fx, fy, cx, cy = C["fx"], C["fy"], C["cx"], C["cy"] + cam_orientation = tuple(C["cam_orientation"]) + cam_pos = tuple(C["scene_info"]["move_to"]) + return CameraCfg( + prim_path="{ENV_REGEX_NS}/Camera", + offset=CameraCfg.OffsetCfg(pos=cam_pos, rot=cam_orientation, convention="ros"), + data_types=[ + "rgb", + "distance_to_image_plane", + "distance_to_camera", + "instance_id_segmentation_fast", + ], + colorize_instance_id_segmentation=False, + spawn=sim_utils.PinholeCameraCfg.from_intrinsic_matrix( + intrinsic_matrix=[fx, 0, cx, 0, fy, cy, 0, 0, 1], + width=width, + height=height, + ), + width=width, + height=height, + ) + + +def create_robot(): + """Return a configured Franka Panda config using the global SceneCtx.""" + assert _SCENE_CTX is not None, ( + "init_scene_from_scene_dict must be called first." + ) + robot = FRANKA_PANDA_HIGH_PD_CFG.replace(prim_path="{ENV_REGEX_NS}/Robot") + robot.init_state.pos = tuple(_SCENE_CTX.robot_pos) + robot.init_state.rot = tuple(_SCENE_CTX.robot_rot) + return robot + + +# -------------------------------------------------------------------------------------- +# Dynamic InteractiveSceneCfg builder +# -------------------------------------------------------------------------------------- +def build_tabletop_scene_cfg(): + """ + Auto-generate a multi-object InteractiveSceneCfg subclass: + - background, camera, robot + - object_00, object_01, ... based on _SCENE_CTX.obj_paths + """ + assert _SCENE_CTX is not None, ( + "init_scene_from_scene_dict must be called first." + ) + C = _SCENE_CTX + + base_attrs = {} + + # Light + base_attrs["light"] = AssetBaseCfg( + prim_path="/World/lightDome", + spawn=sim_utils.DomeLightCfg(intensity=4000.0, color=(1.0, 1.0, 1.0)), + ) + + _bg = _deep_update(DEFAULT_BG_PHYSICS, C.bg_physics) + _objs = [ + _deep_update(DEFAULT_OBJ_PHYSICS, obj_physics) for obj_physics in C.obj_physics + ] + + bg_mass_cfg = schemas_cfg.MassPropertiesCfg(**_bg["mass_props"]) + bg_rigid_cfg = schemas_cfg.RigidBodyPropertiesCfg(**_bg["rigid_props"]) + bg_colli_cfg = schemas_cfg.CollisionPropertiesCfg(**_bg["collision_props"]) + + # add another ground plane (mainly for better visualization); distinct prim path so it + # does not clash with the ground-plane background used when use_ground_plane=True + base_attrs["background_plane"] = AssetBaseCfg( + prim_path="/World/visGroundPlane", + spawn=sim_utils.GroundPlaneCfg(), + ) + z = float(C.ground_z if C.ground_z is not None else 0.0) + base_attrs["background_plane"].init_state.pos = (0.0, 0.0, z - 0.2) + + # ---------- Background ---------- + if C.use_ground_plane: + # Simple horizontal ground; only z is customized. + # Note: GroundPlaneCfg doesn't take mass/rigid/collision configs (it's a helper), + # so we only set pose here.
+ base_attrs["background"] = AssetBaseCfg( + prim_path="/World/defaultGroundPlane", + spawn=sim_utils.GroundPlaneCfg(), + ) + z = float(C.ground_z if C.ground_z is not None else 0.0) + base_attrs["background"].init_state.pos = (0.0, 0.0, z) + # no usd_path assignment in __post_init__ when using ground + else: + # original USD background + base_attrs["background"] = RigidObjectCfg( + prim_path="{ENV_REGEX_NS}/Background", + spawn=sim_utils.UsdFileCfg( + usd_path="", + mass_props=bg_mass_cfg, + rigid_props=bg_rigid_cfg, + collision_props=bg_colli_cfg, + ), + ) + + # Placeholder entries to be replaced in __post_init__ + base_attrs["camera"] = CameraCfg(prim_path="{ENV_REGEX_NS}/Camera") + base_attrs["robot"] = FRANKA_PANDA_HIGH_PD_CFG.replace( + prim_path="{ENV_REGEX_NS}/Robot" + ) + + # Instantiate objects + for i, usd_path in enumerate(C.obj_paths): + obj_mass_cfg = schemas_cfg.MassPropertiesCfg(**_objs[i]["mass_props"]) + obj_rigid_cfg = schemas_cfg.RigidBodyPropertiesCfg(**_objs[i]["rigid_props"]) + obj_colli_cfg = schemas_cfg.CollisionPropertiesCfg( + **_objs[i]["collision_props"] + ) + + obj_template = RigidObjectCfg( + prim_path="{ENV_REGEX_NS}/Object", + spawn=sim_utils.UsdFileCfg( + usd_path="", + mass_props=obj_mass_cfg, + rigid_props=obj_rigid_cfg, + collision_props=obj_colli_cfg, + ), + ) + + name = f"object_{i:02d}" + cfg_i = copy.deepcopy(obj_template) + cfg_i.prim_path = f"{{ENV_REGEX_NS}}/{name}" + cfg_i.spawn.usd_path = usd_path + base_attrs[name] = cfg_i + + # Inject background path + replace camera/robot on finalize + def __post_init__(self): + if not C.use_ground_plane: + self.background.spawn.usd_path = C.background_path + self.camera = create_camera() + self.robot = create_robot() + + attrs = dict(base_attrs) + attrs["__doc__"] = "Auto-generated multi-object TableTop scene cfg." + attrs["__post_init__"] = __post_init__ + + DynamicSceneCfg = configclass( + type("TableTopSceneCfgAuto", (InteractiveSceneCfg,), attrs) + ) + return DynamicSceneCfg + + +# -------------------------------------------------------------------------------------- +# Build cam_dict & init directly from a raw scene dict +# -------------------------------------------------------------------------------------- +def init_scene_from_scene_dict( + scene: dict, + cfgs: dict, + *, + use_ground_plane: bool = False, +): + """ + Initialize SceneCtx directly from a raw scene dict. + If robot pose not provided, sample one with robot_placement_candidates_v2(). + """ + cam_dict = cfgs["cam_cfg"] + obj_paths = [o["usd"] for o in scene["objects"].values()] + background_path = scene["background"]["usd"] + + # overwrite physics + # Priority: args > scene > default + obj_physics = cfgs["physics_cfg"]["obj_physics"] + bg_physics = cfgs["physics_cfg"]["bg_physics"] + if obj_physics is None: + obj_physics = [o.get("physics", None) for o in scene["objects"].values()] + elif isinstance(obj_physics, dict): + obj_physics = [obj_physics for _ in scene["objects"].values()] + elif isinstance(obj_physics, list): + assert len(obj_physics) == len(scene["objects"]), ( + "obj_physics must be a list of the same length as scene['objects'] if provided." 
+ ) + else: + raise TypeError("obj_physics must be None, a dict, or a list of dicts.") + bg_physics = ( + scene["background"].get("physics", None) if bg_physics is None else bg_physics + ) + + robot_pos = cfgs["robot_cfg"]["robot_pose"][:3] + robot_rot = cfgs["robot_cfg"]["robot_pose"][3:] + + ground_z = None + if use_ground_plane: + try: + ground_z = float(scene["plane"]["simulation"]["point"][2]) + except Exception as e: + raise ValueError( + f"use_ground_plane=True but scene['plane']['simulation'] missing/invalid: {e}" + ) + + # write global ctx (keep old fields the same) + global _SCENE_CTX + _SCENE_CTX = SceneCtx( + cam_dict=cam_dict, + obj_paths=obj_paths, + background_path=background_path, + robot_pos=list(robot_pos), + robot_rot=list(robot_rot), + bg_physics=bg_physics, + obj_physics=list(obj_physics), + use_ground_plane=use_ground_plane, + ground_z=ground_z, + ) + + return { + "cam_dict": cam_dict, + "obj_usd_paths": obj_paths, + "background_usd": background_path, + "robot_pos": robot_pos, + "robot_rot": robot_rot, + "use_ground_plane": use_ground_plane, + "ground_z": ground_z, + } + + +# -------------------------------------------------------------------------------------- +# Env factory +# -------------------------------------------------------------------------------------- +def _build_manip_env_cfg(scene_cfg_cls, *, num_envs: int, env_spacing: float = 2.5): + """Return a ManagerBasedRLEnvCfg subclass stitched together from sub-Cfgs.""" + from isaaclab.envs import ManagerBasedRLEnvCfg + from isaaclab.controllers.differential_ik_cfg import DifferentialIKControllerCfg + from isaaclab.envs.mdp.actions.actions_cfg import ( + DifferentialInverseKinematicsActionCfg, + ) + + @configclass + class ManipEnvCfg(ManagerBasedRLEnvCfg): + scene = scene_cfg_cls(num_envs=num_envs, env_spacing=env_spacing) + commands = CommandsCfg() + actions = ActionsCfg() + observations = ObservationsCfg() + events = EventCfg() + rewards = RewardsCfg() + terminations = TerminationsCfg() + curriculum = CurriculumCfg() + + def __post_init__(self): + # ---- Sim & PhysX ---- + self.decimation = 2 # 4 + self.episode_length_s = 5.0 + self.sim.dt = 0.01 + self.sim.render_interval = self.decimation + + physx = self.sim.physx + physx.enable_ccd = True + physx.solver_type = 1 # TGS + physx.num_position_iterations = 16 + physx.num_velocity_iterations = 2 + physx.contact_offset = 0.003 + physx.rest_offset = 0.0 + physx.max_depenetration_velocity = 0.5 + physx.enable_stabilization = True + physx.enable_sleeping = True + + # ---- IK arm & binary gripper ---- + self.actions.arm_action = DifferentialInverseKinematicsActionCfg( + asset_name="robot", + joint_names=["panda_joint.*"], + body_name="panda_hand", + controller=DifferentialIKControllerCfg( + command_type="pose", use_relative_mode=True, ik_method="dls" + ), + scale=0.5, + body_offset=DifferentialInverseKinematicsActionCfg.OffsetCfg( + pos=[0.0, 0.0, 0.107] + ), + ) + self.actions.gripper_action = mdp.BinaryJointPositionActionCfg( + asset_name="robot", + joint_names=["panda_finger.*"], + open_command_expr={"panda_finger_.*": 0.04}, + close_command_expr={"panda_finger_.*": 0.0}, + ) + self.commands.object_pose.body_name = "panda_hand" + + return ManipEnvCfg + + +def load_scene_json(key: str) -> dict: + """Return the raw scene dict from outputs/<key>/simulation/scene.json.""" + scene_path = Path.cwd() / "outputs" / key / "simulation" / "scene.json" + if not scene_path.exists(): + raise FileNotFoundError(scene_path) + return json.load(open(scene_path)) + + +def
make_env( + cfgs: dict, + num_envs: int = 1, + device: str = "cuda:0", + bg_simplify: bool = False, +) -> Tuple["ManagerBasedRLEnv", "ManagerBasedRLEnvCfg"]: + """ + Public entry to construct a ManagerBasedRLEnv from outputs//simulation/scene.json. + Returns: (env, env_cfg) + """ + from isaaclab.envs import ManagerBasedRLEnv + + # Load scene json and initialize global SceneCtx + scene = cfgs["scene_cfg"] + init_scene_from_scene_dict( + scene, + cfgs=cfgs, + use_ground_plane=bg_simplify, + ) + + # Build scene & env cfg + SceneCfg = build_tabletop_scene_cfg() + ManipEnvCfg = _build_manip_env_cfg(SceneCfg, num_envs=num_envs, env_spacing=2.5) + env_cfg = ManipEnvCfg() + env_cfg.sim.device = device + env_cfg.scene.num_envs = num_envs # double safety + + env = ManagerBasedRLEnv(cfg=env_cfg) + return env, env_cfg + + +# -------------------------------------------------------------------------------------- +# Observation/Action/Reward/Termination/Curriculum config classes +# -------------------------------------------------------------------------------------- +@configclass +class CommandsCfg: + """Command terms for the MDP.""" + + object_pose = mdp.UniformPoseCommandCfg( + asset_name="robot", + body_name=MISSING, + resampling_time_range=(5.0, 5.0), + debug_vis=False, + ranges=mdp.UniformPoseCommandCfg.Ranges( + pos_x=(0.4, 0.6), + pos_y=(-0.25, 0.25), + pos_z=(0.25, 0.5), + roll=(0.0, 0.0), + pitch=(0.0, 0.0), + yaw=(0.0, 0.0), + ), + ) + + +@configclass +class ActionsCfg: + """Action specifications for the MDP.""" + + arm_action: ( + mdp.JointPositionActionCfg | mdp.DifferentialInverseKinematicsActionCfg + ) = MISSING # type: ignore + gripper_action: mdp.BinaryJointPositionActionCfg = MISSING # type: ignore + + +@configclass +class ObservationsCfg: + """Observation specifications for the MDP.""" + + @configclass + class PolicyCfg(ObsGroup): + joint_pos_rel = ObsTerm(func=mdp.joint_pos_rel) + joint_pos = ObsTerm(func=mdp.joint_pos) + joint_vel_rel = ObsTerm(func=mdp.joint_vel_rel) + joint_vel = ObsTerm(func=mdp.joint_vel) + target_object_position = ObsTerm( + func=mdp.generated_commands, params={"command_name": "object_pose"} + ) + actions = ObsTerm(func=mdp.last_action) + rgb = ObsTerm( + func=mdp.image, + params={ + "sensor_cfg": SceneEntityCfg(name="camera"), + "data_type": "rgb", + "convert_perspective_to_orthogonal": False, + "normalize": False, + }, + ) + depth = ObsTerm( + func=mdp.image, + params={ + "sensor_cfg": SceneEntityCfg(name="camera"), + "data_type": "distance_to_image_plane", + "convert_perspective_to_orthogonal": False, + "normalize": False, + }, + ) + segmask = ObsTerm( + func=mdp.image, + params={ + "sensor_cfg": SceneEntityCfg(name="camera"), + "data_type": "instance_id_segmentation_fast", + "convert_perspective_to_orthogonal": False, + "normalize": False, + }, + ) + + def __post_init__(self): + self.enable_corruption = True + self.concatenate_terms = False + + policy: PolicyCfg = PolicyCfg() + + +@configclass +class EventCfg: + """Configuration for events.""" + + reset_all = EventTerm(func=mdp.reset_scene_to_default, mode="reset") + + +@configclass +class RewardsCfg: + """Reward terms for the MDP (kept simple).""" + + action_rate = RewTerm(func=mdp.action_rate_l2, weight=-1e-4) + joint_vel = RewTerm( + func=mdp.joint_vel_l2, + weight=-1e-4, + params={"asset_cfg": SceneEntityCfg("robot")}, + ) + + +@configclass +class TerminationsCfg: + """Termination terms for the MDP.""" + + time_out = DoneTerm(func=mdp.time_out, time_out=True) + + +@configclass +class CurriculumCfg: + 
"""Curriculum terms for the MDP.""" + + action_rate = CurrTerm( + func=mdp.modify_reward_weight, + params={"term_name": "action_rate", "weight": -1e-1, "num_steps": 10000}, + ) + joint_vel = CurrTerm( + func=mdp.modify_reward_weight, + params={"term_name": "joint_vel", "weight": -1e-1, "num_steps": 10000}, + ) + + +# Public symbols +__all__ = [ + # Scene init + "init_scene_configs", + "init_scene_from_scene_dict", + # Scene builders + "create_camera", + "create_robot", + "build_tabletop_scene_cfg", + # Placement samplers + "robot_placement_candidates", + "robot_placement_candidates_v2", + # Manager config groups + "CommandsCfg", + "ActionsCfg", + "ObservationsCfg", + "EventCfg", + "RewardsCfg", + "TerminationsCfg", + "CurriculumCfg", + # Env factory + "make_env", +] diff --git a/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py b/openreal2sim/simulation/isaaclab-old/sim_heuristic_manip.py similarity index 100% rename from openreal2sim/simulation/isaaclab/sim_heuristic_manip.py rename to openreal2sim/simulation/isaaclab-old/sim_heuristic_manip.py diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/grasp_generation.py b/openreal2sim/simulation/isaaclab-old/sim_preprocess/grasp_generation.py similarity index 100% rename from openreal2sim/simulation/isaaclab/sim_preprocess/grasp_generation.py rename to openreal2sim/simulation/isaaclab-old/sim_preprocess/grasp_generation.py diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/grasp_utils.py b/openreal2sim/simulation/isaaclab-old/sim_preprocess/grasp_utils.py similarity index 100% rename from openreal2sim/simulation/isaaclab/sim_preprocess/grasp_utils.py rename to openreal2sim/simulation/isaaclab-old/sim_preprocess/grasp_utils.py diff --git a/openreal2sim/simulation/isaaclab-old/sim_preprocess/trajectory_smoothing.py b/openreal2sim/simulation/isaaclab-old/sim_preprocess/trajectory_smoothing.py new file mode 100755 index 0000000..7e309ed --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/sim_preprocess/trajectory_smoothing.py @@ -0,0 +1,190 @@ +""" +Trajectory smoothing with axis-flip/axis-permutation disambiguation in the object-local frame. +Note this is just an experimental feature and may not work well. +""" + +import numpy as np +import transforms3d +from pathlib import Path +import yaml +import json +import argparse + +def _proj_to_SO3(R: np.ndarray) -> np.ndarray: + """Project a near-rotation 3x3 matrix to the closest element in SO(3) via SVD.""" + U, _, Vt = np.linalg.svd(R) + R_star = U @ Vt + if np.linalg.det(R_star) < 0: + U[:, -1] *= -1 + R_star = U @ Vt + return R_star + +def _angle_between_rotations(Ra: np.ndarray, Rb: np.ndarray) -> float: + """Geodesic angle on SO(3): arccos( (trace(Ra^T Rb) - 1)/2 ).""" + M = Ra.T @ Rb + cos_th = (np.trace(M) - 1.0) * 0.5 + cos_th = np.clip(cos_th, -1.0, 1.0) + return float(np.arccos(cos_th)) + +def _quat_slerp(q0: np.ndarray, q1: np.ndarray, alpha: float) -> np.ndarray: + """Shortest-arc SLERP between unit quaternions q0 -> q1. 
Quaternions are (w,x,y,z).""" + q0 = q0 / np.linalg.norm(q0) + q1 = q1 / np.linalg.norm(q1) + dot = float(np.dot(q0, q1)) + if dot < 0.0: + q1 = -q1 + dot = -dot + dot = np.clip(dot, -1.0, 1.0) + if dot > 0.9995: + q = (1.0 - alpha) * q0 + alpha * q1 + return q / np.linalg.norm(q) + theta0 = np.arccos(dot) + s0 = np.sin((1 - alpha) * theta0) / np.sin(theta0) + s1 = np.sin(alpha * theta0) / np.sin(theta0) + q = s0 * q0 + s1 * q1 + return q / np.linalg.norm(q) + +def _generate_group24() -> np.ndarray: + """ + Generate 24 orientation-preserving signed permutation matrices (cube rotations). + Each is 3x3, has one ±1 per row/column, det=+1. + """ + mats = [] + perms = [(0,1,2),(0,2,1),(1,0,2),(1,2,0),(2,0,1),(2,1,0)] + for p in perms: + for sx in (-1,1): + for sy in (-1,1): + for sz in (-1,1): + M = np.zeros((3,3), float) + M[0, p[0]] = sx + M[1, p[1]] = sy + M[2, p[2]] = sz + if np.linalg.det(M) > 0: + mats.append(M) + mats = np.stack(mats, axis=0) + assert mats.shape[0] == 24 + return mats + +def smooth_object_local_trajectory( + poses: np.ndarray, + rot_smooth_alpha: float = 0.2, + trans_smooth_alpha: float = 0.2, + repair_SO3: bool = True, +) -> np.ndarray: + """ + Smooth a (N,4,4) trajectory with axis-flip/axis-permutation disambiguation in the OBJECT-LOCAL frame. + + Assumptions: + - Each pose T_t = [R_t, t_t; 0 0 0 1], rotation acts actively on column vectors. + - Coordinate re-labeling / flips (right-handed only) may occur in the object's local frame. + - Therefore we correct R_meas by RIGHT-multiplying S in G24, i.e., R_corr = R_meas @ S. + + Args: + poses: np.ndarray of shape (N, 4, 4), the raw trajectory. + rot_smooth_alpha: rotation smoothing factor in [0,1] (higher -> less smoothing). + trans_smooth_alpha: translation smoothing factor in [0,1]. + repair_SO3: if True, project measured rotations to SO(3) per frame (robust to noise/reflection). + + Returns: + np.ndarray of shape (N, 4, 4): the smoothed trajectory. + """ + assert poses.ndim == 3 and poses.shape[1:] == (4,4), "poses must be (N,4,4)" + N = poses.shape[0] + if N == 0: + return poses.copy() + + G = _generate_group24() + + out = np.zeros_like(poses, dtype=float) + R_prev = None + q_prev = None + t_prev = None + + for i in range(N): + T_in = poses[i].astype(float) + R_meas = T_in[:3, :3] + t_meas = T_in[:3, 3] + + if repair_SO3: + R_meas = _proj_to_SO3(R_meas) + + # --- Disambiguation in object-local frame: choose S to minimize distance to previous rotation. + if R_prev is not None: + best_angle = np.inf + R_best = R_meas + for S in G: + Rc = R_meas @ S # RIGHT-multiply: local/object-frame relabel/flip + ang = _angle_between_rotations(R_prev, Rc) + if ang < best_angle: + best_angle = ang + R_best = Rc + else: + R_best = R_meas + + # --- Quaternion sign continuity + SLERP smoothing. + q_best = transforms3d.quaternions.mat2quat(R_best) # (w,x,y,z) + if q_prev is not None and float(np.dot(q_best, q_prev)) < 0.0: + q_best = -q_best + q_smooth = q_best if q_prev is None else _quat_slerp(q_prev, q_best, rot_smooth_alpha) + R_smooth = transforms3d.quaternions.quat2mat(q_smooth) + + # --- Translation EMA smoothing (object-local relabeling doesn't change translation component). + t_smooth = t_meas if t_prev is None else (1.0 - trans_smooth_alpha) * t_prev + trans_smooth_alpha * t_meas + + # --- Compose output pose. + T_out = np.eye(4, dtype=float) + T_out[:3, :3] = R_smooth + T_out[:3, 3] = t_smooth + out[i] = T_out + + # --- Update memory. 
+        R_prev = R_smooth
+        q_prev = q_smooth
+        t_prev = t_smooth
+
+    return out
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Smooth trajectory poses")
+    parser.add_argument("--traj", type=str, default="fdpose_trajs", help="Trajectory key to load from each object in scene.json (e.g. fdpose_trajs)")
+    args = parser.parse_args()
+
+    cfg_path = Path.cwd() / "config" / "config.yaml"
+    cfg = yaml.safe_load(cfg_path.open("r"))
+    keys = cfg["keys"]
+
+    base_dir = Path.cwd()
+    out_dir = base_dir / "outputs"
+    for key in keys:
+        print(f"\n========== [Trajectory Smoothing] Processing key: {key} ==========")
+
+        scene_json = out_dir / key / "simulation" / "scene.json"
+        if not scene_json.exists():
+            raise FileNotFoundError(scene_json)
+
+        scene_dict = json.load(open(scene_json))
+        objects = scene_dict.get("objects", {})
+        if not isinstance(objects, dict) or len(objects) == 0:
+            print(f"[WARN][{key}] scene_dict['objects'] is empty.")
+            continue  # skip this key instead of aborting the whole batch
+
+        key_dir = out_dir / key
+        for i, obj in objects.items():
+            traj_path = obj[args.traj]
+            traj = np.load(traj_path).astype(np.float32)  # relative poses (N,4,4)
+
+            smoothed_trajs = smooth_object_local_trajectory(
+                traj,
+                rot_smooth_alpha=0.2,
+                trans_smooth_alpha=0.2,
+                repair_SO3=True,
+            ).astype(np.float32)
+            smoothed_path = key_dir / "reconstruction" / "motions" / f"{i}_{obj['name']}_smoothed_trajs.npy"
+            smoothed_path.parent.mkdir(parents=True, exist_ok=True)  # ensure the output dir exists
+            np.save(smoothed_path, smoothed_trajs)
+            scene_dict["objects"][i]["smooth_trajs"] = str(smoothed_path)
+
+        json.dump(scene_dict, open(scene_json, "w"), indent=2)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/openreal2sim/simulation/isaaclab-old/sim_preprocess/usd_conversion.py b/openreal2sim/simulation/isaaclab-old/sim_preprocess/usd_conversion.py
new file mode 100755
index 0000000..49b2f50
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab-old/sim_preprocess/usd_conversion.py
@@ -0,0 +1,584 @@
+########################################################
+## Commandline arguments (IsaacLab only; no --key)
+########################################################
+import argparse
+from isaaclab.app import AppLauncher
+
+parser = argparse.ArgumentParser()
+# NOTE: no --key here; we will batch keys from config/config.yaml
+AppLauncher.add_app_launcher_args(parser)
+args_isaaclab = parser.parse_args()
+# Force GUI unless you toggle via IsaacLab flags
+args_isaaclab.headless = False
+app_launcher = AppLauncher(args_isaaclab)
+simulation_app = app_launcher.app
+
+########################################################
+## Imports
+########################################################
+from dataclasses import dataclass
+from typing import Literal, List
+from pathlib import Path
+import numpy as np
+from PIL import Image
+import trimesh
+import json
+import yaml
+import contextlib
+import os
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+import carb
+import isaacsim.core.utils.stage as stage_utils
+import omni.kit.app
+from loguru import logger as log
+from isaaclab.sim.converters import (
+    MeshConverter,
+    MeshConverterCfg,
+    UrdfConverter,
+    UrdfConverterCfg,
+)
+from isaaclab.sim.schemas import schemas_cfg
+from isaaclab.utils.assets import check_file_path
+from pxr import Usd, UsdPhysics
+from rich.logging import RichHandler
+
+# Extra USD/PhysX schemas + tokens
+from pxr import PhysxSchema, Sdf, UsdGeom
+
+log.configure(handlers=[{"sink": RichHandler(), "format": "{message}"}])
+
+########################################################
+## Small utilities
+########################################################
+base_dir = 
Path.cwd() +out_dir = base_dir / "outputs" + +def glb_to_usd(filename: str) -> str: + assert filename.endswith(".glb") + return filename.replace(".glb", ".usd") + +def convert_glb_to_obj(glb_path): + """ + Convert a GLB file to an OBJ file with texture. + (Kept from your original file; unused by the flow unless you point inputs to .glb->.obj) + """ + obj_name = str(glb_path).split("/")[-1].split(".")[0] + obj_path = Path(glb_path).parent / f"{obj_name}.obj" + mtl_path = Path(glb_path).parent / f"{obj_name}.mtl" + texture_path = Path(glb_path).parent / f"{obj_name}.jpg" + + mesh = trimesh.load(glb_path) + + if hasattr(mesh.visual, 'material') and hasattr(mesh.visual.material, 'image'): + image = mesh.visual.material.image + if image is not None: + Image.fromarray(np.asarray(image)).save(texture_path) + + with open(mtl_path, "w") as mtl_file: + mtl_file.write( + "newmtl material_0\nKa 1.000 1.000 1.000\nKd 1.000 1.000 1.000\n" + "Ks 0.000 0.000 0.000\nNs 10.000\nd 1.0\nillum 2\n" + ) + mtl_file.write(f"map_Kd {texture_path.name}\n") + + with open(obj_path, "w") as obj_file: + obj_file.write(f"mtllib {mtl_path.name}\n") + for v in mesh.vertices: + obj_file.write(f"v {v[0]} {v[1]} {v[2]}\n") + for uv in mesh.visual.uv: + obj_file.write(f"vt {uv[0]} {1 - uv[1]}\n") + for f in mesh.faces: + obj_file.write(f"f {f[0]+1}/{f[0]+1} {f[1]+1}/{f[1]+1} {f[2]+1}/{f[2]+1}\n") + + print(f"Converted GLB to OBJ: {obj_path}") + return str(obj_path) + +######################################################## +## Args container (same fields as before) +######################################################## +@dataclass +class Args: + input: List[str] + output: List[str] + make_instanceable: bool = False + # We will enforce convexDecomposition for both bg and objects in run_conversion() + collision_approximation: Literal["convexHull", "convexDecomposition", "meshSimplification", "none"] = "convexDecomposition" + mass: float | None = 1.0 + disable_gravity: bool = False + kinematic_enabled: bool = False + headless: bool = False + exit_on_finish: bool = True + +# We'll assign a per-key Args instance to this global so helper functions keep working unchanged. 
+args = None # type: ignore + +######################################################## +## Converters and USD post-processing (unchanged) +######################################################## +def convert_obj_to_usd(obj_path, usd_path): + log.info(f"Converting {obj_path}") + + if not os.path.isabs(obj_path): + obj_path = os.path.abspath(obj_path) + if not check_file_path(obj_path): + raise ValueError(f"Invalid mesh file path: {obj_path}") + + usd_path = os.path.abspath(usd_path) + + mass_props = schemas_cfg.MassPropertiesCfg(mass=args.mass) if args.mass is not None else None + rigid_props = schemas_cfg.RigidBodyPropertiesCfg( + disable_gravity=args.disable_gravity, + kinematic_enabled=args.kinematic_enabled, + ) + collision_props = schemas_cfg.CollisionPropertiesCfg( + collision_enabled=args.collision_approximation != "none", + contact_offset=0.006, + rest_offset=-0.002, + torsional_patch_radius=0.05, + min_torsional_patch_radius=0.005, + ) + + mesh_converter_cfg = MeshConverterCfg( + mass_props=mass_props, + rigid_props=rigid_props, + collision_props=collision_props, + asset_path=obj_path, + force_usd_conversion=True, + usd_dir=os.path.dirname(usd_path), + usd_file_name=os.path.basename(usd_path), + make_instanceable=args.make_instanceable, + collision_approximation="convexDecomposition", # enforce + ) + + log.info(f"Conversion configuration: {mesh_converter_cfg}") + MeshConverter(mesh_converter_cfg) + +def ensure_rigidbody_and_set_kinematic(usd_path: str, disable_gravity: bool = True): + stage = Usd.Stage.Open(usd_path) + prim = stage.GetDefaultPrim() + if not prim or not prim.IsValid(): + for p in stage.GetPseudoRoot().GetChildren(): + prim = p + break + if not prim or not prim.IsValid(): + raise RuntimeError(f"[ensure_rigidbody_and_set_kinematic] No valid prim in stage: {usd_path}") + + UsdPhysics.RigidBodyAPI.Apply(prim) + try: + rb = PhysxSchema.PhysxRigidBodyAPI.Apply(prim) + if hasattr(rb, "CreateKinematicEnabledAttr"): + rb.CreateKinematicEnabledAttr(True) + else: + prim.CreateAttribute("physxRigidBody:kinematicEnabled", Sdf.ValueTypeNames.Bool).Set(True) + if hasattr(rb, "CreateDisableGravityAttr"): + rb.CreateDisableGravityAttr(bool(disable_gravity)) + else: + prim.CreateAttribute("physxRigidBody:disableGravity", Sdf.ValueTypeNames.Bool).Set(bool(disable_gravity)) + except Exception: + prim.CreateAttribute("physxRigidBody:kinematicEnabled", Sdf.ValueTypeNames.Bool).Set(True) + prim.CreateAttribute("physxRigidBody:disableGravity", Sdf.ValueTypeNames.Bool).Set(bool(disable_gravity)) + try: + UsdPhysics.RigidBodyAPI(prim).CreateDisableGravityAttr(bool(disable_gravity)) + except Exception: + prim.CreateAttribute("physics:disableGravity", Sdf.ValueTypeNames.Bool).Set(bool(disable_gravity)) + stage.Save() + +def enable_ccd_and_iters(usd_path: str, pos_iters: int = 14, vel_iters: int = 4, + enable_ccd: bool = True, enable_speculative_ccd: bool = True): + stage = Usd.Stage.Open(usd_path) + prim = stage.GetDefaultPrim() + if not prim or not prim.IsValid(): + for p in stage.Traverse(): + prim = p + break + if not prim or not prim.IsValid(): + raise RuntimeError(f"[enable_ccd_and_iters] No valid prim in stage: {usd_path}") + + UsdPhysics.RigidBodyAPI.Apply(prim) + try: + rb = PhysxSchema.PhysxRigidBodyAPI.Apply(prim) + if enable_ccd: + (rb.CreateEnableCCDAttr(True) if hasattr(rb, "CreateEnableCCDAttr") + else prim.CreateAttribute("physxRigidBody:enableCCD", Sdf.ValueTypeNames.Bool).Set(True)) + if enable_speculative_ccd: + (rb.CreateEnableSpeculativeCCDAttr(True) if hasattr(rb, 
"CreateEnableSpeculativeCCDAttr") + else prim.CreateAttribute("physxRigidBody:enableSpeculativeCCD", Sdf.ValueTypeNames.Bool).Set(True)) + (rb.CreateSolverPositionIterationCountAttr(int(pos_iters)) if hasattr(rb, "CreateSolverPositionIterationCountAttr") + else prim.CreateAttribute("physxRigidBody:solverPositionIterationCount", Sdf.ValueTypeNames.Int).Set(int(pos_iters))) + (rb.CreateSolverVelocityIterationCountAttr(int(vel_iters)) if hasattr(rb, "CreateSolverVelocityIterationCountAttr") + else prim.CreateAttribute("physxRigidBody:solverVelocityIterationCount", Sdf.ValueTypeNames.Int).Set(int(vel_iters))) + except Exception: + if enable_ccd: + prim.CreateAttribute("physxRigidBody:enableCCD", Sdf.ValueTypeNames.Bool).Set(True) + if enable_speculative_ccd: + prim.CreateAttribute("physxRigidBody:enableSpeculativeCCD", Sdf.ValueTypeNames.Bool).Set(True) + prim.CreateAttribute("physxRigidBody:solverPositionIterationCount", Sdf.ValueTypeNames.Int).Set(int(pos_iters)) + prim.CreateAttribute("physxRigidBody:solverVelocityIterationCount", Sdf.ValueTypeNames.Int).Set(int(vel_iters)) + stage.Save() + +def set_convex_decomposition_params(usd_path: str, + voxel_resolution: int = 500_000, + max_convex_hulls: int = 24, + max_vertices_per_hull: int = 64, + concavity: float = 0.002, + shrink_wrap: bool = True, + overlap: bool = False, + contact_offset: float = 0.006, + rest_offset: float = -0.002): + stage = Usd.Stage.Open(usd_path) + root = stage.GetDefaultPrim() + if not root or not root.IsValid(): + return + + for prim in stage.Traverse(): + if not prim or not prim.IsValid(): + continue + if not prim.GetPath().HasPrefix(root.GetPath()): + continue + if not prim.IsA(UsdGeom.Mesh): + continue + + UsdPhysics.CollisionAPI.Apply(prim) + mesh_col = UsdPhysics.MeshCollisionAPI.Apply(prim) + + try: + mesh_col.CreateApproximationAttr(PhysxSchema.Tokens.convexDecomposition) + except Exception: + try: + mesh_col.GetApproximationAttr().Set("convexDecomposition") + except Exception: + prim.CreateAttribute("physics:approximation", Sdf.ValueTypeNames.Token).Set("convexDecomposition") + + try: + cd_api = PhysxSchema.PhysxConvexDecompositionCollisionAPI.Apply(prim) + if hasattr(cd_api, "CreateVoxelResolutionAttr"): + cd_api.CreateVoxelResolutionAttr(int(voxel_resolution)) + else: + prim.CreateAttribute("physxConvexDecomposition:voxelResolution", Sdf.ValueTypeNames.Int).Set(int(voxel_resolution)) + if hasattr(cd_api, "CreateMaxConvexHullsAttr"): + cd_api.CreateMaxConvexHullsAttr(int(max_convex_hulls)) + else: + prim.CreateAttribute("physxConvexDecomposition:maxConvexHulls", Sdf.ValueTypeNames.Int).Set(int(max_convex_hulls)) + if hasattr(cd_api, "CreateMaxNumVerticesPerCHAttr"): + cd_api.CreateMaxNumVerticesPerCHAttr(int(max_vertices_per_hull)) + else: + prim.CreateAttribute("physxConvexDecomposition:maxNumVerticesPerCH", Sdf.ValueTypeNames.Int).Set(int(max_vertices_per_hull)) + if hasattr(cd_api, "CreateConcavityAttr"): + cd_api.CreateConcavityAttr(float(concavity)) + else: + prim.CreateAttribute("physxConvexDecomposition:concavity", Sdf.ValueTypeNames.Float).Set(float(concavity)) + if hasattr(cd_api, "CreateShrinkWrapAttr"): + cd_api.CreateShrinkWrapAttr(bool(shrink_wrap)) + else: + prim.CreateAttribute("physxConvexDecomposition:shrinkWrap", Sdf.ValueTypeNames.Bool).Set(bool(shrink_wrap)) + if hasattr(cd_api, "CreateOverlapAttr"): + cd_api.CreateOverlapAttr(bool(overlap)) + else: + prim.CreateAttribute("physxConvexDecomposition:overlap", Sdf.ValueTypeNames.Bool).Set(bool(overlap)) + except Exception: + 
prim.CreateAttribute("physxConvexDecomposition:voxelResolution", Sdf.ValueTypeNames.Int).Set(int(voxel_resolution)) + prim.CreateAttribute("physxConvexDecomposition:maxConvexHulls", Sdf.ValueTypeNames.Int).Set(int(max_convex_hulls)) + prim.CreateAttribute("physxConvexDecomposition:maxNumVerticesPerCH", Sdf.ValueTypeNames.Int).Set(int(max_vertices_per_hull)) + prim.CreateAttribute("physxConvexDecomposition:concavity", Sdf.ValueTypeNames.Float).Set(float(concavity)) + prim.CreateAttribute("physxConvexDecomposition:shrinkWrap", Sdf.ValueTypeNames.Bool).Set(bool(shrink_wrap)) + prim.CreateAttribute("physxConvexDecomposition:overlap", Sdf.ValueTypeNames.Bool).Set(bool(overlap)) + + try: + col_api = PhysxSchema.PhysxCollisionAPI.Apply(prim) + if hasattr(col_api, "CreateContactOffsetAttr"): + col_api.CreateContactOffsetAttr(float(contact_offset)) + else: + prim.CreateAttribute("physxCollision:contactOffset", Sdf.ValueTypeNames.Float).Set(float(contact_offset)) + if hasattr(col_api, "CreateRestOffsetAttr"): + col_api.CreateRestOffsetAttr(float(rest_offset)) + else: + prim.CreateAttribute("physxCollision:restOffset", Sdf.ValueTypeNames.Float).Set(float(rest_offset)) + except Exception: + prim.CreateAttribute("physxCollision:contactOffset", Sdf.ValueTypeNames.Float).Set(float(contact_offset)) + prim.CreateAttribute("physxCollision:restOffset", Sdf.ValueTypeNames.Float).Set(float(rest_offset)) + + try: + UsdGeom.Mesh(prim).CreateDoubleSidedAttr(True) + except Exception: + pass + + stage.Save() + +def make_visual_names_unique(xml_string: str) -> str: + tree = ET.ElementTree(ET.fromstring(xml_string)) + root = tree.getroot() + elements_with_name = root.findall(".//visual") + name_counts = defaultdict(int) + for element in elements_with_name: + name = element.get("name") + name_counts[name] += 1 + for element in elements_with_name: + name = element.get("name") + if name_counts[name] > 1: + count = name_counts[name] + new_name = f"{name}{count}" + element.set("name", new_name) + name_counts[name] -= 1 + return ET.tostring(root, encoding="unicode") + +def make_urdf_with_unique_visual_names(urdf_path: str) -> str: + xml_str = open(urdf_path, "r").read() + xml_str = '' + "\n" + make_visual_names_unique(xml_str) + urdf_unique_path = urdf_path.replace(".urdf", "_unique.urdf") + with open(urdf_unique_path, "w") as f: + f.write(xml_str) + return urdf_unique_path + +def make_obj_without_slashes(obj_path: str) -> str: + obj_str = open(obj_path, "r").read() + obj_str = obj_str.replace("/", "") + obj_clean_path = obj_path.replace(".obj", "_clean.obj") + with open(obj_clean_path, "w") as f: + f.write(obj_str) + return obj_clean_path + +def convert_urdf_to_usd(urdf_path, usd_path): + log.info(f"Converting {urdf_path}") + if not os.path.isabs(urdf_path): + urdf_path = os.path.abspath(urdf_path) + urdf_converter_cfg = UrdfConverterCfg( + asset_path=urdf_path, + usd_dir=os.path.dirname(usd_path), + usd_file_name=os.path.basename(usd_path), + make_instanceable=args.make_instanceable, + force_usd_conversion=True, + fix_base=True, + ) + UrdfConverter(urdf_converter_cfg) + +def is_articulation(usd_path: str): + joint_count = 0 + stage = Usd.Stage.Open(usd_path) + for prim in stage.Traverse(): + if prim.IsA(UsdPhysics.Joint): + joint_count += 1 + return joint_count > 0 + +def apply_rigidbody_api(usd_path): + stage = Usd.Stage.Open(usd_path) + defaultPrim = stage.GetDefaultPrim() + UsdPhysics.RigidBodyAPI.Apply(defaultPrim) + stage.Save() + +def count_rigid_api(usd_path): + stage = Usd.Stage.Open(usd_path) + rigid_api_count = 
0 + for prim in stage.Traverse(): + if prim.HasAPI(UsdPhysics.RigidBodyAPI): + rigid_api_count += 1 + return rigid_api_count + +######################################################## +## Per-key conversion (batch) +######################################################## +def run_conversion_for_key(key: str): + global args + + scene_json = out_dir / key / "simulation" / "scene.json" + assert scene_json.exists(), f"[{key}] scene.json not found: {scene_json}" + scene_dict = json.load(open(scene_json, "r")) + + # Build IO lists (background always first). Prefer "refined" if present. + input_list, output_list = [scene_dict["background"]["registered"]], [glb_to_usd(scene_dict["background"]["registered"])] + scene_dict["background"]["usd"] = glb_to_usd(scene_dict["background"]["registered"]) + for idx, obj in scene_dict["objects"].items(): + input_list.append(obj["optimized"]) + output_list.append(glb_to_usd(obj["optimized"])) + scene_dict["objects"][idx]["usd"] = glb_to_usd(obj["optimized"]) + + # Persist USD paths back to scene.json + with open(scene_json, 'w') as f: + json.dump(scene_dict, f, indent=2) + + # Create a fresh Args for this key + args = Args(input=input_list, output=output_list, make_instanceable=False, + collision_approximation="convexDecomposition", + mass=1.0, disable_gravity=False, kinematic_enabled=False, + headless=args_isaaclab.headless, exit_on_finish=True) + + # Run the original main conversion logic for this key + log.info(f"[{key}] converting {len(args.input)} assets...") + run_conversion(scene_dict) + +def run_conversion(scene_dict: dict): + """This is the body of your original `main()` but parameterized and reused per key.""" + if isinstance(args.input, list) and isinstance(args.output, list): + for input_path, output_path in zip(args.input, args.output): + is_background = (input_path == scene_dict["background"]["registered"]) + + if input_path.endswith(".obj") or input_path.endswith(".glb"): + # Save current, then enforce per-asset overrides + prev_mass = args.mass + prev_approx = args.collision_approximation + prev_disable_grav = args.disable_gravity + prev_kinematic = args.kinematic_enabled + + args.collision_approximation = "convexDecomposition" + + if is_background: + args.mass = None + args.disable_gravity = True + args.kinematic_enabled = True + else: + args.mass = 1.0 + args.disable_gravity = False + args.kinematic_enabled = False + + convert_obj_to_usd(input_path, output_path) + + if is_background: + apply_rigidbody_api(output_path) + ensure_rigidbody_and_set_kinematic(output_path, disable_gravity=True) + set_convex_decomposition_params( + output_path, + voxel_resolution=600_000, + max_convex_hulls=512, + max_vertices_per_hull=128, + concavity=0.0005, + shrink_wrap=True, + overlap=False, + contact_offset=0.006, + rest_offset=-0.002 + ) + else: + apply_rigidbody_api(output_path) + enable_ccd_and_iters(output_path, pos_iters=14, vel_iters=4, + enable_ccd=True, enable_speculative_ccd=True) + set_convex_decomposition_params( + output_path, + voxel_resolution=600_000, + max_convex_hulls=24, + max_vertices_per_hull=64, + concavity=0.002, + shrink_wrap=True, + overlap=False, + contact_offset=0.006, + rest_offset=-0.002 + ) + + # Restore global args + args.mass = prev_mass + args.collision_approximation = prev_approx + args.disable_gravity = prev_disable_grav + args.kinematic_enabled = prev_kinematic + + log.info(f'Rigid body count: {count_rigid_api(output_path)}') + log.info(f"Saved USD file to {os.path.abspath(output_path)}") + + elif 
input_path.endswith(".urdf"): + urdf_unique_path = make_urdf_with_unique_visual_names(input_path) + convert_urdf_to_usd(urdf_unique_path, output_path) + + if is_background: + apply_rigidbody_api(output_path) + ensure_rigidbody_and_set_kinematic(output_path, disable_gravity=True) + set_convex_decomposition_params( + output_path, + voxel_resolution=600_000, + max_convex_hulls=28, + max_vertices_per_hull=64, + concavity=0.0015, + shrink_wrap=True, + overlap=False, + contact_offset=0.006, + rest_offset=-0.002 + ) + elif not is_articulation(output_path): + apply_rigidbody_api(output_path) + enable_ccd_and_iters(output_path, pos_iters=14, vel_iters=4, + enable_ccd=True, enable_speculative_ccd=True) + set_convex_decomposition_params( + output_path, + voxel_resolution=600_000, + max_convex_hulls=24, + max_vertices_per_hull=64, + concavity=0.002, + shrink_wrap=True, + overlap=False, + contact_offset=0.006, + rest_offset=-0.002 + ) + + log.info(f'Rigid body count: {count_rigid_api(output_path)}') + log.info(f"Saved USD file to {os.path.abspath(output_path)}") + + else: + # Single-path branches kept for completeness (not used in batch flow) + input_path = args.input + output_path = args.output + if input_path.endswith(".obj") or input_path.endswith(".glb"): + convert_obj_to_usd(input_path, output_path) + apply_rigidbody_api(output_path) + enable_ccd_and_iters(output_path, pos_iters=14, vel_iters=4, + enable_ccd=True, enable_speculative_ccd=True) + set_convex_decomposition_params( + output_path, + voxel_resolution=600_000, + max_convex_hulls=24, + max_vertices_per_hull=64, + concavity=0.002, + shrink_wrap=True, + overlap=False, + contact_offset=0.006, + rest_offset=-0.002 + ) + log.info(f'Rigid body count: {count_rigid_api(output_path)}') + log.info(f"Saved USD file to {os.path.abspath(output_path)}") + + elif input_path.endswith(".urdf"): + urdf_unique_path = make_urdf_with_unique_visual_names(input_path) + convert_urdf_to_usd(urdf_unique_path, output_path) + apply_rigidbody_api(output_path) + enable_ccd_and_iters(output_path, pos_iters=14, vel_iters=4, + enable_ccd=True, enable_speculative_ccd=True) + set_convex_decomposition_params( + output_path, + voxel_resolution=600_000, + max_convex_hulls=24, + max_vertices_per_hull=64, + concavity=0.002, + shrink_wrap=True, + overlap=False, + contact_offset=0.006, + rest_offset=-0.002 + ) + log.info(f'Rigid body count: {count_rigid_api(output_path)}') + log.info(f"Saved USD file to {os.path.abspath(output_path)}") + +######################################################## +## Batch entry +######################################################## +def main(): + # Load keys from config/config.yaml and run per key + cfg = yaml.safe_load((base_dir / "config" / "config.yaml").open("r")) + keys = cfg["keys"] + for key in keys: + log.info(f"\n========== [USD Convert] key: {key} ==========") + run_conversion_for_key(key) + + # Exit or keep GUI (same behavior as your original main) + if args and args.exit_on_finish: + return + + if args and not args.headless: + carb_settings_iface = carb.settings.get_settings() + local_gui = carb_settings_iface.get("/app/window/enabled") + livestream_gui = carb_settings_iface.get("/app/livestream/enabled") + if local_gui or livestream_gui: + # If you want to open the last output USD in a viewer, uncomment below + # stage_utils.open_stage(args.output[-1] if isinstance(args.output, list) else args.output) + app = omni.kit.app.get_app_interface() + with contextlib.suppress(KeyboardInterrupt): + while app.is_running(): + app.update() + 
+if __name__ == "__main__":
+    main()
+    simulation_app.close()
diff --git a/openreal2sim/simulation/isaaclab-old/sim_trajectory_replay.py b/openreal2sim/simulation/isaaclab-old/sim_trajectory_replay.py
new file mode 100755
index 0000000..1ed673c
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab-old/sim_trajectory_replay.py
@@ -0,0 +1,143 @@
+"""
+Replay existing trajectories in Isaac Lab simulation.
+"""
+from __future__ import annotations
+
+# ─────────── AppLauncher ───────────
+import argparse, os, json, random, transforms3d
+from pathlib import Path
+import numpy as np
+import torch
+import yaml
+from isaaclab.app import AppLauncher
+
+# ─────────── CLI ───────────
+parser = argparse.ArgumentParser("sim_policy")
+parser.add_argument("--demo_dir", type=str, help="directory of the recorded robot trajectory")
+parser.add_argument("--key", type=str, default=None, help="scene key (outputs/<key>)")
+parser.add_argument("--robot", type=str, default="franka")
+parser.add_argument("--num_envs", type=int, default=1)
+parser.add_argument("--num_trials", type=int, default=1)
+parser.add_argument("--teleop_device", type=str, default="keyboard")
+parser.add_argument("--sensitivity", type=float, default=1.0)
+AppLauncher.add_app_launcher_args(parser)
+args_cli = parser.parse_args()
+args_cli.enable_cameras = True
+args_cli.headless = False  # GUI mode; set to True for headless batch execution
+app_launcher = AppLauncher(vars(args_cli))
+simulation_app = app_launcher.app
+
+# ─────────── Runtime imports ───────────
+import isaaclab.sim as sim_utils
+from isaaclab.utils.math import subtract_frame_transforms
+
+from graspnetAPI.grasp import GraspGroup
+
+
+# ─────────── Simulation environments ───────────
+from sim_base import BaseSimulator, get_next_demo_id
+from sim_env_factory import make_env
+from sim_preprocess.grasp_utils import get_best_grasp_with_hints
+from sim_utils.transform_utils import pose_to_mat, mat_to_pose, grasp_to_world, grasp_approach_axis_batch
+from sim_utils.sim_utils import load_sim_parameters
+
+BASE_DIR = Path.cwd()
+sim_cfgs = json.load(open(Path(args_cli.demo_dir) / "config.json", "r"))
+args_cli.key = sim_cfgs["key"]
+img_folder = args_cli.key
+out_dir = BASE_DIR / "outputs"
+
+
+# ──────────────────────────── Trajectory Replay ────────────────────────────
+class HeuristicManipulation(BaseSimulator):
+    """
+    Replays a recorded end-effector action sequence:
+      • Loads per-step actions (position + quaternion + gripper state);
+      • Executes each waypoint with motion planning via BaseSimulator.move_to;
+      • Aborts and reports failure if any waypoint cannot be reached.
+ """ + def __init__(self, sim, scene, sim_cfgs: dict): + robot_pose = torch.tensor( + sim_cfgs["robot_cfg"]["robot_pose"], + dtype=torch.float32, + device=sim.device + ) + super().__init__( + sim=sim, sim_cfgs=sim_cfgs, scene=scene, args=args_cli, + robot_pose=robot_pose, cam_dict=sim_cfgs["cam_cfg"], + out_dir=out_dir, img_folder=img_folder, + enable_motion_planning=True, + set_physics_props=True, debug_level=0, + ) + + self.selected_object_id = sim_cfgs["demo_cfg"]["manip_object_id"] + self.traj_key = sim_cfgs["demo_cfg"]["traj_key"] + self.traj_path = sim_cfgs["demo_cfg"]["traj_path"] + self.goal_offset = [0, 0, sim_cfgs["demo_cfg"]["goal_offset"]] + self.grasp_path = sim_cfgs["demo_cfg"]["grasp_path"] + self.grasp_idx = sim_cfgs["demo_cfg"]["grasp_idx"] + self.grasp_pre = sim_cfgs["demo_cfg"]["grasp_pre"] + self.grasp_delta = sim_cfgs["demo_cfg"]["grasp_delta"] + + # ---------- Helpers ---------- + def _to_base(self, pos_w: np.ndarray | torch.Tensor, quat_w: np.ndarray | torch.Tensor): + """World → robot base frame for all envs.""" + root = self.robot.data.root_state_w[:, 0:7] # [B,7] + p_w, q_w = self._ensure_batch_pose(pos_w, quat_w) + pb, qb = subtract_frame_transforms( + root[:, 0:3], root[:, 3:7], p_w, q_w + ) + return pb, qb # [B,3], [B,4] + + def replay_actions(self, actions: np.ndarray): + """ + Replay a sequence of recorded actions: (p[B,3], q[B,4], gripper[B,1]) + """ + n_steps = actions.shape[0] + + self.reset() + self.wait(gripper_open=True, steps=10) + + for t in range(n_steps): + print(f"[INFO] replay step {t}/{n_steps}") + act = actions[t:t+1] + p_b = torch.as_tensor(act[:, 0:3], dtype=torch.float32, device=self.sim.device) + q_b = torch.as_tensor(act[:, 3:7], dtype=torch.float32, device=self.sim.device) + g_b = act[:, 7] < 0.5 + jp, success = self.move_to(p_b, q_b, gripper_open=g_b) + if torch.any(success==False): + print(f"[ERR] replay step {t} failed.") + return False + jp = self.wait(gripper_open=g_b, steps=3) + return True + +# ──────────────────────────── Entry Point ──────────────────────────── +def main(): + env, _ = make_env( + cfgs=sim_cfgs, num_envs=args_cli.num_envs, + device=args_cli.device, + bg_simplify=False, + ) + sim, scene = env.sim, env.scene + + my_sim = HeuristicManipulation(sim, scene, sim_cfgs=sim_cfgs) + + demo_root = (out_dir / img_folder / "demos").resolve() + + for _ in range(args_cli.num_trials): + + robot_pose = torch.tensor(sim_cfgs["robot_cfg"]["robot_pose"], dtype=torch.float32, device=my_sim.sim.device) # [7], pos(3)+quat(wxyz)(4) + my_sim.set_robot_pose(robot_pose) + my_sim.demo_id = get_next_demo_id(demo_root) + my_sim.reset() + print(f"[INFO] start simulation demo_{my_sim.demo_id}") + actions = np.load(Path(args_cli.demo_dir) / "actions.npy") + my_sim.replay_actions(actions) + + env.close() + simulation_app.close() + +if __name__ == "__main__": + main() + os.system("quit()") + simulation_app.close() diff --git a/openreal2sim/simulation/isaaclab-old/sim_utils/calibration_utils.py b/openreal2sim/simulation/isaaclab-old/sim_utils/calibration_utils.py new file mode 100755 index 0000000..e8c3666 --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/sim_utils/calibration_utils.py @@ -0,0 +1,88 @@ +import numpy as np +import transforms3d + +def _as_homo(T): + A = np.asarray(T, dtype=np.float64) + if A.shape == (4, 4): + return A + if A.shape == (3, 4): + M = np.eye(4, dtype=np.float64); M[:3, :4] = A; return M + if A.shape == (3, 3): + M = np.eye(4, dtype=np.float64); M[:3, :3] = A; return M + raise ValueError(f"Unsupported shape: 
{A.shape}") + +def _blk3(R): + M = np.eye(4, dtype=np.float64); M[:3, :3] = R; return M + +def _quat_wxyz(R): + q = transforms3d.quaternions.mat2quat(R) # (w,x,y,z) + return -q if q[0] < 0 else q + +def load_extrinsics(config: dict, key: str) -> np.ndarray: + """ + Load the extrinsic parameters from the scene JSON. + """ + + E_list = config.get("extrinsics", None) + if E_list is None: + print(f"Warning: No calibrated extrinsics found for {key}. Use random robot poses.") + return None + E = np.array(E_list).reshape(4, 4) + print(f"Loaded extrinsics for {key}:") + print(E) + return E + + +def calibration_to_robot_pose(scene_js: dict, T_camopencv_to_robotbase_m) -> tuple[np.ndarray, np.ndarray]: + """ + Reproduce the real-world calibrated camera to robot transform into simulation + Inputs: + scene_js: The scene JSON containing camera extrinsics + T_camopencv_to_robotbase_m: The transform matrix from camera_optical (in opencv camera format) to robot_base + Outputs: + pos_w: The position of the robot base in world coordinates + quat_w: The orientation of the robot base in world coordinates (as a quaternion) + meta: A dictionary containing additional information about the calibration + """ + + T_camopencv_to_world = _as_homo(scene_js["camera"]["camera_opencv_to_world"]) + + T_camopencv_to_robotbase = _as_homo(T_camopencv_to_robotbase_m) + + T_robotbase_to_world = T_camopencv_to_world @ np.linalg.inv(T_camopencv_to_robotbase) + + pos_w = T_robotbase_to_world[:3, 3].copy() + quat_w = _quat_wxyz(T_robotbase_to_world[:3, :3]) + + return pos_w, quat_w + +def calibration_to_robot_pose_deprecated(scene_js: dict, T_camisaac_to_robotbase_m) -> tuple[np.ndarray, np.ndarray]: + """ + Reproduce the real-world calibrated camera to robot transform into simulation + Inputs: + scene_js: The scene JSON containing camera extrinsics + T_camisaac_to_robotbase_m: The transform matrix from camera_isaac to robot_base + Outputs: + pos_w: The position of the robot base in world coordinates + quat_w: The orientation of the robot base in world coordinates (as a quaternion) + meta: A dictionary containing additional information about the calibration + """ + + T_camopencv_to_world = _as_homo(scene_js["camera"]["camera_opencv_to_world"]) + + T_camisaac_to_robotbase = _as_homo(T_camisaac_to_robotbase_m) + + # This may be the true transformation + # REP-103: camera_opencv(OpenCV: +x right, +y down, +z forward) -> camera_isaac(IsaacSim: +x forward, +y left, +z up) + T_camopencv_to_camisaac = np.array([[ 0, 0, 1], + [-1, 0, 0], + [ 0, -1, 0]], dtype=np.float64) + + T_camopencv_to_robotbase = T_camisaac_to_robotbase @ _blk3(T_camopencv_to_camisaac) # base -> camera_link + + T_robotbase_to_world = T_camopencv_to_world @ np.linalg.inv(T_camopencv_to_robotbase) + + pos_w = T_robotbase_to_world[:3, 3].copy() + quat_w = _quat_wxyz(T_robotbase_to_world[:3, :3]) + + return pos_w, quat_w diff --git a/openreal2sim/simulation/isaaclab-old/sim_utils/grasp_group_utils.py b/openreal2sim/simulation/isaaclab-old/sim_utils/grasp_group_utils.py new file mode 100644 index 0000000..950d8f5 --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/sim_utils/grasp_group_utils.py @@ -0,0 +1,71 @@ +""" +Light-weight GraspGroup fallback. + +If graspnetAPI is installed we reuse its GraspGroup. Otherwise we provide a very +small compatible subset that can load npy files saved by graspnetAPI and expose +rotation/translation information for the manipulation scripts. 
+""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Sequence, Union + +import numpy as np + +try: # pragma: no cover - only executed when dependency exists. + from graspnetAPI.grasp import GraspGroup as _ExternalGraspGroup # type: ignore + + GraspGroup = _ExternalGraspGroup +except ImportError: + + @dataclass + class SimpleGrasp: + rotation_matrix: np.ndarray # (3, 3) + translation: np.ndarray # (3,) + + class GraspGroup: + """Minimal GraspGroup replacement supporting npy loading and indexing.""" + + def __init__(self, array: np.ndarray | None = None): + if array is None: + self._array = np.zeros((0, 17), dtype=np.float32) + else: + self._array = np.asarray(array, dtype=np.float32) + + @property + def grasp_group_array(self) -> np.ndarray: + return self._array + + def __len__(self) -> int: + return len(self._array) + + def __getitem__( + self, index: Union[int, slice, Sequence[int], np.ndarray] + ) -> Union[SimpleGrasp, "GraspGroup"]: + if isinstance(index, int): + row = self._array[index] + return SimpleGrasp( + rotation_matrix=row[4:13].reshape(3, 3), + translation=row[13:16], + ) + if isinstance(index, slice): + return GraspGroup(self._array[index]) + if isinstance(index, (list, tuple, np.ndarray)): + return GraspGroup(self._array[index]) + raise TypeError(f"Unsupported index type {type(index)!r} for GraspGroup") + + def from_npy(self, npy_file_path: str) -> "GraspGroup": + data = np.load(npy_file_path) + if data.ndim == 1: + data = data.reshape(1, -1) + self._array = data.astype(np.float32, copy=False) + return self + + # convenience for legacy code expecting classmethod behaviour + @classmethod + def load(cls, npy_file_path: str) -> "GraspGroup": + return cls().from_npy(npy_file_path) + +__all__ = ["GraspGroup"] + + diff --git a/openreal2sim/simulation/isaaclab-old/sim_utils/sim_utils.py b/openreal2sim/simulation/isaaclab-old/sim_utils/sim_utils.py new file mode 100755 index 0000000..2ddd68b --- /dev/null +++ b/openreal2sim/simulation/isaaclab-old/sim_utils/sim_utils.py @@ -0,0 +1,257 @@ +import json +from pathlib import Path +import yaml +import torch +import random +import numpy as np +import transforms3d + +from sim_utils.calibration_utils import calibration_to_robot_pose, load_extrinsics + +default_config = { + "physics": "default", + "extrinsics": None, + "goal_offset": 0, + "grasp_idx": -1, + "grasp_pre": None, + "grasp_delta": None, + "traj_key": "fdpose_trajs", # "fdpose_trajs", "simple_trajs", "hybrid_trajs" + "manip_object_id": "1", +} + +def compose_configs(key_name: str, config: dict) -> dict: + ret_key_config = {} + local_config = config["local"].get(key_name, {}) + local_config = local_config.get("simulation", {}) + global_config = config.get("global", {}) + global_config = global_config.get("simulation", {}) + for param in default_config.keys(): + value = local_config.get(param, global_config.get(param, default_config[param])) + ret_key_config[param] = value + print(f"[Info] Config for {key_name}: {ret_key_config}") + return ret_key_config + +def load_sim_parameters(basedir, key) -> dict: + scene_json_path = Path(basedir) / "outputs" / key / "simulation" / "scene.json" + scene_json = json.load(open(scene_json_path, "r")) + exp_config = yaml.load(open(Path(basedir) / "config/config.yaml"), Loader=yaml.FullLoader) + exp_config = compose_configs(key, exp_config) + + # cam configs + cam_cfg = { + "width": int(scene_json["camera"]["width"]), + "height": int(scene_json["camera"]["height"]), + "fx": 
float(scene_json["camera"]["fx"]), + "fy": float(scene_json["camera"]["fy"]), + "cx": float(scene_json["camera"]["cx"]), + "cy": float(scene_json["camera"]["cy"]), + "cam_orientation": tuple(scene_json["camera"]["camera_heading_wxyz"]), + "scene_info": { + "move_to": list(scene_json["camera"]["camera_position"]), + "scene_min": list(scene_json["aabb"]["scene_min"]), + "scene_max": list(scene_json["aabb"]["scene_max"]), + "object_center":list(scene_json["objects"]["1"]["object_center"]), + }, + } + + # robot configs + E = load_extrinsics(exp_config, key) + if E is None: + robot_cfg = random.choice(robot_placement_candidates_v2(cam_cfg["scene_info"])) + pos_w, quat_w = robot_cfg["position"], robot_cfg["rotation"] + else: + pos_w, quat_w = calibration_to_robot_pose(scene_json, E) + robot_pose = list(pos_w) + list(quat_w) + robot_cfg = { + "robot_pose": robot_pose, + } + + # demo configs + goal_offset = exp_config.get("goal_offset", 0) + grasp_idx = exp_config.get("grasp_idx", -1) + grasp_pre = exp_config.get("grasp_pre", None) + grasp_delta = exp_config.get("grasp_delta", None) + traj_key = exp_config.get("traj_key", "fdpose_trajs") # "fdpose_trajs", "simple_trajs", "hybrid_trajs" + manip_object_id = exp_config.get("manip_object_id", "1") + traj_path = scene_json["objects"][manip_object_id][traj_key] + grasp_path = scene_json["objects"][manip_object_id].get("grasps", None) + demo_cfg = { + "manip_object_id": manip_object_id, + "traj_key": traj_key, + "traj_path": traj_path, + "goal_offset": goal_offset, + "grasp_idx": grasp_idx, + "grasp_pre": grasp_pre, + "grasp_delta": grasp_delta, + "grasp_path": grasp_path, + } + + # physics configs + physics_key = exp_config.get("physics", "default") + physics_cfg = {} + if physics_key == "default": + physics_cfg["bg_physics"] = default_bg_physics + physics_cfg["obj_physics"] = default_obj_physics + elif physics_key == "viz": + physics_cfg["bg_physics"] = default_bg_physics + physics_cfg["obj_physics"] = viz_obj_physics + else: + print(f"[WARN] Unrecognized physics key '{physics_key}', no physics specified in exp configs.") + physics_cfg["bg_physics"] = None + physics_cfg["obj_physics"] = None + + return { + "key": key, + "scene_cfg": scene_json, + "exp_cfg": exp_config, + "cam_cfg": cam_cfg, + "robot_cfg": robot_cfg, + "demo_cfg": demo_cfg, + "physics_cfg": physics_cfg, + } + + +# Default physics for background and objects +default_bg_physics = { + "mass_props": {"mass": 100.0}, + "rigid_props": {"disable_gravity": True, "kinematic_enabled": True}, + "collision_props": {"collision_enabled": True,}, +} + +default_obj_physics = { + "mass_props": {"mass": 0.2}, + "rigid_props": {"disable_gravity": False, "kinematic_enabled": False}, + "collision_props": {"collision_enabled": True,}, +} + +# For viz +viz_obj_physics = { + "mass_props": {"mass": 0.5}, + "rigid_props": {"disable_gravity": True, "kinematic_enabled": False}, + "collision_props": {"collision_enabled": False,}, +} + +def robot_placement_candidates_v2( + scene_info: dict, + reachability=(0.3, 0.65), + reachability_zcenter_offset=0.3, + robot_aabb=(0.12, 0.12, 0.72), + num_radius_steps=10, + num_angle_steps=36, + num_z_steps=5, + occlusion_clearance=(0.02, 0.02, 0.02), # inflated base AABB for occlusion checks (m) + obj_world_radius=0.02, # small ring near object center (m) + num_obj_rays=8, # # of rays from ring points (>=8) + min_cam_base_separation_deg=45.0, # minimum separation angle wrt camera direction +): + """ + Return robot base poses (position + wxyz rotation) satisfying: + (1) 
Reachability + (2) No overlap with background AABB + (3) Same half-space as camera w.r.t object & min separation angle + (4) No occlusion from camera to object (center ray + ring rays) + """ + object_center = np.array(scene_info["object_center"], dtype=float) + scene_min = np.array(scene_info["scene_min"], dtype=float) + scene_max = np.array(scene_info["scene_max"], dtype=float) + cam_pos = np.array(scene_info["move_to"], dtype=float) + + def segment_intersects_aabb(p0, p1, aabb_min, aabb_max, eps=1e-9): + d = p1 - p0 + t0, t1 = 0.0, 1.0 + for i in range(3): + if abs(d[i]) < eps: + if p0[i] < aabb_min[i] or p0[i] > aabb_max[i]: + return False + else: + inv_d = 1.0 / d[i] + t_near = (aabb_min[i] - p0[i]) * inv_d + t_far = (aabb_max[i] - p0[i]) * inv_d + if t_near > t_far: + t_near, t_far = t_far, t_near + t0 = max(t0, t_near) + t1 = min(t1, t_far) + if t0 > t1: + return False + return True + + def edge_ring_points(obj_center, cam_pos, radius, m): + if m <= 0 or radius <= 0.0: + return np.empty((0, 3), dtype=float) + view = obj_center - cam_pos + n = np.linalg.norm(view) + if n < 1e-9: + return np.empty((0, 3), dtype=float) + u = view / n + tmp = np.array([1.0, 0.0, 0.0], dtype=float) + if abs(np.dot(tmp, u)) > 0.9: + tmp = np.array([0.0, 1.0, 0.0], dtype=float) + e1 = np.cross(u, tmp); e1 /= (np.linalg.norm(e1) + 1e-12) + e2 = np.cross(u, e1); e2 /= (np.linalg.norm(e2) + 1e-12) + phis = np.linspace(0.0, 2.0 * np.pi, num=m, endpoint=False) + ring = [obj_center + radius * (np.cos(phi) * e1 + np.sin(phi) * e2) for phi in phis] + return np.asarray(ring, dtype=float) + + candidates = [] + radius_values = np.linspace(reachability[0], reachability[1], num=num_radius_steps) + angle_values = np.linspace(0, 2 * np.pi, num=num_angle_steps, endpoint=False) + z_min, z_max = 0.0, 2.0 * float(object_center[2]) + z_values = np.linspace(z_min, z_max, num=num_z_steps) + ring_pts = edge_ring_points(object_center, cam_pos, obj_world_radius, num_obj_rays) + + cos_min_sep = np.cos(np.deg2rad(float(min_cam_base_separation_deg))) + + for z in z_values: + reach_center_z = z + reachability_zcenter_offset + for r in radius_values: + for theta in angle_values: + dx = r * np.cos(theta); dy = r * np.sin(theta) + base_pos = np.array([object_center[0] - dx, object_center[1] - dy, z], dtype=float) + + # (1) reachability + reach_dist = np.linalg.norm(object_center - np.array([base_pos[0], base_pos[1], reach_center_z])) + if reach_dist > reachability[1] or reach_dist < reachability[0]: + continue + + # (2) no overlap with background AABB + half = 0.5 * np.array(robot_aabb, dtype=float) + base_min = base_pos - half + base_max = base_pos + half + if np.all(base_max > scene_min) and np.all(base_min < scene_max): + continue + + # (3) same-side + min angle + v_cam = cam_pos - object_center + v_base = base_pos - object_center + n1 = np.linalg.norm(v_cam); n2 = np.linalg.norm(v_base) + if n1 < 1e-9 or n2 < 1e-9: + continue + cos_val = float(np.dot(v_cam, v_base) / (n1 * n2)) + if (cos_val < 0.0) or (cos_val > cos_min_sep): + continue + + # (4a) no occlusion on center ray + extra = np.array(occlusion_clearance, dtype=float) + occ_min = base_min - extra + occ_max = base_max + extra + if segment_intersects_aabb(cam_pos, object_center, occ_min, occ_max): + continue + # (4b) ring rays + if any(segment_intersects_aabb(cam_pos, P, occ_min, occ_max) for P in ring_pts): + continue + + # (5) yaw towards object in XY plane + facing_vec = object_center[:2] - base_pos[:2] + yaw = np.arctan2(facing_vec[1], facing_vec[0]) + quat_wxyz = 
transforms3d.euler.euler2quat(0.0, 0.0, yaw)
+
+                candidates.append({
+                    "position": base_pos.tolist(),
+                    "rotation": [float(x) for x in quat_wxyz],  # (w,x,y,z)
+                    "yaw_deg": float(np.degrees(yaw)),
+                    "base_z": float(z),
+                })
+
+    if not candidates:
+        raise RuntimeError("No valid base found under reach/collision/same-side(min-angle)/occlusion constraints.")
+    return candidates
diff --git a/openreal2sim/simulation/isaaclab-old/sim_utils/transform_utils.py b/openreal2sim/simulation/isaaclab-old/sim_utils/transform_utils.py
new file mode 100755
index 0000000..a8997e9
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab-old/sim_utils/transform_utils.py
@@ -0,0 +1,50 @@
+# transform_utils.py
+import torch
+import numpy as np
+import transforms3d
+
+def grasp_to_world(grasp) -> tuple[np.ndarray, np.ndarray]:
+    """
+    GraspNet -> EE:
+      X_grasp=approach, Y_grasp=width, Z_grasp=thickness
+    We want: Z_EE=approach, Y_EE=-width, X_EE=thickness
+    """
+    Rg = grasp.rotation_matrix.astype(np.float32)
+    delta = np.array([[0, 0, 1],
+                      [0, -1, 0],
+                      [1, 0, 0]], dtype=np.float32)
+    Ree = Rg @ delta
+    q = transforms3d.quaternions.mat2quat(Ree).astype(np.float32)  # wxyz
+    p = grasp.translation.astype(np.float32)
+    return p, q
+
+def grasp_approach_axis_batch(qw_batch: np.ndarray) -> np.ndarray:
+    """
+    Get the approach axis (Z axis) of each env in the world frame.
+    qw_batch: (B,4) quaternion (wxyz)
+    return: (B,3) each env's grasping approach axis (Z axis) in world frame
+    """
+    qw_batch = np.asarray(qw_batch, dtype=np.float32)
+    a_list = []
+    for q in qw_batch:
+        R = transforms3d.quaternions.quat2mat(q)
+        a = R[:, 2]
+        a_list.append((a / (np.linalg.norm(a) + 1e-8)).astype(np.float32))
+    return np.stack(a_list, axis=0)
+
+def pose_to_mat(pos, quat):
+    if torch.is_tensor(pos): pos = pos.cpu().numpy()
+    if torch.is_tensor(quat): quat = quat.cpu().numpy()
+    m = np.eye(4, dtype=np.float32)
+    m[:3, :3] = transforms3d.quaternions.quat2mat(quat)
+    m[:3, 3] = pos
+    return m
+
+def mat_to_pose(mat):
+    pos = mat[:3, 3]
+    quat = transforms3d.quaternions.mat2quat(mat[:3, :3])
+    return pos, quat
+
+def normalize_quaternion(q: np.ndarray) -> np.ndarray:
+    norm = np.linalg.norm(q, axis=1, keepdims=True) + 1e-9
+    return q / norm
diff --git a/openreal2sim/simulation/isaaclab/demo/README.md b/openreal2sim/simulation/isaaclab/demo/README.md
new file mode 100755
index 0000000..b5dee8d
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab/demo/README.md
@@ -0,0 +1,150 @@
+### Data Collection and Generation ###
+This folder is used for collecting manipulation data and augmenting it with spatial transformations.
+
+The demo pipeline consists of three main stages:
+1. **USD Conversion** - Converts scene data to USD format for simulation
+2. **Heuristic Manipulation** - Performs object manipulation using heuristic policies with grasping and motion planning. This creates a new folder under `tasks` and under `h5py` for each key. In the task folder, a task JSON is written and the required scene assets are laid out.
+3. **Randomized Rollout** - Generates randomized scene configurations and collects successful manipulation trajectories. This adds demos to the `tasks` folder, appends trajectory configs to the task config, and adds a new h5py file to the `h5py` folder.
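+
+After the pipeline finishes, a saved task config can be reloaded for a quick sanity check (a minimal sketch: the import path and the `demo_video` key are assumptions based on the layout and workflow described below):
+
+```python
+# Reload and inspect a generated task config (illustrative paths/imports).
+from envs.task_construct import load_task_cfg  # assumed import path; see Code Structure
+
+task = load_task_cfg("tasks/demo_video/task.json")
+print(task.task_key, task.task_type, task.manipulated_oid)
+print("objects:", [o.object_name for o in task.objects])
+print("generated trajectories:", len(task.generated_trajectories))
+```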
+
+### Code Structure ###
+
+```
+demo/
+├── sim_agent.sh                # Main pipeline script that runs all stages
+├── sim_heuristic_manip.py      # Heuristic manipulation policy implementation
+├── sim_randomize_rollout.py    # Randomized rollout for data augmentation
+└── envs/                       # Environment configuration and utilities
+    ├── task_cfg.py             # Task configuration classes (TaskCfg, TrajectoryCfg, etc.)
+    ├── task_construct.py       # Task construction and serialization utilities
+    ├── randomizer.py           # Scene randomization for data augmentation
+    └── running_cfg.py          # Runtime configuration management
+```
+
+### Running Script ###
+Please run `bash openreal2sim/simulation/isaaclab/demo/sim_agent.sh` to collect the demos. This will iterate through the data folder and run the three stages for each input key.
+
+### Running Config Explanation ###
+There are two editable configs for this script:
+
+First, the running device and the headless option are defined in the bash script.
+
+Second, the randomizer config, the environment count, and the total trajectory count are all defined in the `running_cfg.py` file. Please edit that file for your own needs.
+
+
+#### TaskCfg (Main Task Configuration)
+
+The top-level configuration that defines a complete manipulation task:
+
+- **task_key** (str): Unique identifier for the task
+- **task_id** (int): Sequential task ID
+- **task_desc** (List[str]): Description of the task
+- **task_type** (TaskType): Type of task (SIMPLE_PICK, SIMPLE_PICK_PLACE, TARGETTED_PICK_PLACE)
+- **background_cfg** (BackgroundCfg): Background configuration
+- **camera_info** (CameraInfo): Camera intrinsic and extrinsic parameters
+- **manipulated_oid** (int): Object ID of the manipulated object
+- **start_related** (List[int]): List of object IDs related to the start state
+- **end_related** (List[int]): List of object IDs related to the end state
+- **objects** (List[ObjectCfg]): List of all objects in the scene
+- **reference_trajectory** (TrajectoryCfg, optional): Reference trajectory generated by heuristic manipulation
+- **generated_trajectories** (List[TrajectoryCfg]): List of randomized trajectories for data augmentation
+
+#### TrajectoryCfg (Trajectory Configuration)
+
+Defines a specific manipulation trajectory:
+
+- **robot_pose** (List[float]): Robot base pose [x, y, z, qw, qx, qy, qz] in world frame
+- **object_poses** (Dict[int, List[float]]): Initial poses of all objects, keyed by object ID
+- **object_trajectory** (List[List[float]]): Object trajectory as sequence of poses [x, y, z, qw, qx, qy, qz]
+- **final_gripper_close** (bool): Whether gripper should be closed at the end
+- **success_metric** (SuccessMetric): Criteria for task success
+- **pregrasp_pose** (List[float], optional): Pre-grasp end-effector pose in world frame
+- **grasp_pose** (List[float], optional): Grasp end-effector pose in world frame
+- **robot_type** (RobotType, optional): Type of robot (FRANKA or UR5)
+
+#### SuccessMetric (Success Criteria)
+
+Defines how task success is evaluated (see the sketch after this list):
+
+- **success_metric_type** (SuccessMetricType): Type of success metric
+  - `SIMPLE_LIFT`: Object must be lifted by a certain height
+  - `TARGET_POINT`: Object must reach a specific target pose
+  - `TARGET_PLANE`: Object must reach a target plane (height)
+- **final_gripper_close** (bool): Whether gripper should be closed at success
+- **lift_height** (float, optional): Required lift height for SIMPLE_LIFT
+- **ground_value** (float, optional): Target height for TARGET_PLANE
+- **end_pose** (List[float], optional): Target pose [x, y, z, qw, qx, qy, qz] for TARGET_POINT
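+
+For example, the two most common metrics could be declared as follows (a sketch: constructor usage mirrors `randomizer.py`, the import path is an assumption, and the numeric values are illustrative):
+
+```python
+from envs.task_cfg import SuccessMetric, SuccessMetricType  # assumed import path
+
+# Succeed once the object has been lifted by 10 cm with the gripper still closed.
+lift_metric = SuccessMetric(
+    success_metric_type=SuccessMetricType.SIMPLE_LIFT,
+    lift_height=0.10,
+    final_gripper_close=True,
+)
+
+# Succeed once the object reaches a target pose [x, y, z, qw, qx, qy, qz].
+place_metric = SuccessMetric(
+    success_metric_type=SuccessMetricType.TARGET_POINT,
+    final_gripper_close=False,
+    end_pose=[0.3, 0.4, 0.1, 1.0, 0.0, 0.0, 0.0],
+)
+```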
+
+#### ObjectCfg (Object Configuration)
+
+Defines an object in the scene:
+
+- **object_id** (int): Unique object identifier
+- **object_name** (str): Name of the object
+- **mesh_path** (str): Path to object mesh file (GLB format)
+- **usd_path** (str): Path to object USD file
+
+#### BackgroundCfg (Background Configuration)
+
+Defines the background of the scene:
+
+- **background_rgb_path** (str): Path to background RGB image
+- **background_mesh_path** (str): Path to background mesh file (GLB format)
+- **background_usd_path** (str): Path to background USD file
+- **background_point** (List[float]): Background point [x, y, z]
+
+#### CameraInfo (Camera Configuration)
+
+Camera intrinsic and extrinsic parameters:
+
+- **width, height** (float): Image dimensions
+- **fx, fy** (float): Focal lengths
+- **cx, cy** (float): Principal point
+- **camera_opencv_to_world** (List[List[float]]): 4x4 transformation matrix from OpenCV to world frame
+- **camera_position** (List[float]): Camera position [x, y, z]
+- **camera_heading_wxyz** (List[float]): Camera orientation quaternion [w, x, y, z]
+
+#### Configuration Workflow
+
+1. **Initial Task Creation**: `construct_task_config()` creates a `TaskCfg` from scene reconstruction data
+2. **Reference Trajectory**: `add_reference_trajectory()` adds the first successful trajectory from heuristic manipulation
+3. **Data Augmentation**: `add_generated_trajectories()` adds randomized trajectories from the rollout stage
+4. **Serialization**: Task configs are saved as JSON files in `tasks/<task_key>/task.json`
+5. **Loading**: `load_task_cfg()` reconstructs `TaskCfg` objects from JSON files
+
+#### Example Task Configuration Structure
+
+```json
+{
+  "task_key": "demo_video",
+  "task_id": 0,
+  "task_type": "TARGETTED_PICK_PLACE",
+  "manipulated_oid": 2,
+  "objects": [
+    {
+      "object_id": 2,
+      "object_name": "pen",
+      "mesh_path": "tasks/demo_video/object_2.glb",
+      "usd_path": "tasks/demo_video/object_2.usd"
+    }
+  ],
+  "background_cfg": {
+    "background_rgb_path": "tasks/demo_video/bg_rgb.jpg",
+    "background_mesh_path": "tasks/demo_video/background.glb",
+    "background_usd_path": "tasks/demo_video/background.usd",
+    "background_point": [0.0, 0.0, 0.0]
+  },
+  "reference_trajectory": {
+    "robot_pose": [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
+    "object_poses": {"2": [0.1, 0.2, 0.05, 1.0, 0.0, 0.0, 0.0]},
+    "object_trajectory": [[...], [...], ...],
+    "pregrasp_pose": [0.1, 0.2, 0.15, 1.0, 0.0, 0.0, 0.0],
+    "grasp_pose": [0.1, 0.2, 0.1, 1.0, 0.0, 0.0, 0.0],
+    "success_metric": {
+      "success_metric_type": "TARGET_POINT",
+      "end_pose": [0.3, 0.4, 0.1, 1.0, 0.0, 0.0, 0.0],
+      "final_gripper_close": false
+    }
+  },
+  "generated_trajectories": [...]
+} +``` diff --git a/openreal2sim/simulation/isaaclab/demo/envs/randomizer.py b/openreal2sim/simulation/isaaclab/demo/envs/randomizer.py new file mode 100755 index 0000000..67e48be --- /dev/null +++ b/openreal2sim/simulation/isaaclab/demo/envs/randomizer.py @@ -0,0 +1,366 @@ +import numpy as np +import random +from scipy.spatial.transform import Rotation as R +from scipy.spatial.transform import Slerp +from .task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg + +import torch +import transforms3d + +def pose_to_mat(pos, quat): + if torch.is_tensor(pos): pos = pos.cpu().numpy() + if torch.is_tensor(quat): quat = quat.cpu().numpy() + m = np.eye(4, dtype=np.float32) + m[:3, :3] = transforms3d.quaternions.quat2mat(quat) + m[:3, 3] = pos + return m + +def mat_to_pose(mat): + pos = mat[:3, 3] + quat = transforms3d.quaternions.mat2quat(mat[:3, :3]) + return pos, quat + + + +class Randomizer(TaskCfg): + def __init__(self, task_cfg: TaskCfg): + self.task_cfg = task_cfg + + + def generate_randomized_scene_cfg(self, grid_dist: float, grid_num: int, angle_random_range: float, angle_random_num: int, traj_randomize_num:int, scene_randomize_num: int, robot_pose_randomize_range, robot_pose_randomize_angle, robot_pose_randomize_num, fix_end_pose: bool = False): + # Step 1: Generate candidate transforms + + candidate_transforms = [] + for i in range(-grid_num, grid_num): + for j in range(-grid_num, grid_num): + random_angles = np.random.uniform(-angle_random_range, angle_random_range, angle_random_num) + for angle in random_angles: + orig = np.eye(4) + orig[0, 3] = i * grid_dist + orig[1, 3] = j * grid_dist + orig[0, 0] = np.cos(angle) + orig[0, 1] = -np.sin(angle) + orig[1, 0] = np.sin(angle) + orig[1, 1] = np.cos(angle) + candidate_transforms.append(orig) + + # Step 2: Generate traj_randomize_num combinations of (start_pose, end_pose) + traj_pose_pairs = [] + for _ in range(traj_randomize_num): + start_pose = random.choice(candidate_transforms) + if fix_end_pose: + #import pdb; pdb.set_trace() + end_pose = np.eye(4) + else: + if self.task_cfg.task_type == TaskType.SIMPLE_PICK: + end_pose = start_pose.copy() # For SIMPLE_PICK, end same as start + else: + end_pose = random.choice(candidate_transforms) + traj_pose_pairs.append((start_pose, end_pose)) + + # Step 3: Generate scene_randomize_num combinations for other objects + other_object_ids = [obj.object_id for obj in self.task_cfg.objects + if obj.object_id != self.task_cfg.manipulated_oid and obj.object_id not in self.task_cfg.start_related and obj.object_id not in self.task_cfg.end_related] + + scene_poses_combinations = [] + for _ in range(scene_randomize_num): + # Create list of (oid, pose) pairs + other_object_poses = [(oid, random.choice(candidate_transforms)) + for oid in other_object_ids] + scene_poses_combinations.append(other_object_poses) + + # Step 4: Generate robot_pose_randomize_num random robot poses + # robot_pose format: [x, y, z, w, x, y, z] (position + quaternion wxyz) + ## FIXME: hacking + ref_traj = self.task_cfg.reference_trajectory[-1] + assert ref_traj is not None, "Reference trajectory is not found" + ref_robot_pose = np.array(ref_traj.robot_pose) + robot_pose_mat = pose_to_mat(ref_robot_pose[:3], ref_robot_pose[3:7]) + + robot_poses = [] + for _ in range(robot_pose_randomize_num): + # Random translation within range + random_trans = np.random.uniform( + -robot_pose_randomize_range, + robot_pose_randomize_range, + 3 + ) + random_rot = np.random.uniform( + -robot_pose_randomize_angle, + 
robot_pose_randomize_angle, + ) + + rotate_matrix = np.eye(4) + rotate_matrix[:3, :3] = R.from_euler('z', random_rot).as_matrix() + new_robot_pose = robot_pose_mat @ rotate_matrix + new_robot_pose[:3, 3] += random_trans + # Combine position and quaternion [x, y, z, w, x, y, z] + pos, quat = mat_to_pose(new_robot_pose) + robot_pose_7d = np.concatenate([pos, quat]) + robot_poses.append(robot_pose_7d.tolist()) + + # Step 5: Generate trajectories for all combinations + generated_trajectories = [] + ref_traj = self.task_cfg.reference_trajectory[-1] + for start_pose, end_pose in traj_pose_pairs: + for other_object_poses in scene_poses_combinations: + for robot_pose in robot_poses: + # Generate trajectory for manipulated object + if self.task_cfg.task_type == TaskType.SIMPLE_PICK: + new_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + pose_mat = pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7])) + new_traj_mats.append(start_pose @ pose_mat) + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + else: + # Convert reference trajectory to mat format + ref_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + ref_traj_mats.append(pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7]))) + ref_traj_mats = np.array(ref_traj_mats) + + new_traj_mats = self.compute_new_traj(start_pose, end_pose, ref_traj_mats) + new_traj_mats = self.lift_traj(ref_traj_mats, new_traj_mats) + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + #new_traj_7d = lift_traj(new_traj_7d) + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + + # Apply scene randomization to other objects (if needed, update object_poses here) + # other_object_poses is now a list of (oid, pose) pairs + + # Create success metric + + ref_end_pose_mat = pose_to_mat( + np.array(ref_traj.success_metric.end_pose[:3]), + np.array(ref_traj.success_metric.end_pose[3:7]) + ) + end_pose_metric_mat = end_pose @ ref_end_pose_mat + pos, quat = mat_to_pose(end_pose_metric_mat) + end_pose_metric_7d = np.concatenate([pos, quat]).tolist() + + success_metric_type = ref_traj.success_metric.success_metric_type + if success_metric_type == SuccessMetricType.SIMPLE_LIFT: + success_metric = SuccessMetric( + 
success_metric_type=SuccessMetricType.SIMPLE_LIFT, + lift_height=ref_traj.success_metric.lift_height, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_POINT: + # Convert end_pose from 7D to mat, transform, then back to 7D + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_POINT, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_PLANE: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_PLANE, + ground_value=ref_traj.success_metric.ground_value, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + + # Store object poses in 7D format + object_poses = {} + for oid, pose_mat in other_object_poses: + pos, quat = mat_to_pose(pose_mat) + object_poses[oid] = np.concatenate([pos, quat]).tolist() + + # Add start and end related objects + start_pos, start_quat = mat_to_pose(start_pose) + start_pose_7d = np.concatenate([start_pos, start_quat]).tolist() + for oid in self.task_cfg.start_related: + object_poses[oid] = start_pose_7d + + end_pos, end_quat = mat_to_pose(end_pose) + end_pose_7d = np.concatenate([end_pos, end_quat]).tolist() + for oid in self.task_cfg.end_related: + object_poses[oid] = end_pose_7d + + object_poses[self.task_cfg.manipulated_oid] = start_pose_7d + + robot_type = ref_traj.robot_type + generated_trajectories.append(TrajectoryCfg( + robot_pose=robot_pose, + object_poses=object_poses, + object_trajectory=new_traj_7d, + final_gripper_close=ref_traj.final_gripper_close, + pregrasp_pose=pregrasp_pose, + grasp_pose=grasp_pose, + success_metric=success_metric, + robot_type=robot_type + )) + + # Total trajectories = traj_randomize_num * scene_randomize_num * robot_pose_randomize_num + assert len(generated_trajectories) == traj_randomize_num * scene_randomize_num * robot_pose_randomize_num, \ + f"Expected {traj_randomize_num * scene_randomize_num * robot_pose_randomize_num} trajectories, got {len(generated_trajectories)}" + + random.shuffle(generated_trajectories) + return generated_trajectories + + @staticmethod + def compute_new_traj(start_trans: np.ndarray, end_trans: np.ndarray, reference_traj: np.ndarray, + interp_mode: str = 'slerp') -> np.ndarray: + """ + Compute interpolated trajectory using Real2Render2Real's relative shape preservation method. + + This maintains the relative motion pattern of the reference trajectory by normalizing + the progress along the reference and mapping it to the new start and end positions. 
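+
+        For example (illustrative numbers): if the reference x-coordinate moves
+        0.0 -> 0.2 -> 0.4 and the new start/end x are 1.0 and 1.8, the normalized
+        progress values (0, 0.5, 1) map the new trajectory to x = 1.0, 1.4, 1.8,
+        preserving the reference's relative shape.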
+ + Args: + start_trans: 4x4 transformation matrix from original start to new start + end_trans: 4x4 transformation matrix from original end to new end + reference_traj: Reference trajectory in SE(3), shape (N, 4, 4) or (N, 16) + interp_mode: 'linear' or 'slerp' for rotation interpolation (default: 'slerp') + + Returns: + New interpolated trajectory with same shape as reference_traj + """ + # Ensure reference_traj is in (N, 4, 4) format + if reference_traj.ndim == 2 and reference_traj.shape[1] == 16: + reference_traj = reference_traj.reshape(-1, 4, 4) + elif reference_traj.ndim == 3 and reference_traj.shape[1] == 4 and reference_traj.shape[2] == 4: + reference_traj = reference_traj.copy() + else: + raise ValueError(f"Invalid reference_traj shape: {reference_traj.shape}, expected (N, 4, 4) or (N, 16)") + + N = len(reference_traj) + if N == 0: + return reference_traj.copy() + + # Get start and end poses from reference trajectory + ref_start = reference_traj[0].copy() + ref_end = reference_traj[-1].copy() + + # Compute new start and end poses + new_start_mat = start_trans @ ref_start + new_end_mat = end_trans @ ref_end + + # Convert to 7D format using mat_to_pose + ref_traj_7d = [] + for pose_mat in reference_traj: + pos, quat = mat_to_pose(pose_mat) + ref_traj_7d.append(np.concatenate([pos, quat])) + ref_traj_7d = np.array(ref_traj_7d) + + pos_start, quat_start = mat_to_pose(new_start_mat) + pos_end, quat_end = mat_to_pose(new_end_mat) + + # Initialize new trajectory + new_traj_7d = np.zeros_like(ref_traj_7d) + + # Normalize time steps + t = np.linspace(0, 1, N) + + # Split into position and rotation components + pos_orig = ref_traj_7d[:, :3] + quat_orig = ref_traj_7d[:, 3:7] # wxyz format (from transforms3d) + + ref_start_pos, ref_start_quat = mat_to_pose(ref_start) + ref_end_pos, ref_end_quat = mat_to_pose(ref_end) + + # Interpolate positions: preserve relative shape from reference trajectory + for axis in range(3): + ref_range = ref_end_pos[axis] - ref_start_pos[axis] + if np.abs(ref_range) > 1e-10: + # Normalize progress along reference trajectory + normalized_progress = (pos_orig[:, axis] - ref_start_pos[axis]) / ref_range + # Map to new range + new_traj_7d[:, axis] = pos_start[axis] + (pos_end[axis] - pos_start[axis]) * normalized_progress + else: + # If no change in reference, use direct transformation + new_traj_7d[:, axis] = pos_orig[:, axis] + (pos_start[axis] - ref_start_pos[axis]) + + # Interpolate rotations using SLERP + if interp_mode == 'slerp': + # Use scipy Slerp for spherical linear interpolation + # Convert wxyz to xyzw for scipy + quat_start_xyzw = np.array([quat_start[1], quat_start[2], quat_start[3], quat_start[0]]) + quat_end_xyzw = np.array([quat_end[1], quat_end[2], quat_end[3], quat_end[0]]) + + # Create Slerp interpolator + key_rots = R.from_quat([quat_start_xyzw, quat_end_xyzw]) + key_times = [0, 1] + slerp = Slerp(key_times, key_rots) + + # Interpolate for all time steps + interp_rots = slerp(t) + quat_interp_xyzw = interp_rots.as_quat() + + # Convert back to wxyz format + new_traj_7d[:, 3] = quat_interp_xyzw[:, 3] # w + new_traj_7d[:, 4] = quat_interp_xyzw[:, 0] # x + new_traj_7d[:, 5] = quat_interp_xyzw[:, 1] # y + new_traj_7d[:, 6] = quat_interp_xyzw[:, 2] # z + else: # linear + # Linear interpolation (needs normalization) + for i in range(N): + new_traj_7d[i, 3:7] = (1 - t[i]) * quat_start + t[i] * quat_end + new_traj_7d[i, 3:7] /= np.linalg.norm(new_traj_7d[i, 3:7]) + + # Convert back to SE(3) matrices using pose_to_mat + new_traj = [] + for pose_7d in 
new_traj_7d: + pose_mat = pose_to_mat(pose_7d[:3], pose_7d[3:7]) + new_traj.append(pose_mat) + new_traj = np.array(new_traj) + + return new_traj + + + def lift_traj(self, old_traj, new_traj): + T = len(old_traj) + renewed_traj = [] + for t in range(T): + old_pose = old_traj[t] + new_pose = new_traj[t] + old_pose_z = old_pose[2,3] + new_pose_z = new_pose[2,3] + if old_pose_z > new_pose_z: + new_pose[2,3] = old_pose_z + renewed_traj.append(new_pose) + return renewed_traj + + def select_randomized_cfg(self, train_set_size: int): + return random.choice(self.task_cfg.generated_trajectories[train_set_size:]) + + + + diff --git a/openreal2sim/simulation/isaaclab/demo/envs/running_cfg.py b/openreal2sim/simulation/isaaclab/demo/envs/running_cfg.py new file mode 100755 index 0000000..0e72c05 --- /dev/null +++ b/openreal2sim/simulation/isaaclab/demo/envs/running_cfg.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import math +from dataclasses import asdict, dataclass +from typing import Dict + + +@dataclass(frozen=True) +class RandomizerConfig: + """Configuration bundle for `Randomizer.generate_randomized_scene_cfg`.""" + + grid_dist: float = 0.01 + grid_num: int = 5 + angle_random_range: float = math.pi / 10.0 + angle_random_num: int = 10 + traj_randomize_num: int = 20 + scene_randomize_num: int = 20 + robot_pose_randomize_range: float = 0.03 + robot_pose_randomize_angle: float = math.pi / 180.0 + robot_pose_randomize_num: int = 3 + fix_end_pose: bool = True + + def to_kwargs(self) -> Dict[str, float | int]: + """Return a shallow dict representation that can be splatted into the randomizer.""" + return asdict(self) + + +@dataclass(frozen=True) +class HeuristicConfig: + """Configuration for the heuristic manipulation stage.""" + + num_envs: int = 1 + num_trials: int = 10 + grasp_num: int = 1 + robot: str = "franka" + + +@dataclass(frozen=True) +class RandomizeRolloutConfig: + """Rollout-related knobs (e.g., required number of successful trajectories).""" + + total_num: int = 50 + num_envs: int = 1 + + +@dataclass(frozen=True) +class RunningConfig: + """Top-level container bundling all per-scene tunables.""" + + randomizer: RandomizerConfig = RandomizerConfig() + rollout: RandomizeRolloutConfig = RandomizeRolloutConfig() + heuristic: HeuristicConfig = HeuristicConfig() + + +DEFAULT_RUNNING_CONFIG = RunningConfig() + +# Users can override the defaults per-scene key by adding entries here. 
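+# For example (hypothetical key and values):
+#
+#   RUNNING_CONFIGS["my_scene"] = RunningConfig(
+#       randomizer=RandomizerConfig(grid_num=3, traj_randomize_num=10),
+#       rollout=RandomizeRolloutConfig(total_num=100),
+#   )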
+RUNNING_CONFIGS: Dict[str, RunningConfig] = { + "default": DEFAULT_RUNNING_CONFIG, + "demo_video": DEFAULT_RUNNING_CONFIG, +} + + +def get_randomizer_config(key: str) -> RandomizerConfig: + """Return the config associated with `key`, falling back to the default.""" + return RUNNING_CONFIGS.get(key, DEFAULT_RUNNING_CONFIG).randomizer + + +def get_rollout_config(key: str) -> RandomizeRolloutConfig: + """Return rollout config for a scene key.""" + return RUNNING_CONFIGS.get(key, DEFAULT_RUNNING_CONFIG).rollout + + +def get_heuristic_config(key: str) -> HeuristicConfig: + """Return heuristic manipulation config for a scene key.""" + return RUNNING_CONFIGS.get(key, DEFAULT_RUNNING_CONFIG).heuristic diff --git a/openreal2sim/simulation/isaaclab/demo/envs/task_cfg.py b/openreal2sim/simulation/isaaclab/demo/envs/task_cfg.py new file mode 100755 index 0000000..f7cf390 --- /dev/null +++ b/openreal2sim/simulation/isaaclab/demo/envs/task_cfg.py @@ -0,0 +1,94 @@ + +from enum import Enum, auto +from typing import List, Dict, Optional +from dataclasses import dataclass, field + +class SuccessMetricType(Enum): + SIMPLE_LIFT = auto() + TARGET_POINT = auto() + TARGET_PLANE = auto() + +class TaskType(Enum): + SIMPLE_PICK = auto() + SIMPLE_PICK_PLACE = auto() + TARGETTED_PICK_PLACE = auto() + +class RobotType(Enum): + FRANKA = auto() + UR5 = auto() + + +@dataclass +class BackgroundCfg: + background_rgb_path: str + background_mesh_path: str + background_usd_path: str + background_point: List[float] + + + +@dataclass +class ObjectCfg: + object_id: int + object_name: str + mesh_path: str + usd_path: str + + +@dataclass +class SuccessMetric: + success_metric_type: SuccessMetricType + final_gripper_close:bool + lift_height: Optional[float] = None + ground_value: Optional[float] = None + end_pose: Optional[List[float]] = None + + +@dataclass +class TrajectoryCfg: + robot_pose: List[float] # quat wxyz + object_poses: Dict[int, List[float]] # quat + object_trajectory: List[List[float]] # quat + final_gripper_close: bool + success_metric: SuccessMetric + pregrasp_pose: Optional[List[float]] = None ### This is eef in world frame. quat + grasp_pose: Optional[List[float]] = None ### This is eef in world frame. quat + robot_type: Optional[RobotType] = None + init_manip_object_com: Optional[List[float]] = None # com in world frame + + + +@dataclass +class CameraInfo: + width: float + height: float + fx: float + fy: float + cx: float + cy: float + camera_opencv_to_world: List[List[float]] + camera_position: List[float] + camera_heading_wxyz: List[float] + +@dataclass +class TaskCfg: + task_key: str + task_id: int + task_desc: List[str] + task_type: TaskType + background_cfg: BackgroundCfg + camera_info: CameraInfo + manipulated_oid: int + start_related: List[int] + end_related: List[int] + objects: List[ObjectCfg] + reference_trajectory: Optional[List[TrajectoryCfg]] = None + generated_trajectories: Optional[List[TrajectoryCfg]] = None + + + + + + + + diff --git a/openreal2sim/simulation/isaaclab/demo/envs/task_construct.py b/openreal2sim/simulation/isaaclab/demo/envs/task_construct.py new file mode 100755 index 0000000..9d84d63 --- /dev/null +++ b/openreal2sim/simulation/isaaclab/demo/envs/task_construct.py @@ -0,0 +1,244 @@ +### construct the task config from scene dict. 
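+# Expected scene.json layout (illustrative; inferred from the accessors in
+# construct_task_config below):
+#   task_desc; task_type ("simple_pick" | "simple_pick_place" | "targetted_pick_place");
+#   manipulated_oid; start_related; end_related;
+#   background {registered, usd}; background_image; groundplane_in_cam {point};
+#   camera {width, height, fx, fy, cx, cy, camera_opencv_to_world,
+#           camera_position, camera_heading_wxyz};
+#   objects {oid: {name, optimized, usd}}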
+ +import json +import numpy as np +from enum import Enum +import shutil +from pathlib import Path +import os +import sys +from typing import List +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +from .task_cfg import TaskCfg, TaskType, ObjectCfg, CameraInfo, BackgroundCfg, TrajectoryCfg, SuccessMetric, SuccessMetricType, RobotType + +def get_next_id(folder: Path) -> int: + if not folder.exists(): + os.makedirs(folder, exist_ok=True) + return 0 + subfolders = [f for f in folder.iterdir() if f.is_dir()] + task_num = len(subfolders) + return task_num + +def get_task_cfg(key: str, base_folder: Path) -> TaskCfg: + json_path = base_folder / "task.json" + return load_task_cfg(json_path) + +def construct_task_config(key, scene_dict: dict, base_folder: Path): + task_key = key + task_id = get_next_id(base_folder) + task_desc = scene_dict["task_desc"] + base_folder = base_folder / key + if base_folder.exists(): + shutil.rmtree(base_folder) + base_folder.mkdir(parents=True, exist_ok=True) # Create directory before copying files + background_mesh_path = scene_dict["background"]["registered"] + background_usd_path = scene_dict["background"]["usd"] + shutil.copy(background_mesh_path, base_folder / "background.glb") + shutil.copy(background_usd_path, base_folder / "background.usd") + background_mesh_path = base_folder / "background.glb" + background_usd_path = base_folder / "background.usd" + background_rgb_path = scene_dict["background_image"] + shutil.copy(background_rgb_path, base_folder / "bg_rgb.jpg") + background_rgb_path = base_folder / "bg_rgb.jpg" + background_point = scene_dict["groundplane_in_cam"]["point"] + background_cfg = BackgroundCfg(str(background_rgb_path), str(background_mesh_path), str(background_usd_path), background_point) + width = scene_dict["camera"]["width"] + height = scene_dict["camera"]["height"] + fx = scene_dict["camera"]["fx"] + fy = scene_dict["camera"]["fy"] + cx = scene_dict["camera"]["cx"] + cy = scene_dict["camera"]["cy"] + camera_opencv_to_world = scene_dict["camera"]["camera_opencv_to_world"] + camera_position = scene_dict["camera"]["camera_position"] + camera_heading_wxyz = scene_dict["camera"]["camera_heading_wxyz"] + camera_info = CameraInfo(width, height, fx, fy, cx, cy, camera_opencv_to_world, camera_position, camera_heading_wxyz) + + objects = [] + for oid, obj in scene_dict["objects"].items(): + object_id = oid + object_name = obj["name"] + mesh_path = obj["optimized"] + shutil.copy(mesh_path, base_folder / f"object_{object_id}.glb") + mesh_path = base_folder / f"object_{object_id}.glb" + usd_path = obj['usd'] + + shutil.copy(usd_path, base_folder / Path(usd_path).name) + + cfg_path = Path(usd_path).parent / "config.yaml" + asset_hash_path = Path(usd_path).parent / ".asset_hash" + usd_path = base_folder / Path(usd_path).name + shutil.copy(cfg_path, base_folder / "config.yaml") + shutil.copy(asset_hash_path, base_folder / ".asset_hash") + object_cfg = ObjectCfg(object_id, object_name, str(mesh_path), str(usd_path)) + objects.append(object_cfg) + + + manipulated_oid = scene_dict["manipulated_oid"] + start_related = scene_dict["start_related"] + end_related = scene_dict["end_related"] + + if scene_dict["task_type"] == "targetted_pick_place": + task_type = TaskType.TARGETTED_PICK_PLACE + elif scene_dict["task_type"] == "simple_pick_place": + task_type = TaskType.SIMPLE_PICK_PLACE + elif scene_dict["task_type"] == "simple_pick": + task_type = TaskType.SIMPLE_PICK + else: + raise ValueError(f"Invalid task type: 
{scene_dict['task_type']}")
+    task_config = TaskCfg(task_key, task_id, task_desc, task_type, background_cfg, camera_info, manipulated_oid, start_related, end_related, objects)
+    json_path = base_folder / "task.json"
+    with open(json_path, "w") as f:
+        json.dump(serialize_task_cfg(task_config), f)
+    return task_config, base_folder
+
+
+def serialize_task_cfg(task_cfg):
+    """
+    Serialize TaskCfg and all nested fields (including numpy arrays) into pure Python dict/list/primitive types,
+    so it can be safely saved as JSON.
+    """
+
+    def serialize(obj):
+        # Handle None
+        if obj is None:
+            return None
+        # Handle basic types
+        elif isinstance(obj, (int, float, str, bool)):
+            return obj
+        # Handle numpy array
+        elif isinstance(obj, np.ndarray):
+            return obj.tolist()
+        # Handle enum (stored by name, e.g. "TARGET_POINT")
+        elif isinstance(obj, Enum):
+            return obj.name
+        # Handle dict
+        elif isinstance(obj, dict):
+            return {serialize(k): serialize(v) for k, v in obj.items()}
+        # Handle list/tuple
+        elif isinstance(obj, (list, tuple)):
+            return [serialize(i) for i in obj]
+        # Handle dataclass/object with __dict__
+        elif hasattr(obj, '__dict__'):
+            return {key: serialize(value) for key, value in obj.__dict__.items()}
+        # Handle class with __slots__
+        elif hasattr(obj, '__slots__'):
+            return {slot: serialize(getattr(obj, slot)) for slot in obj.__slots__}
+        # Fallback (e.g. Path objects)
+        elif hasattr(obj, '__str__'):
+            return str(obj)
+        else:
+            raise TypeError(f"Cannot serialize object of type {type(obj)}: {repr(obj)}")
+
+    return serialize(task_cfg)
+
+
+def add_reference_trajectory(task_cfg: TaskCfg, reference_trajectory: List[TrajectoryCfg], base_folder: Path):
+    task_cfg.reference_trajectory = reference_trajectory
+    json_path = base_folder / "task.json"
+    with open(json_path, "w") as f:
+        json.dump(serialize_task_cfg(task_cfg), f)
+    return task_cfg
+
+
+def add_generated_trajectories(task_cfg: TaskCfg, generated_trajectories: List[TrajectoryCfg], base_folder: Path):
+    task_cfg.generated_trajectories = generated_trajectories
+    json_path = base_folder / "task.json"
+    with open(json_path, "w") as f:
+        json.dump(serialize_task_cfg(task_cfg), f)
+    return task_cfg
+
+
+def load_task_cfg(json_path: Path) -> TaskCfg:
+    """
+    Load a TaskCfg from the given JSON file path and construct a TaskCfg instance.
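+
+    Typical round trip (illustrative):
+        task_cfg, folder = construct_task_config(key, scene_dict, Path("tasks"))
+        add_reference_trajectory(task_cfg, trajectory_cfg_list, folder)
+        task_cfg = load_task_cfg(folder / "task.json")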
+ """ + with open(json_path, "r") as f: + cfg_dict = json.load(f) + + # Handle all fields and reconstruct proper datatypes + # Helper to reconstruct enums + def parse_enum(enum_cls, val): + if isinstance(val, enum_cls): + return val + elif isinstance(val, str): + return enum_cls[val] + else: + raise ValueError(f"Unknown value {val} for enum {enum_cls}") + + # Parse SuccessMetric(s) + def parse_success_metric(metric_dict): + return SuccessMetric( + success_metric_type=parse_enum(SuccessMetricType, metric_dict["success_metric_type"]), + final_gripper_close=metric_dict["final_gripper_close"], + lift_height=metric_dict.get("lift_height", None), + ground_value=metric_dict.get("ground_value", None), + end_pose=metric_dict.get("end_pose", None) + ) + + # Parse TrajectoryCfg(s) + def parse_traj_cfg(traj_dict): + return TrajectoryCfg( + robot_pose=np.array(traj_dict["robot_pose"], dtype=np.float32).tolist(), + object_poses={oid: np.array(pose, dtype=np.float32).tolist() for oid, pose in traj_dict["object_poses"].items()}, + object_trajectory=[np.array(m, dtype=np.float32).tolist() for m in traj_dict["object_trajectory"]], + final_gripper_close=traj_dict.get("final_gripper_close", None), + success_metric=parse_success_metric(traj_dict.get("success_metric", None)), + pregrasp_pose=traj_dict.get("pregrasp_pose", None), + grasp_pose=traj_dict.get("grasp_pose", None), + robot_type=parse_enum(RobotType, traj_dict.get("robot_type", None)) + ) + + def parse_camera_info(camera_dict): + return CameraInfo( + width=camera_dict["width"], + height=camera_dict["height"], + fx=camera_dict["fx"], + fy=camera_dict["fy"], + cx=camera_dict["cx"], + cy=camera_dict["cy"], + camera_opencv_to_world=np.array(camera_dict["camera_opencv_to_world"], dtype=np.float32).tolist(), + camera_position=np.array(camera_dict["camera_position"], dtype=np.float32).tolist(), + camera_heading_wxyz=np.array(camera_dict["camera_heading_wxyz"], dtype=np.float32).tolist(), + ) + def parse_object_cfg(object_dict): + return ObjectCfg( + object_id=object_dict["object_id"], + object_name=object_dict["object_name"], + mesh_path=object_dict["mesh_path"], + usd_path=object_dict["usd_path"] + ) + def parse_background_cfg(background_dict): + return BackgroundCfg( + background_rgb_path=background_dict["background_rgb_path"], + background_mesh_path=background_dict["background_mesh_path"], + background_usd_path=background_dict["background_usd_path"], + background_point=np.array(background_dict["background_point"], dtype=np.float32).tolist() + ) + # Compose TaskCfg + task_cfg = TaskCfg( + task_id=cfg_dict["task_id"], + task_desc=cfg_dict["task_desc"], + task_key=cfg_dict["task_key"], + task_type=parse_enum(TaskType, cfg_dict["task_type"]), + background_cfg=parse_background_cfg(cfg_dict["background_cfg"]), + camera_info=parse_camera_info(cfg_dict["camera_info"]), + manipulated_oid=cfg_dict["manipulated_oid"], + start_related=cfg_dict["start_related"], + end_related=cfg_dict["end_related"], + objects=[parse_object_cfg(obj) for obj in cfg_dict["objects"]], + reference_trajectory=[ + parse_traj_cfg(traj) for traj in (cfg_dict.get("reference_trajectory") or []) + ], + generated_trajectories=[ + parse_traj_cfg(traj) for traj in (cfg_dict.get("generated_trajectories") or []) + ] + ) + + return task_cfg \ No newline at end of file diff --git a/openreal2sim/simulation/isaaclab/demo/sim_agent.sh b/openreal2sim/simulation/isaaclab/demo/sim_agent.sh new file mode 100755 index 0000000..850fd51 --- /dev/null +++ b/openreal2sim/simulation/isaaclab/demo/sim_agent.sh @@ 
-0,0 +1,158 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd /app && pwd)" + +# Default parameters +gpu_id="1" +HEADLESS="--headless" +CONFIG_PATH="${ROOT_DIR}/config/config.yaml" +declare -a PIPELINE=() + +usage() { + cat <<'EOF' +Usage: sim_agent.sh [options] + +Options: + --config Path to config.yaml file (default: /app/config/config.yaml) + --stage Stage to run (can be specified multiple times) + Available stages: + - usd_conversion + - sim_heuristic_manip + - sim_randomize_rollout + If not specified, runs all stages in order + --no-headless Disable headless mode (passes through to AppLauncher) + -h, --help Show this message and exit + +The script loads keys from config.yaml and runs for each key: + 1. USD conversion (`isaaclab/sim_preprocess/usd_conversion.py`) + 2. Heuristic manipulation (`isaaclab/sim_heuristic_manip.py`) + 3. Randomized rollout (`isaaclab/sim_randomize_rollout.py`) +EOF +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --config) + CONFIG_PATH="${2:?Missing value for --config}" + shift 2 + ;; + --stage) + STAGE="${2:?Missing value for --stage}" + # Validate stage + case "${STAGE}" in + usd_conversion|sim_heuristic_manip|sim_randomize_rollout) + PIPELINE+=("${STAGE}") + ;; + *) + echo "[ERR] Invalid stage: ${STAGE}" >&2 + echo "[ERR] Valid stages: usd_conversion, sim_heuristic_manip, sim_randomize_rollout" >&2 + exit 1 + ;; + esac + shift 2 + ;; + --no-headless) + HEADLESS="" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "[ERR] Unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac +done + +# Load keys from config.yaml +if [[ ! -f "${CONFIG_PATH}" ]]; then + echo "[ERR] Config file not found: ${CONFIG_PATH}" >&2 + exit 1 +fi + +# Extract keys from YAML using Python (more reliable than parsing YAML in bash) +KEYS=($(python3 -c " +import yaml +import sys +try: + with open('${CONFIG_PATH}', 'r') as f: + cfg = yaml.safe_load(f) + keys = cfg.get('keys', []) + if not keys: + print('[ERR] No keys found in config.yaml', file=sys.stderr) + sys.exit(1) + print(' '.join(keys)) +except Exception as e: + print(f'[ERR] Failed to load config: {e}', file=sys.stderr) + sys.exit(1) +")) + +if [[ ${#KEYS[@]} -eq 0 ]]; then + echo "[ERR] No keys found in config file: ${CONFIG_PATH}" >&2 + exit 1 +fi + +echo "[INFO] Loaded ${#KEYS[@]} key(s) from ${CONFIG_PATH}: ${KEYS[*]}" + +# Determine which stages to run +export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}" +cd "${ROOT_DIR}" + +# If no stages specified, run all stages in order +if [[ ${#PIPELINE[@]} -eq 0 ]]; then + PIPELINE=("usd_conversion" "sim_heuristic_manip" "sim_randomize_rollout") + echo "[INFO] No stages specified, running all stages: ${PIPELINE[*]}" +else + echo "[INFO] Running specified stages: ${PIPELINE[*]}" +fi + +HEADLESS_ARGS=() +if [[ -n "${HEADLESS}" ]]; then + HEADLESS_ARGS+=("${HEADLESS}") +fi + +run_stage() { + local stage="$1" + local key="$2" + export CUDA_VISIBLE_DEVICES="${gpu_id}" + echo "==============================" + echo "[RUN] Stage: ${stage} | Key: ${key}" + echo "==============================" + + case "${stage}" in + usd_conversion) + python openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py "${HEADLESS_ARGS[@]}" + ;; + sim_heuristic_manip) + echo "Setting CUDA_VISIBLE_DEVICES to ${CUDA_VISIBLE_DEVICES}" + python openreal2sim/simulation/isaaclab/demo/sim_heuristic_manip.py \ + --key "${key}" \ + "${HEADLESS_ARGS[@]}" + ;; + sim_randomize_rollout) + python 
openreal2sim/simulation/isaaclab/demo/sim_randomize_rollout.py \ + --key "${key}" \ + "${HEADLESS_ARGS[@]}" + ;; + *) + echo "[ERR] Unsupported stage '${stage}'" >&2 + exit 1 + ;; + esac +} + +for k in "${KEYS[@]}"; do + echo "########## Processing key: ${k} ##########" + for stage in "${PIPELINE[@]}"; do + run_stage "${stage}" "${k}" + done +done + +echo "[DONE] Processed keys: ${KEYS[*]}" + diff --git a/openreal2sim/simulation/isaaclab/demo/sim_heuristic_manip.py b/openreal2sim/simulation/isaaclab/demo/sim_heuristic_manip.py new file mode 100755 index 0000000..af5ef86 --- /dev/null +++ b/openreal2sim/simulation/isaaclab/demo/sim_heuristic_manip.py @@ -0,0 +1,962 @@ +""" +Heuristic manipulation policy in Isaac Lab simulation. +Using grasping and motion planning to perform object manipulation tasks. +""" +from __future__ import annotations + +# ─────────── AppLauncher ─────────── +import argparse, os, json, random, transforms3d, typing +from typing import Optional +from pathlib import Path +import numpy as np +import torch +import yaml +from isaaclab.app import AppLauncher +import sys +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) +from envs.task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg, RobotType +from envs.task_construct import construct_task_config, add_reference_trajectory, load_task_cfg, get_task_cfg +from envs.running_cfg import get_heuristic_config + + +# ─────────── CLI ─────────── +parser = argparse.ArgumentParser("sim_policy") +parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() +args_cli.enable_cameras = True +args_cli.headless = True # headless mode for batch execution +app_launcher = AppLauncher(vars(args_cli)) +simulation_app = app_launcher.app + +# ─────────── Runtime imports ─────────── +import isaaclab.sim as sim_utils +from isaaclab.utils.math import subtract_frame_transforms +import sys + +from sim_utils.grasp_group_utils import GraspGroup + + +# ─────────── Simulation environments ─────────── +from sim_base import BaseSimulator, get_next_demo_id +from sim_env_factory import make_env + +from sim_utils.transform_utils import pose_to_mat, mat_to_pose, grasp_to_world, grasp_approach_axis_batch +from sim_utils.sim_utils import load_sim_parameters + + + + +# ──────────────────────────── Heuristic Manipulation ──────────────────────────── +class HeuristicManipulation(BaseSimulator): + """ + Physical trial-and-error grasping with approach-axis perturbation: + • Multiple grasp proposals executed in parallel; + • Every attempt does reset → pre → grasp → close → lift → check; + • Early stops when any env succeeds; then re-exec for logging. 
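+
+    Reads sim_cfgs["robot_cfg"]["robot_pose"], sim_cfgs["cam_cfg"], and the
+    sim_cfgs["demo_cfg"] keys final_gripper_closed, manip_object_id, traj_path,
+    goal_offset, grasp_path, grasp_idx, grasp_pre, grasp_delta, task_type
+    (see __init__ below).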
+ """ + def __init__(self, sim, scene, sim_cfgs: dict, args, out_dir: Path, img_folder: str, data_dir: Path, num_trials: int, grasp_num: int, robot: str): + robot_pose = torch.tensor( + sim_cfgs["robot_cfg"]["robot_pose"], + dtype=torch.float32, + device=sim.device + ) + super().__init__( + sim=sim, sim_cfgs=sim_cfgs, scene=scene, args=args_cli, + robot_pose=robot_pose, cam_dict=sim_cfgs["cam_cfg"], + out_dir=out_dir, img_folder=img_folder, data_dir = data_dir, + enable_motion_planning=True, + set_physics_props=True, debug_level=0, + ) + self.final_gripper_closed = sim_cfgs["demo_cfg"]["final_gripper_closed"] + self.selected_object_id = sim_cfgs["demo_cfg"]["manip_object_id"] + self.traj_path = sim_cfgs["demo_cfg"]["traj_path"] + self.goal_offset = [0, 0, sim_cfgs["demo_cfg"]["goal_offset"]] + self.grasp_path = sim_cfgs["demo_cfg"]["grasp_path"] + self.grasp_idx = sim_cfgs["demo_cfg"]["grasp_idx"] + self.grasp_pre = sim_cfgs["demo_cfg"]["grasp_pre"] + self.grasp_delta = sim_cfgs["demo_cfg"]["grasp_delta"] + self.task_type = sim_cfgs["demo_cfg"]["task_type"] + self.robot_type = robot + self.load_obj_goal_traj() + self.std = 0 + self.grasp_round = 0 + self.grasp_num = grasp_num + self.num_trials = num_trials + self.trial_num = 0 + self.grasp_poses = [] + self.pregrasp_poses = [] + self.end_object_poses = [] + self.robot_poses = [] + + + def load_obj_goal_traj(self): + """ + Load the relative trajectory Δ_w (T,4,4) and precompute the absolute + object goal trajectory for each env using the *actual current* object pose + in the scene as T_obj_init (not env_origin). + T_obj_goal[t] = Δ_w[t] @ T_obj_init + + Sets: + self.obj_rel_traj : np.ndarray or None, shape (T,4,4) + self.obj_goal_traj_w: np.ndarray or None, shape (B,T,4,4) + """ + # —— 1) Load Δ_w —— + rel = np.load(self.traj_path).astype(np.float32) + self.obj_rel_traj = rel[1:, :, :] # (T,4,4) + + self.reset() + + # —— 2) Read current object initial pose per env as T_obj_init —— + B = self.scene.num_envs + # obj_state = self.object_prim.data.root_com_state_w[:, :7] # [B,7], pos(3)+quat(wxyz)(4) + obj_state = self.object_prim.data.root_state_w[:, :7] # [B,7], pos(3)+quat(wxyz)(4) + self.show_goal(obj_state[:, :3], obj_state[:, 3:7]) + + obj_state_np = obj_state.detach().cpu().numpy().astype(np.float32) + offset_np = np.asarray(self.goal_offset, dtype=np.float32).reshape(3) + #obj_state_np[:, :3] += offset_np # raise a bit to avoid collision + + # Note: here the relative traj Δ_w is defined in world frame with origin (0,0,0), + # Hence, we need to normalize it to each env's origin frame. 
+ origins = self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + + obj_state_np[:, :3] -= origins # normalize to env origin frame + + # —— 3) Precompute absolute object goals for all envs —— + T = rel.shape[0] + obj_goal = np.zeros((B, T, 4, 4), dtype=np.float32) + for b in range(B): + T_init = pose_to_mat(obj_state_np[b, :3], obj_state_np[b, 3:7]) # (4,4) + for t in range(T): + goal = rel[t] @ T_init + goal[:3, 3] += origins[b] # back to world frame + goal[:3, 3] += offset_np + obj_goal[b, t] = goal + + self.obj_goal_traj_w = obj_goal # [B, T, 4, 4] + + def follow_object_goals(self, start_joint_pos, sample_step=1, recalibrate_interval = 1, visualize=True): + """ + follow precompute object absolute trajectory: self.obj_goal_traj_w: + T_obj_goal[t] = Δ_w[t] @ T_obj_init + + EE-object transform is fixed: + T_ee_goal[t] = T_obj_goal[t] @ (T_obj_grasp^{-1} @ T_ee_grasp) + Here T_obj_grasp / T_ee_grasp is the transform at the moment of grasping. + """ + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + t_iter = list(range(0, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + ee_pos_initial = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3] + obj_pos_initial = self.object_prim.data.root_com_pos_w[:, 0:3] + initial_grasp_dist = torch.norm(ee_pos_initial - obj_pos_initial, dim=1) # [B] + self.initial_grasp_dist = initial_grasp_dist + + for t in t_iter: + if recalibrate_interval> 0 and t % recalibrate_interval == 0: + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + T_ee_in_obj = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_in_obj.append((np.linalg.inv(T_obj_w) @ T_ee_w).astype(np.float32)) + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] # (4,4) + T_ee_goal = T_obj_goal @ T_ee_in_obj[b] # (4,4) + pos_b, quat_b = mat_to_pose(T_ee_goal) + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = torch.as_tensor(np.stack(goal_quat_list), dtype=torch.float32, device=self.sim.device) + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False) + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + + is_success = self.holding_is_success() + success_ids = torch.where(is_success)[0] + joint_pos = self.wait(gripper_open=not self.final_gripper_closed, steps=30) + return joint_pos, success_ids + + ## FIXME: This should have some problem. But it works. 
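+    # (follow_object_centers below tracks only the object's translation: it offsets
+    # the current EE pose by (T_obj_goal - T_obj_current) via elementwise matrix
+    # addition, which is not a rigid-body composition -- likely the issue flagged above.)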
+ def follow_object_centers(self, start_joint_pos, sample_step=1, visualize=True, recalibrate_interval=1): + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + ee_pos_initial = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3] + obj_pos_initial = self.object_prim.data.root_com_pos_w[:, 0:3] + initial_grasp_dist = torch.norm(ee_pos_initial - obj_pos_initial, dim=1) # [B] + self.initial_grasp_dist = initial_grasp_dist + t_iter = list(range(0, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + for t in t_iter: + if recalibrate_interval> 0 and t % recalibrate_interval == 0: + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + T_ee_ws = [] + T_obj_ws = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_ws.append(T_ee_w) + T_obj_ws.append(T_obj_w) + print(f"[INFO] recalibrated at step {t}/{T}") + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] # (4,4) + trans_offset = T_obj_goal - T_obj_ws[b] + T_ee_goal = T_ee_ws[b] + trans_offset + pos_b, quat_b = mat_to_pose(T_ee_goal) + + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 3:7] + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False) + print('[INFO] goal pose', obj_goal_all[:, t], 'current obj pose', self.object_prim.data.root_state_w[:, :3]) + print('[INFO]current ee obj trans diff', self.object_prim.data.root_state_w[:, :3] - self.robot.data.root_state_w[:, :3]) + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + is_success = self.holding_is_success(position_threshold = 0.10) + print('[INFO] last obj goal', obj_goal_all[:, -1]) + print('[INFO] last obj pos', self.object_prim.data.root_state_w[:, :3]) + joint_pos = self.wait(gripper_open=not self.final_gripper_closed, steps=30) + return joint_pos, torch.where(is_success)[0] + + + + + def viz_object_goals(self, sample_step=1, hold_steps=20): + self.reset() + self.wait(gripper_open=True, steps=10) + B = self.scene.num_envs + env_ids = torch.arange(B, device=self.object_prim.device, dtype=torch.long) + goals = self.obj_goal_traj_w + t_iter = list(range(0, goals.shape[1], sample_step)) + t_iter = t_iter + [goals.shape[1]-1] if t_iter[-1] != goals.shape[1]-1 else t_iter + for t in t_iter: + print(f"[INFO] viz object goal step {t}/{goals.shape[1]}") + pos_list, quat_list = [], [] + for b in range(B): + pos, quat = mat_to_pose(goals[b, t]) + pos_list.append(pos.astype(np.float32)) + quat_list.append(quat.astype(np.float32)) + pose = self.object_prim.data.root_state_w[:, :7] + # pose = self.object_prim.data.root_com_state_w[:, :7] + pose[:, :3] = torch.tensor(np.stack(pos_list), dtype=torch.float32, device=pose.device) + pose[:, 3:7] = 
torch.tensor(np.stack(quat_list), dtype=torch.float32, device=pose.device) + self.show_goal(pose[:, :3], pose[:, 3:7]) + + for _ in range(hold_steps): + self.object_prim.write_root_pose_to_sim(pose, env_ids=env_ids) + self.object_prim.write_data_to_sim() + self.step() + + # ---------- Helpers ---------- + def _to_base(self, pos_w: np.ndarray | torch.Tensor, quat_w: np.ndarray | torch.Tensor): + """World → robot base frame for all envs.""" + root = self.robot.data.root_state_w[:, 0:7] # [B,7] + p_w, q_w = self._ensure_batch_pose(pos_w, quat_w) + pb, qb = subtract_frame_transforms( + root[:, 0:3], root[:, 3:7], p_w, q_w + ) + return pb, qb # [B,3], [B,4] + + # ---------- Batched execution & lift-check ---------- + def execute_and_lift_once_batch(self, info: dict, lift_height=0.12) -> tuple[np.ndarray, np.ndarray]: + """ + Reset → pre → grasp → close → lift → hold; return (success[B], score[B]). + """ + B = self.scene.num_envs + self.reset() + + # open gripper buffer + jp = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + self.wait(gripper_open=True, steps=4) + + # pre-grasp + jp, success = self.move_to(info["pre_p_b"], info["pre_q_b"], gripper_open=True) + if torch.any(success==False): return np.zeros(B, bool), np.zeros(B, np.float32) + jp = self.wait(gripper_open=True, steps=3) + + # grasp + jp, success = self.move_to(info["p_b"], info["q_b"], gripper_open=True) + if torch.any(success==False): return np.zeros(B, bool), np.zeros(B, np.float32) + jp = self.wait(gripper_open=True, steps=2) + + # close + jp = self.wait(gripper_open=False, steps=6) + + # initial heights + obj0 = self.object_prim.data.root_com_pos_w[:, 0:3] # [B,3] + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + ee_p0 = ee_w[:, :3] + robot_ee_z0 = ee_p0[:, 2].clone() + obj_z0 = obj0[:, 2].clone() + print(f"[INFO] mean object z0={obj_z0.mean().item():.3f} m, mean EE z0={robot_ee_z0.mean().item():.3f} m") + + # lift: keep orientation, add height + ee_q = ee_w[:, 3:7] + target_p = ee_p0.clone() + target_p[:, 2] += lift_height + + root = self.robot.data.root_state_w[:, 0:7] + p_lift_b, q_lift_b = subtract_frame_transforms( + root[:, 0:3], root[:, 3:7], + target_p, ee_q + ) + jp, success = self.move_to(p_lift_b, q_lift_b, gripper_open=False) + if torch.any(success==False): return np.zeros(B, bool), np.zeros(B, np.float32) + jp = self.wait(gripper_open=False, steps=8) + + # final heights + obj1 = self.object_prim.data.root_com_pos_w[:, 0:3] + ee_w1 = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] + robot_ee_z1 = ee_w1[:, 2] + obj_z1 = obj1[:, 2] + print(f"[INFO] mean object z1={obj_z1.mean().item():.3f} m, mean EE z1={robot_ee_z1.mean().item():.3f} m") + + # lifted if EE and object rise similarly (tight coupling) + ee_diff = robot_ee_z1 - robot_ee_z0 + obj_diff = obj_z1 - obj_z0 + lifted = (torch.abs(ee_diff - obj_diff) <= 0.03) & \ + (torch.abs(ee_diff) >= 0.5 * lift_height) & \ + (torch.abs(obj_diff) >= 0.5 * lift_height) # [B] bool + + score = torch.zeros_like(ee_diff) + score[lifted] = 1000.0 + return lifted.detach().cpu().numpy().astype(bool), score.detach().cpu().numpy().astype(np.float32) + + def lift_up(self, height=0.12, gripper_open=False, steps=8): + """ + Lift up by a certain height (m) from current EE pose. 
+ """ + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] + target_p = ee_w[:, :3].clone() + target_p[:, 2] += height + + root = self.robot.data.root_state_w[:, 0:7] + p_lift_b, q_lift_b = subtract_frame_transforms( + root[:, 0:3], root[:, 3:7], + target_p, ee_w[:, 3:7] + ) # [B,3], [B,4] + jp, success = self.move_to(p_lift_b, q_lift_b, gripper_open=gripper_open) + jp = self.wait(gripper_open=gripper_open, steps=steps) + return jp + + def goal_is_success(self, position_threshold: float = 0.10) -> torch.Tensor: + """ + Verify if the manipulation task succeeded by checking: + 1. Object is at Goal (Distance < 10cm) + """ + current_obj_pose = self.object_prim.data.root_state_w[:, :3] + final_goal_pose = self.obj_goal_traj_w[:, -1, :, :] # [B, 4, 4] + obj_success = torch.norm(current_obj_pose - final_goal_pose, dim=1) <= position_threshold + return obj_success + + def holding_is_success(self) -> torch.Tensor: + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] + obj_w = self.object_prim.data.root_com_pos_w[:, 0:3] + dist = torch.norm(obj_w[:, :3] - ee_w[:, :3], dim=1) # [B] + print(f"[INFO] holding dist: {dist.mean().item():.3f} m") + + return (dist < 0.15).to(torch.bool) + + def is_success(self, position_threshold: float = 0.10, gripper_threshold: float = 0.10, holding_threshold: float = 0.02) -> torch.Tensor: + """ + Verify if the manipulation task succeeded by checking: + 1. Object is at Goal (Distance < 10cm) + 2. Gripper is at Goal (Distance < 10cm) - Explicit check using T_ee_in_obj + 3. Object is in Gripper (Deviation < 2cm) + + Args: + position_threshold: Distance threshold for Object-Goal check (default: 0.10m = 10cm) + skip_envs: Boolean array [B] indicating which envs to skip from verification + + Returns: + torch.Tensor: Boolean tensor [B] indicating success for each environment + """ + B = self.scene.num_envs + # --- 1. Object Goal Check --- + final_goal_matrices = self.obj_goal_traj_w[:, -1, :, :] # [B, 4, 4] + goal_positions_np = final_goal_matrices[:, :3, 3] + goal_positions = torch.tensor(goal_positions_np, dtype=torch.float32, device=self.sim.device) + + # Current Root and COM + current_root_state = self.object_prim.data.root_state_w + current_root_pos = current_root_state[:, :3] + current_root_quat = current_root_state[:, 3:7] + current_com_pos = self.object_prim.data.root_com_pos_w[:, :3] + + # Calculate Goal COM positions + # The COM offset is constant in the object's local frame. 
+ # We need to: (1) Get the current COM offset in local frame, (2) Apply to goal pose + goal_com_positions_list = [] + for b in range(B): + # Current state + root_pos_cur = current_root_pos[b].cpu().numpy() + root_quat_cur = current_root_quat[b].cpu().numpy() # wxyz format + com_pos_cur = current_com_pos[b].cpu().numpy() + + # COM offset in world frame + com_offset_world = com_pos_cur - root_pos_cur # [3] + + # Convert COM offset to object's local frame + # R_cur^T @ com_offset_world gives the offset in local coords (assuming no scale) + R_cur = transforms3d.quaternions.quat2mat(root_quat_cur) # Convert wxyz to rotation matrix + com_offset_local = R_cur.T @ com_offset_world # Rotate to local frame + + # Goal state + T_goal_root = final_goal_matrices[b] # [4, 4] numpy array + goal_root_pos = T_goal_root[:3, 3] + R_goal = T_goal_root[:3, :3] + + # Apply local offset to goal pose + # goal_com_pos = goal_root_pos + R_goal @ com_offset_local + com_offset_world_goal = R_goal @ com_offset_local + goal_com_pos = goal_root_pos + com_offset_world_goal + + goal_com_positions_list.append(goal_com_pos) + + goal_com_positions = torch.tensor(np.array(goal_com_positions_list), dtype=torch.float32, device=self.sim.device) + + # Calculate Distances + root_dist = torch.norm(current_root_pos - goal_positions, dim=1) + com_dist = torch.norm(current_com_pos - goal_com_positions, dim=1) + + # Success requires BOTH Root and COM to be within threshold + obj_success = (root_dist <= position_threshold) & (com_dist <= position_threshold) + + # --- 2. Gripper Goal Check --- + # Calculate Target Gripper Pose: T_ee_goal = T_obj_goal @ T_ee_in_obj + # We use the stored T_ee_in_obj from the start of the trajectory + ee_pos_final = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3] + + if hasattr(self, 'T_ee_in_obj') and self.T_ee_in_obj is not None: + target_ee_pos_list = [] + for b in range(B): + T_obj_goal = final_goal_matrices[b] + T_ee_goal = T_obj_goal @ self.T_ee_in_obj[b] + target_ee_pos_list.append(T_ee_goal[:3, 3]) + + target_ee_pos = torch.tensor(np.array(target_ee_pos_list), dtype=torch.float32, device=self.sim.device) + grip_goal_dist = torch.norm(ee_pos_final - target_ee_pos, dim=1) + gripper_success = grip_goal_dist <= 0.10 + else: + # Fallback if T_ee_in_obj not available (should not happen if follow_object_goals ran) + print("[WARN] T_ee_in_obj not found, skipping explicit Gripper Goal Check") + gripper_success = torch.ones(B, dtype=torch.bool, device=self.sim.device) + grip_goal_dist = torch.zeros(B, dtype=torch.float32, device=self.sim.device) + + # --- 3. Holding Check (Object in Gripper) --- + # Check if the distance between Gripper and Object COM has remained stable. + # We compare the final distance to the initial grasp distance. 
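+        # i.e. holding_success = | ||ee - obj_com||_now - ||ee - obj_com||_at_grasp | <= 2 cm,
+        # with a fallback to an absolute 15 cm proximity check when no initial
+        # grasp distance was recorded.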
+ obj_com_final = self.object_prim.data.root_com_pos_w[:, 0:3] + current_grip_com_dist = torch.norm(ee_pos_final - obj_com_final, dim=1) + + if hasattr(self, 'initial_grasp_dist') and self.initial_grasp_dist is not None: + grasp_deviation = torch.abs(current_grip_com_dist - self.initial_grasp_dist) + holding_success = grasp_deviation <= 0.02 + + holding_metric = grasp_deviation + holding_threshold = 0.02 + holding_metric_name = "Grip-COM Deviation" + else: + holding_success = current_grip_com_dist <= 0.15 + holding_metric = current_grip_com_dist + holding_threshold = 0.15 + holding_metric_name = "Grip-COM Dist" + + # Combined Success + success_mask = obj_success & gripper_success & holding_success + + print("="*50) + print(f"TASK VERIFIER: SUCCESS - {success_mask.sum().item()}/{B} environments succeeded") + print("="*50 + "\n") + + return success_mask + + + + def build_grasp_info( + self, + grasp_pos_w_batch: np.ndarray, # (B,3) GraspNet proposal in world frame + grasp_quat_w_batch: np.ndarray, # (B,4) wxyz + pre_dist_batch: np.ndarray, # (B,) + delta_batch: np.ndarray # (B,) + ) -> dict: + """ + return grasp info dict for all envs in batch. + """ + B = self.scene.num_envs + p_w = np.asarray(grasp_pos_w_batch, dtype=np.float32).reshape(B, 3) + q_w = np.asarray(grasp_quat_w_batch, dtype=np.float32).reshape(B, 4) + pre_d = np.asarray(pre_dist_batch, dtype=np.float32).reshape(B) + delt = np.asarray(delta_batch, dtype=np.float32).reshape(B) + + a_batch = grasp_approach_axis_batch(q_w) # (B,3) + + pre_p_w = (p_w - pre_d[:, None] * a_batch).astype(np.float32) + gra_p_w = (p_w + delt[:, None] * a_batch).astype(np.float32) + + origins = self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + pre_p_w = pre_p_w + origins + gra_p_w = gra_p_w + origins + + pre_pb, pre_qb = self._to_base(pre_p_w, q_w) + gra_pb, gra_qb = self._to_base(gra_p_w, q_w) + + return { + "pre_p_w": pre_p_w, "p_w": gra_p_w, "q_w": q_w, + "pre_p_b": pre_pb, "pre_q_b": pre_qb, + "p_b": gra_pb, "q_b": gra_qb, + "pre_dist": pre_d, "delta": delt, + } + + + def grasp_trials(self, gg): + + B = self.scene.num_envs + idx_all = list(range(len(gg))) + if len(idx_all) == 0: + print("[ERR] empty grasp list.") + return False + + rng = np.random.default_rng() + + pre_dist_const = 0.12 # m + + success = False + chosen_pose_w = None # (p_w, q_w) + chosen_pre = None + chosen_delta = None + + # assign different grasp proposals to different envs\ + + winners = [] + success = False + + + for start in range(self.grasp_round, len(idx_all), B): + block = idx_all[start : start + B] + if len(block) < B: + block = block + [block[-1]] * (B - len(block)) + grasp_pos_w_batch, grasp_quat_w_batch = [], [] + for idx in block: + p_w, q_w = grasp_to_world(gg[int(idx)]) + grasp_pos_w_batch.append(p_w.astype(np.float32)) + grasp_quat_w_batch.append(q_w.astype(np.float32)) + grasp_pos_w_batch = np.stack(grasp_pos_w_batch, axis=0) # (B,3) + grasp_quat_w_batch = np.stack(grasp_quat_w_batch, axis=0) # (B,4) + self.show_goal(grasp_pos_w_batch, grasp_quat_w_batch) + # random disturbance along approach axis + pre_dist_batch = np.full((B,), pre_dist_const, dtype=np.float32) + delta_batch = rng.normal(-0.002, self.std, size=(B,)).astype(np.float32) + + info = self.build_grasp_info(grasp_pos_w_batch, grasp_quat_w_batch, + pre_dist_batch, delta_batch) + + ok_batch, score_batch = self.execute_and_lift_once_batch(info) + if start + B > len(idx_all): + ok_batch = ok_batch[:(len(idx_all) - start)] + score_batch = score_batch[:(len(idx_all) - start)] + ok_cnt = 
int(ok_batch.sum()) + print(f"[SEARCH] block[{start}:{start+B}] -> success {ok_cnt}/{B}") + if ok_cnt > 0: + for candidate in range(len(score_batch)): + if score_batch[candidate] == np.max(score_batch): + winner = candidate + chosen_pose_w = (grasp_pos_w_batch[winner], grasp_quat_w_batch[winner]) + chosen_pre = float(pre_dist_batch[winner]) + chosen_delta = float(delta_batch[winner]) + success = True + winners.append({ + "success": success, + "chosen_pose_w": chosen_pose_w, + "chosen_pre": chosen_pre, + "chosen_delta": chosen_delta, + }) + self.grasp_round = start + B + if self.grasp_round >= len(idx_all): + self.grasp_round = 0 + self.std = self.std + 0.003 + self.trial_num +=1 + return winners + if not success: + print("[ERR] no proposal succeeded to lift after full search.") + return [{ + "success": success, + "chosen_pose_w": None, + "chosen_pre": None, + "chosen_delta": None, + }] + + + def replay_actions(self, actions: np.ndarray): + """ + Replay a sequence of recorded actions: (p[B,3], q[B,4], gripper[B,1]) + """ + n_steps = actions.shape[0] + + self.reset() + self.wait(gripper_open=True, steps=10) + + for t in range(n_steps): + print(f"[INFO] replay step {t}/{n_steps}") + act = actions[t:t+1] + p_b = torch.as_tensor(act[:, 0:3], dtype=torch.float32, device=self.sim.device) + q_b = torch.as_tensor(act[:, 3:7], dtype=torch.float32, device=self.sim.device) + g_b = act[:, 7] < 0.5 + jp, success = self.move_to(p_b, q_b, gripper_open=g_b) + if torch.any(success==False): + print(f"[ERR] replay step {t} failed.") + return False + jp = self.wait(gripper_open=g_b, steps=3) + return True + + def inference(self, std: float = 0.0) -> list[int]: + """ + Main function of the heuristic manipulation policy. + Physical trial-and-error grasping with approach-axis perturbation: + • Multiple grasp proposals executed in parallel; + • Every attempt does reset → pre → grasp → close → lift → check; + • Early stops when any env succeeds; then re-exec for logging. 
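+
+        Returns (as implemented): the int count of successful demos on the main
+        paths; failure paths return [] or 0, so the list[int] annotation is loose.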
+ """ + B = self.scene.num_envs + + self.wait(gripper_open=True, steps=10) + + # read grasp proposals + npy_path = self.grasp_path + if npy_path is None or (not os.path.exists(npy_path)): + print(f"[ERR] grasps npy not found: {npy_path}") + return [] + gg = GraspGroup().from_npy(npy_file_path=npy_path) + success_num = 0 + success_pregrasp_poses= [] + success_grasp_poses = [] + end_object_poses= [] + robot_poses = [] + assert self.grasp_idx < 0 or self.grasp_num == 1, "[ERR] grasp_idx and grasp_num cannot be set together" + while success_num < self.grasp_num and self.trial_num < self.num_trials: + if success_num > 0 and self.grasp_idx >= 0: + return success_num + if self.grasp_idx >= 0: + if self.grasp_idx >= len(gg): + print(f"[ERR] grasp_idx {self.grasp_idx} out of range [0,{len(gg)})") + return [] + print(f"[INFO] using fixed grasp index {self.grasp_idx} for all envs.") + p_w, q_w = grasp_to_world(gg[int(self.grasp_idx)]) + ret = { + "success": True, + "chosen_pose_w": (p_w.astype(np.float32), q_w.astype(np.float32)), + "chosen_pre": self.grasp_pre if self.grasp_pre is not None else 0.12, + "chosen_delta": self.grasp_delta if self.grasp_delta is not None else 0.0, + } + print(f"[INFO] grasp delta (m): {ret['chosen_delta']:.4f}") + rets = [ret] + + else: + rets = self.grasp_trials(gg) + + print("[INFO] Re-exec all envs with the winning grasp, then follow object goals.") + if (rets is None or rets[0]["success"] == False) and self.grasp_idx >= 0: + print("[ERR] no proposal succeeded to lift after full search.") + return 0 + + + + + B = self.scene.num_envs + for i in range(0, len(rets), B): + block = list(range(i, min(i + B, len(rets)))) + if len(block) < B: + block = block + [block[-1]] * (B - len(block)) + p_win = [rets[j]["chosen_pose_w"][0] for j in block] + q_win = [rets[j]["chosen_pose_w"][1] for j in block] + p_all = np.array(p_win) + q_all = np.array(q_win) + pre_all = np.array([rets[j]["chosen_pre"] for j in block], dtype=np.float32) + del_all = np.array([rets[j]["chosen_delta"] for j in block], dtype=np.float32) + + info_all = self.build_grasp_info(p_all, q_all, pre_all, del_all) + + # reset and conduct main process: open→pre→grasp→close→follow_object_goals + self.reset() + + init_manip_object_com = self.object_prim.data.root_com_pos_w[:, :3].cpu().numpy() + init_manip_object_com -= self.scene.env_origins.cpu().numpy() + self.init_manip_object_com = init_manip_object_com[0] + #print(self.object_prim.data.root_state_w[:, :7].cpu().numpy()) + cam_p = self.camera.data.pos_w + cam_q = self.camera.data.quat_w_ros + gp_w = torch.as_tensor(info_all["p_w"], dtype=torch.float32, device=self.sim.device) + gq_w = torch.as_tensor(info_all["q_w"], dtype=torch.float32, device=self.sim.device) + pre_w = torch.as_tensor(info_all["pre_p_w"], dtype=torch.float32, device=self.sim.device) + gp_cam, gq_cam = subtract_frame_transforms(cam_p, cam_q, gp_w, gq_w) + pre_cam, pre_qcm = subtract_frame_transforms(cam_p, cam_q, pre_w, gq_w) + + self.save_dict["grasp_pose_w"] = torch.cat([gp_w, gq_w], dim=1).cpu().numpy() + self.save_dict["pregrasp_pose_w"] = torch.cat([pre_w, gq_w], dim=1).cpu().numpy() + self.save_dict["grasp_pose_cam"] = torch.cat([gp_cam, gq_cam], dim=1).cpu().numpy() + self.save_dict["pregrasp_pose_cam"] = torch.cat([pre_cam, pre_qcm], dim=1).cpu().numpy() + + jp = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + self.wait(gripper_open=True, steps=4) + + # pre → grasp + jp, success = self.move_to(info_all["pre_p_b"], info_all["pre_q_b"], gripper_open=True) + if 
+                if torch.any(success == False):
+                    return success_num
+                self.save_dict["actions"].append(np.concatenate([info_all["pre_p_b"].cpu().numpy(), info_all["pre_q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1))
+                jp = self.wait(gripper_open=True, steps=3)
+
+                jp, success = self.move_to(info_all["p_b"], info_all["q_b"], gripper_open=True)
+                if torch.any(success == False):
+                    return success_num
+                self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1))
+
+                # close gripper
+                jp = self.wait(gripper_open=False, steps=50)
+                self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.ones((B, 1))], axis=1))
+
+                # object goal following
+                print(f"[INFO] lifting up by {self.goal_offset[2]} meters")
+                self.lift_up(height=self.goal_offset[2], gripper_open=False, steps=8)
+
+                # if self.task_type == "simple_pick_place" or self.task_type == "simple_pick":
+                #     jp, success_ids = self.follow_object_centers(jp, sample_step=1, visualize=True)
+                # elif self.task_type == "targetted_pick_place":
+                #     jp, success_ids = self.follow_object_goals(jp, sample_step=1, visualize=True)
+                # else:
+                #     raise ValueError(f"[ERR] Invalid task type: {self.task_type}")
+                jp, success_ids = self.follow_object_goals(jp, sample_step=1, visualize=True)
+
+                # world poses below are normalized into each env's origin frame
+                object_prim_world_pose = self.object_prim.data.root_state_w[:, :7].cpu().numpy()
+                object_prim_world_pose[:, :3] = object_prim_world_pose[:, :3] - self.scene.env_origins.cpu().numpy()
+
+                robot_world_pose = self.robot.data.root_state_w[:, :7].cpu().numpy()
+                robot_world_pose[:, :3] = robot_world_pose[:, :3] - self.scene.env_origins.cpu().numpy()
+
+                # success_ids may be a numpy array; keep only indices that map to real
+                # (non-padded) entries of this block before counting successes
+                print(f"[INFO] success_ids: {success_ids}")
+                left_rets = len(rets) - i
+                cleaned_success_ids = [m for m in success_ids if m < left_rets]
+                success_num += len(cleaned_success_ids)
+                if len(cleaned_success_ids) > 0:
+                    self.save_data(ignore_keys=["segmask", "depth"], env_ids=cleaned_success_ids, export_hdf5=False)
+                    for k in cleaned_success_ids:
+                        pregrasp_pose = self.save_dict["pregrasp_pose_w"][k]
+                        pregrasp_pose[:3] = pregrasp_pose[:3] - self.scene.env_origins.cpu().numpy()[k]
+
+                        grasp_pose = self.save_dict["grasp_pose_w"][k]
+                        grasp_pose[:3] = grasp_pose[:3] - self.scene.env_origins.cpu().numpy()[k]
+
+                        # object/robot poses already had env origins removed above
+                        end_object_pose = object_prim_world_pose[k]
+                        robot_pose = robot_world_pose[k]
+
+                        self.pregrasp_poses.append(pregrasp_pose)
+                        self.grasp_poses.append(grasp_pose)
+                        self.end_object_poses.append(end_object_pose)
+                        self.robot_poses.append(robot_pose)
+                print(f"[INFO] success_num: {success_num}")
+
+        return success_num
+
+    def from_data_to_task_cfg(self, key: str, already_exist: bool = False) -> TaskCfg:
+        try:
+            BASE_DIR = Path.cwd()
+            scene_json_path = BASE_DIR / "outputs" / key / "simulation" / "scene.json"
+            task_base_folder = BASE_DIR / "tasks"
+            base_folder = task_base_folder / key
+            scene_dict = json.load(open(scene_json_path))
+            if not already_exist:
+                task_cfg, base_folder = construct_task_config(key, scene_dict, task_base_folder)
+                return task_cfg
+            else:
+                task_cfg = get_task_cfg(key, base_folder)
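+
+            # self.traj_path stores the recorded object trajectory as [T,4,4]
+            # homogeneous matrices; flatten each to a 7-vector (pos + quat wxyz) below.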
+            object_trajectory = np.load(self.traj_path).astype(np.float32)
+            pose_quat_traj = []
+            for pose_mat in object_trajectory:
+                pose, quat = mat_to_pose(pose_mat)
+                pose_quat = np.concatenate([np.array(pose), np.array(quat)])
+                pose_quat_traj.append(pose_quat)
+            pose_quat_traj = np.array(pose_quat_traj).reshape(-1, 7).tolist()
+
+            # pregrasp/grasp poses were already converted to env-local frame in inference()
+            trajectory_cfg_list = []
+            final_gripper_close = self.final_gripper_closed
+            init_manip_object_com = self.init_manip_object_com.tolist()
+            for i in range(len(self.pregrasp_poses)):
+                if task_cfg.task_type == TaskType.TARGETTED_PICK_PLACE:
+                    success_metric = SuccessMetric(
+                        success_metric_type=SuccessMetricType.TARGET_POINT,
+                        end_pose=self.end_object_poses[0].tolist(),
+                        final_gripper_close=final_gripper_close,
+                    )
+                elif task_cfg.task_type == TaskType.SIMPLE_PICK_PLACE:
+                    final_object_world_pose = self.end_object_poses[i]
+                    ground_value = float(final_object_world_pose[2])
+                    success_metric = SuccessMetric(
+                        success_metric_type=SuccessMetricType.TARGET_PLANE,
+                        ground_value=ground_value,
+                        final_gripper_close=final_gripper_close,
+                        end_pose=self.end_object_poses[i].tolist(),
+                    )
+                else:
+                    success_metric = SuccessMetric(
+                        success_metric_type=SuccessMetricType.SIMPLE_LIFT,
+                        lift_height=0.05,
+                        final_gripper_close=final_gripper_close,
+                        end_pose=self.end_object_poses[i].tolist(),
+                    )
+                object_poses = {}
+                for obj in task_cfg.objects:
+                    object_poses[obj.object_id] = [0, 0, 0, 1, 0, 0, 0]
+
+                if self.robot_type == 'franka':
+                    robot_type = RobotType.FRANKA
+                elif self.robot_type == 'ur5':
+                    robot_type = RobotType.UR5
+                else:
+                    raise ValueError(f"[ERR] Invalid robot type: {self.robot_type}")
+
+                trajectory_cfg = TrajectoryCfg(
+                    robot_pose=self.robot_poses[i],
+                    object_poses=object_poses,
+                    object_trajectory=pose_quat_traj,
+                    final_gripper_close=final_gripper_close,
+                    success_metric=success_metric,
+                    pregrasp_pose=self.pregrasp_poses[i],
+                    grasp_pose=self.grasp_poses[i],
+                    robot_type=robot_type,
+                    init_manip_object_com=init_manip_object_com,
+                )
+                trajectory_cfg_list.append(trajectory_cfg)
+            add_reference_trajectory(task_cfg, trajectory_cfg_list, base_folder)
+        except Exception as e:
+            print(f"[ERR] Error: {e}")
+            import traceback
+            traceback.print_exc()
+            return None
+        return task_cfg
+
+
+def sim_heuristic_manip(key: str, args_cli: argparse.Namespace, config_path: Optional[str] = None, config_dict: Optional[dict] = None):
+    """
+    Run heuristic manipulation simulation.
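+    Grasp proposals are searched until `grasp_num` successes are recorded or
+    `num_trials` trial rounds are exhausted.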
+
+    Args:
+        key: Scene key to run (e.g., "demo_video")
+        args_cli: Command-line arguments (argparse.Namespace)
+        config_path: Path to config file (yaml/json) - alternative to args_cli
+        config_dict: Config dictionary - alternative to args_cli
+
+    Usage:
+        # Option 1: Use command-line args (original)
+        sim_heuristic_manip("demo_video", args_cli)
+
+        # Option 2: Use config file
+        sim_heuristic_manip("demo_video", None, config_path="config.yaml")
+
+        # Option 3: Use config dict
+        sim_heuristic_manip("demo_video", None, config_dict={"num_envs": 4, "num_trials": 10})
+    """
+    # Create args from config if not provided
+    # if args_cli is None:
+    #     args_cli = create_args_from_config(config_path=config_path, config_dict=config_dict)
+    BASE_DIR = Path.cwd()
+    out_dir = BASE_DIR / "outputs"
+    data_dir = BASE_DIR / "h5py"
+
+    args_cli.key = key
+    local_img_folder = key
+    # Load config from running_cfg; CLI args override when present.
+    # argparse.Namespace has no .hasattr() method, so use the hasattr() builtin
+    # for num_envs, num_trials, grasp_num, and robot below.
+    heuristic_cfg = get_heuristic_config(key)
+    num_envs = args_cli.num_envs if hasattr(args_cli, "num_envs") and args_cli.num_envs is not None else heuristic_cfg.num_envs
+    num_trials = args_cli.num_trials if hasattr(args_cli, "num_trials") and args_cli.num_trials is not None else heuristic_cfg.num_trials
+    grasp_num = args_cli.grasp_num if hasattr(args_cli, "grasp_num") and args_cli.grasp_num is not None else heuristic_cfg.grasp_num
+    robot = args_cli.robot if hasattr(args_cli, "robot") and args_cli.robot is not None else heuristic_cfg.robot
+    print(f"[INFO] Using config for key '{key}': num_envs={num_envs}, num_trials={num_trials}, grasp_num={grasp_num}, robot={robot}")
+
+    sim_cfgs = load_sim_parameters(BASE_DIR, key)
+    env, _ = make_env(
+        cfgs=sim_cfgs, num_envs=num_envs,
+        device=args_cli.device,
+        bg_simplify=False,
+    )
+    sim, scene = env.sim, env.scene
+
+    my_sim = HeuristicManipulation(
+        sim, scene, sim_cfgs=sim_cfgs,
+        args=args_cli, out_dir=out_dir, img_folder=local_img_folder,
+        data_dir=data_dir, num_trials=num_trials, grasp_num=grasp_num, robot=robot,
+    )
+
+    robot_pose = torch.tensor(sim_cfgs["robot_cfg"]["robot_pose"], dtype=torch.float32, device=my_sim.sim.device)  # [7], pos(3)+quat(wxyz)(4)
+    my_sim.set_robot_pose(robot_pose)
+    my_sim.reset()
+    task_cfg = my_sim.from_data_to_task_cfg(key, already_exist=False)
+    my_sim.task_cfg = task_cfg
+    success_num = my_sim.inference()
+    print(f"[INFO] success_num: {success_num}")
+    my_sim.from_data_to_task_cfg(key, already_exist=True)
+    env.close()
+    return True
+
+def main():
+    base_dir = Path.cwd()
+    key = args_cli.key
+    sim_heuristic_manip(key, args_cli)
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"[ERR] Error: {e}")
+        sys.exit(1)
+    finally:
+        simulation_app.close()
diff --git a/openreal2sim/simulation/isaaclab/demo/sim_randomize_rollout.py b/openreal2sim/simulation/isaaclab/demo/sim_randomize_rollout.py
new file mode 100755
index 0000000..9f61f24
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab/demo/sim_randomize_rollout.py
@@ -0,0 +1,795 @@
+"""
+Heuristic manipulation policy in Isaac Lab simulation.
+Using grasping and motion planning to perform object manipulation tasks.
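+This rollout variant replays reference grasp and object trajectories across
+randomized scene configurations and keeps the configs of the envs that succeed.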
+""" +from __future__ import annotations + +# ─────────── AppLauncher ─────────── +import argparse, os, json, random, transforms3d +from pathlib import Path +import numpy as np +import torch +import yaml +import sys +from isaaclab.app import AppLauncher +from typing import Optional, List +file_path = Path(__file__).resolve() +import imageio + +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) +from envs.task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg +from envs.task_construct import construct_task_config, add_reference_trajectory, load_task_cfg, add_generated_trajectories +from envs.randomizer import Randomizer +from envs.running_cfg import get_rollout_config, get_randomizer_config +# ─────────── CLI ─────────── +parser = argparse.ArgumentParser("sim_policy") +parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") +parser.add_argument("--robot", type=str, default="franka") +parser.add_argument("--num_envs", type=int, default=None, help="Number of environments (overrides running_cfg)") +parser.add_argument("--total_num", type=int, default=None, help="Total number of trajectories required (overrides running_cfg)") +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() +args_cli.enable_cameras = True +args_cli.headless = True # headless mode for batch execution +app_launcher = AppLauncher(vars(args_cli)) +simulation_app = app_launcher.app + +# ─────────── Runtime imports ─────────── +import isaaclab.sim as sim_utils +from isaaclab.utils.math import subtract_frame_transforms + +# ─────────── Simulation environments ─────────── +from sim_base import BaseSimulator, get_next_demo_id +from sim_env_factory import make_env +from sim_utils.transform_utils import pose_to_mat, mat_to_pose, grasp_to_world, grasp_approach_axis_batch +from sim_utils.sim_utils import load_sim_parameters + +BASE_DIR = Path.cwd() + +out_dir = BASE_DIR / "outputs" + +def pose_distance(T1, T2): + """ + Compute translation and rotation (angle) distance between two SE(3) transformation matrices in torch. + Args: + T1, T2: [..., 4, 4] torch.Tensor or np.ndarray, can be batched + Returns: + trans_dist: translation distance(s) + angle: rotational angle(s) + """ + if not torch.is_tensor(T1): + T1 = torch.tensor(T1, dtype=torch.float32) + if not torch.is_tensor(T2): + T2 = torch.tensor(T2, dtype=torch.float32) + + # Translation distance + t1 = T1[..., :3, 3] + t2 = T2[..., :3, 3] + trans_dist = torch.norm(t2 - t1, dim=-1) + + # Rotation difference (angle) + R1 = T1[..., :3, :3] + R2 = T2[..., :3, :3] + dR = torch.matmul(R2, R1.transpose(-2, -1)) + trace = dR[..., 0, 0] + dR[..., 1, 1] + dR[..., 2, 2] + cos_angle = (trace - 1) / 2 + + # Handle numerical precision + cos_angle = torch.clamp(cos_angle, -1.0, 1.0) + angle = torch.acos(cos_angle) + return trans_dist, angle + +def compute_poses_from_traj_cfg(traj_cfg_list): + """ + Extract poses and trajectories from a list of TrajectoryCfg objects. 
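+    Each object trajectory is converted from per-step 7-vectors (pos + quat wxyz)
+    into a [T,4,4] stack of homogeneous matrices via pose_to_mat.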
+
+    Args:
+        traj_cfg_list: List of TrajectoryCfg objects
+
+    Returns:
+        robot_poses_list: List of robot poses [7], one per trajectory
+        object_poses_dict: Dict mapping oid -> list of [7] pose arrays (pos + quat wxyz)
+        object_trajectory_list: List of object trajectories as [T,4,4] matrix stacks
+        final_gripper_state_list: List of final gripper states (True = closed)
+        pregrasp_pose_list: List of pregrasp poses [7]
+        grasp_pose_list: List of grasp poses [7]
+        end_pose_list: List of success-metric end poses [7]
+    """
+    robot_poses_list = []
+    object_poses_dict = {}  # {oid: [pose7, ...]}
+    object_trajectory_list = []
+    final_gripper_state_list = []
+    pregrasp_pose_list = []
+    grasp_pose_list = []
+    end_pose_list = []
+
+    for traj_cfg in traj_cfg_list:
+        robot_poses_list.append(traj_cfg.robot_pose)
+
+        # Extract object poses: traj_cfg.object_poses maps oid -> pose [7]
+        for oid in traj_cfg.object_poses.keys():
+            pose = traj_cfg.object_poses[oid]
+            oid_str = str(oid)
+            if oid_str not in object_poses_dict:
+                object_poses_dict[oid_str] = []
+            object_poses_dict[oid_str].append(np.array(pose, dtype=np.float32))
+        traj = []
+        for i in range(len(traj_cfg.object_trajectory)):
+            mat = pose_to_mat(traj_cfg.object_trajectory[i][:3], traj_cfg.object_trajectory[i][3:7])
+            traj.append(mat)
+        object_trajectory_list.append(np.array(traj, dtype=np.float32))
+        final_gripper_state_list.append(traj_cfg.final_gripper_close)
+        pregrasp_pose_list.append(np.array(traj_cfg.pregrasp_pose, dtype=np.float32))
+        grasp_pose_list.append(np.array(traj_cfg.grasp_pose, dtype=np.float32))
+        end_pose_list.append(np.array(traj_cfg.success_metric.end_pose, dtype=np.float32))
+
+    return robot_poses_list, object_poses_dict, object_trajectory_list, final_gripper_state_list, pregrasp_pose_list, grasp_pose_list, end_pose_list
+
+def get_initial_com_pose(traj_cfg):
+    if traj_cfg.init_manip_object_com is not None:
+        return traj_cfg.init_manip_object_com
+    else:
+        return None
+
+
+# ──────────────────────────── Randomized Rollout Execution ────────────────────────────
+class RandomizeExecution(BaseSimulator):
+    """
+    Batched replay of randomized trajectory configurations:
+      • Each env is reset to a randomized scene/robot/object layout;
+      • Every rollout does reset → pre → grasp → close → follow object goals;
+      • Successful env ids are collected and exported as demonstrations.
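+
+    Usage (illustrative sketch, mirroring sim_randomize_rollout below):
+        my_sim = RandomizeExecution(sim, scene, sim_cfgs=sim_cfgs, data_dir=data_dir,
+                                    record=True, args_cli=args_cli)
+        my_sim.task_cfg = task_cfg
+        success_env_ids = my_sim.run_batch_trajectory(traj_cfg_list)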
+    """
+    def __init__(self, sim, scene, sim_cfgs: dict, data_dir: Path, record: bool = True, args_cli: Optional[argparse.Namespace] = None, bg_rgb: Optional[np.ndarray] = None):
+        robot_pose = torch.tensor(
+            sim_cfgs["robot_cfg"]["robot_pose"],
+            dtype=torch.float32,
+            device=sim.device,
+        )
+        super().__init__(
+            sim=sim, sim_cfgs=sim_cfgs, scene=scene, args=args_cli,
+            robot_pose=robot_pose, cam_dict=sim_cfgs["cam_cfg"],
+            out_dir=out_dir, img_folder=args_cli.key, data_dir=data_dir,
+            enable_motion_planning=True,
+            set_physics_props=True, debug_level=0,
+        )
+
+        self.selected_object_id = sim_cfgs["demo_cfg"]["manip_object_id"]
+        self._selected_object_id = str(self.selected_object_id)  # store as string for prim-name mapping
+        self._update_object_prim()  # update object_prim based on selected_object_id
+        self.record = record  # whether to record data
+
+        self.task_type = sim_cfgs["demo_cfg"]["task_type"]
+        self.goal_offset = [0, 0, sim_cfgs["demo_cfg"]["goal_offset"]]
+
+    def reset(self, env_ids=None):
+        super().reset(env_ids)
+        device = self.object_prim.device
+        if env_ids is None:
+            env_ids_t = self._all_env_ids.to(device)  # (B,)
+        else:
+            env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1)  # (M,)
+        M = int(env_ids_t.shape[0])
+
+        # --- object pose/vel: place each object at its randomized pose in the env frame ---
+        env_origins = self.scene.env_origins.to(device)[env_ids_t]  # (M,3)
+
+        # Set poses for all objects from object_poses_dict
+        from sim_env_factory import get_prim_name_from_oid
+        for oid in self.object_poses_dict.keys():
+            prim_name = get_prim_name_from_oid(str(oid))
+            object_prim = self.scene[prim_name]
+
+            # object_poses_dict[oid] is a list of [7] pose arrays (pos + quat wxyz), one per env
+            if len(self.object_poses_dict[oid]) == 0:
+                continue
+            poses_np = np.array(self.object_poses_dict[oid], dtype=np.float32)[env_ids_t.cpu().numpy()]
+            pos, quat = poses_np[:, :3], poses_np[:, 3:7]
+            object_pose = torch.zeros((M, 7), device=device, dtype=torch.float32)
+            object_pose[:, :3] = env_origins + torch.tensor(pos, dtype=torch.float32, device=device)
+            object_pose[:, 3:7] = torch.tensor(quat, dtype=torch.float32, device=device)  # wxyz
+
+            object_prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t)
+            object_prim.write_root_velocity_to_sim(
+                torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t
+            )
+
+            object_prim.write_data_to_sim()
+
+        # robot pose: per-env randomized base pose, shifted into each env's origin frame;
+        # select the rows for env_ids so the (M,7) tensor matches the env_ids argument
+        rp_local = np.array(self.robot_poses_list, dtype=np.float32)
+        robot_pose_world = rp_local[env_ids_t.cpu().numpy()].copy()  # (M,7)
+        robot_pose_world[:, :3] += env_origins.cpu().numpy()
+        self.robot.write_root_pose_to_sim(torch.tensor(robot_pose_world, dtype=torch.float32, device=device), env_ids=env_ids_t)
+        self.robot.write_root_velocity_to_sim(
+            torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t
+        )
+
+        joint_pos = self.robot.data.default_joint_pos.to(self.robot.device)[env_ids_t]  # (M,7)
+        joint_vel = self.robot.data.default_joint_vel.to(self.robot.device)[env_ids_t]  # (M,7)
+        self.robot.write_joint_state_to_sim(joint_pos, joint_vel, env_ids=env_ids_t)
+        self.robot.write_data_to_sim()
+
+        self.clear_data()
+
+    def
compute_components(self): + self.robot_poses_list, self.object_poses_dict, self.object_trajectory_list, self.final_gripper_state_list, self.pregrasp_pose_list, self.grasp_pose_list, self.end_pose_list = compute_poses_from_traj_cfg(self.traj_cfg_list) + + + + def compute_object_goal_traj(self): + B = self.scene.num_envs + # obj_state = self.object_prim.data.root_com_state_w[:, :7] # [B,7], pos(3)+quat(wxyz)(4) + obj_state = self.object_prim.data.root_state_w[:, :7] # [B,7], pos(3)+quat(wxyz)(4) + self.show_goal(obj_state[:, :3], obj_state[:, 3:7]) + + obj_state_np = obj_state.detach().cpu().numpy().astype(np.float32) + offset_np = np.asarray(self.goal_offset, dtype=np.float32) + obj_state_np[:, :3] += offset_np # raise a bit to avoid collision + + # Note: here the relative traj Δ_w is defined in world frame with origin (0,0,0), + # Hence, we need to normalize it to each env's origin frame. + origins = self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + obj_state_np[:, :3] -= origins # normalize to env origin frame + + # —— 3) Precompute absolute object goals for all envs —— + T = self.object_trajectory_list[0].shape[0] + obj_goal = np.zeros((B, T, 4, 4), dtype=np.float32) + for b in range(B): + for t in range(1, T): + goal_4x4 = self.object_trajectory_list[b][t] + goal_4x4[:3, 3] += origins[b] # back to world frame + goal_4x4[:3, 3] += offset_np + obj_goal[b, t] = goal_4x4 + + self.obj_goal_traj_w = obj_goal + + + def lift_up(self, height=0.12, gripper_open=False, steps=8): + """ + Lift up by a certain height (m) from current EE pose. + """ + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] + target_p = ee_w[:, :3].clone() + target_p[:, 2] += height + + root = self.robot.data.root_state_w[:, 0:7] + p_lift_b, q_lift_b = subtract_frame_transforms( + root[:, 0:3], root[:, 3:7], + target_p, ee_w[:, 3:7] + ) # [B,3], [B,4] + jp, success = self.move_to(p_lift_b, q_lift_b, gripper_open=gripper_open) + jp = self.wait(gripper_open=gripper_open, steps=steps) + return jp + + def follow_object_goals(self, start_joint_pos, sample_step=1, recalibrate_interval = 3, visualize=True): + """ + follow precompute object absolute trajectory: self.obj_goal_traj_w: + T_obj_goal[t] = Δ_w[t] @ T_obj_init + + EE-object transform is fixed: + T_ee_goal[t] = T_obj_goal[t] @ (T_obj_grasp^{-1} @ T_ee_grasp) + Here T_obj_grasp / T_ee_grasp is the transform at the moment of grasping. 
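+        The EE-in-object transform is re-estimated every `recalibrate_interval`
+        steps from the measured EE and object world poses, so in-hand slip is
+        absorbed instead of accumulating over the trajectory.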
+ """ + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + + + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + + t_iter = list(range(1, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + + ee_pos_initial = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3] + obj_pos_initial = self.object_prim.data.root_com_pos_w[:, 0:3] + initial_grasp_dist = torch.norm(ee_pos_initial - obj_pos_initial, dim=1) # [B] + self.initial_grasp_dist = initial_grasp_dist + + for t in t_iter: + if recalibrate_interval> 0 and (t-1) % recalibrate_interval == 0: + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + T_ee_in_obj = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_in_obj.append((np.linalg.inv(T_obj_w) @ T_ee_w).astype(np.float32)) + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] # (4,4) + T_ee_goal = T_obj_goal @ T_ee_in_obj[b] # (4,4) + pos_b, quat_b = mat_to_pose(T_ee_goal) + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = torch.as_tensor(np.stack(goal_quat_list), dtype=torch.float32, device=self.sim.device) + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False) + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + + is_grasp_success = self.is_grasp_success() + is_success = self.is_success() + + print('[INFO] last obj goal', obj_goal_all[:, -1]) + print('[INFO] last obj pos', self.object_prim.data.root_state_w[:, :3]) + for b in range(B): + if self.final_gripper_state_list[b]: + self.wait(gripper_open=False, steps=10, record = self.record) + else: + self.wait(gripper_open=True, steps=10, record = self.record) + + return joint_pos, is_success + + + def follow_object_centers(self, start_joint_pos, sample_step=1, recalibrate_interval = 3, visualize=True): + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + + t_iter = list(range(1, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + ee_pos_initial = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3] + obj_pos_initial = self.object_prim.data.root_com_pos_w[:, 0:3] + initial_grasp_dist = torch.norm(ee_pos_initial - obj_pos_initial, dim=1) # [B] + self.initial_grasp_dist = initial_grasp_dist + + + for t in t_iter: + if recalibrate_interval> 0 and (t-1) % recalibrate_interval == 0: + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + # obj_w = self.object_prim.data.root_com_state_w[:, :7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + T_ee_ws = [] + T_obj_ws = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + 
T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_ws.append(T_ee_w) + T_obj_ws.append(T_obj_w) + print(f"[INFO] recalibrated at step {t}/{T}") + + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] + trans_offset = T_obj_goal - T_obj_ws[b] + T_ee_goal = T_ee_ws[b] + trans_offset + pos_b, quat_b = mat_to_pose(T_ee_goal) + + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = ee_w[:, 3:7] + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False, record = self.record) + + print(obj_goal_all[:,t]) + print(self.object_prim.data.root_state_w[:, :7]) + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + is_grasp_success = self.is_grasp_success() + print('[INFO] last obj goal', obj_goal_all[:, -1]) + print('[INFO] last obj pos', self.object_prim.data.root_state_w[:, :3]) + for b in range(B): + if self.final_gripper_state_list[b]: + self.wait(gripper_open=False, steps=10, record = self.record) + else: + self.wait(gripper_open=True, steps=10, record = self.record) + + return joint_pos, is_grasp_success + + def refine_grasp_pose(self, init_manip_object_com, grasp_pose, pregrasp_pose): + current_manip_object_com = self.object_prim.data.root_com_pos_w[:, :3].cpu().numpy() + current_manip_object_com -= self.scene.env_origins.cpu().numpy() + grasp_pose_w = grasp_pose + pregrasp_pose_w = pregrasp_pose + grasp_pose_w[:, :3] += current_manip_object_com - init_manip_object_com + pregrasp_pose_w[:, :3] += current_manip_object_com - init_manip_object_com + return grasp_pose_w, pregrasp_pose_w + + # def is_success(self): + # obj_w = self.object_prim.data.root_state_w[:, :7] + # origins = self.scene.env_origins + # obj_w[:, :3] -= origins + # trans_dist_list = [] + # angle_list = [] + # B = self.scene.num_envs + # for b in range(B): + # obj_pose_l = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + # goal_pose_l = pose_to_mat(self.end_pose_list[b][:3], self.end_pose_list[b][3:7]) + # trans_dist, angle = pose_distance(obj_pose_l, goal_pose_l) + # trans_dist_list.append(trans_dist) + # angle_list.append(angle) + # trans_dist = torch.tensor(np.stack(trans_dist_list)) + # angle = torch.tensor(np.stack(angle_list)) + # print(f"[INFO] trans_dist: {trans_dist}, angle: {angle}") + # if self.task_type == "simple_pick_place" or self.task_type == "simple_pick": + # is_success = trans_dist < 0.10 + # elif self.task_type == "targetted_pick_place": + # is_success = (trans_dist < 0.10) & (angle < np.radians(10)) + # else: + # raise ValueError(f"[ERR] Invalid task type: {self.task_type}") + # return is_success.cpu().numpy() + + + def is_success(self, position_threshold: float = 0.10, gripper_threshold: float = 0.10, holding_threshold: float = 0.02) -> torch.Tensor: + """ + Verify if the manipulation task succeeded by checking: + 1. Object is at Goal (Distance < 10cm) + 2. Gripper is at Goal (Distance < 10cm) - Explicit check using T_ee_in_obj + 3. 
Object is in Gripper (Deviation < 2cm)
+
+        Args:
+            position_threshold: Distance threshold for the Object-Goal check (default: 0.10 m)
+            gripper_threshold: Distance threshold for the Gripper-Goal check (default: 0.10 m)
+            holding_threshold: Deviation threshold for the holding check (default: 0.02 m)
+
+        Returns:
+            torch.Tensor: Boolean tensor [B] indicating success for each environment
+        """
+        B = self.scene.num_envs
+        # --- 1. Object Goal Check ---
+        final_goal_matrices = self.obj_goal_traj_w[:, -1, :, :]  # [B, 4, 4]
+        goal_positions_np = final_goal_matrices[:, :3, 3]
+        goal_positions = torch.tensor(goal_positions_np, dtype=torch.float32, device=self.sim.device)
+
+        # Current root and COM
+        current_root_state = self.object_prim.data.root_state_w
+        current_root_pos = current_root_state[:, :3]
+        current_root_quat = current_root_state[:, 3:7]
+        current_com_pos = self.object_prim.data.root_com_pos_w[:, :3]
+
+        # Calculate goal COM positions.
+        # The COM offset is constant in the object's local frame, so:
+        # (1) get the current COM offset in the local frame, (2) apply it to the goal pose.
+        goal_com_positions_list = []
+        for b in range(B):
+            # Current state
+            root_pos_cur = current_root_pos[b].cpu().numpy()
+            root_quat_cur = current_root_quat[b].cpu().numpy()  # wxyz format
+            com_pos_cur = current_com_pos[b].cpu().numpy()
+
+            # COM offset in world frame
+            com_offset_world = com_pos_cur - root_pos_cur  # [3]
+
+            # Rotate the COM offset into the object's local frame (assuming no scale):
+            # com_offset_local = R_cur^T @ com_offset_world
+            R_cur = transforms3d.quaternions.quat2mat(root_quat_cur)  # wxyz quat -> rotation matrix
+            com_offset_local = R_cur.T @ com_offset_world
+
+            # Goal state
+            T_goal_root = final_goal_matrices[b]  # [4, 4] numpy array
+            goal_root_pos = T_goal_root[:3, 3]
+            R_goal = T_goal_root[:3, :3]
+
+            # Apply the local offset to the goal pose:
+            # goal_com_pos = goal_root_pos + R_goal @ com_offset_local
+            com_offset_world_goal = R_goal @ com_offset_local
+            goal_com_pos = goal_root_pos + com_offset_world_goal
+
+            goal_com_positions_list.append(goal_com_pos)
+
+        goal_com_positions = torch.tensor(np.array(goal_com_positions_list), dtype=torch.float32, device=self.sim.device)
+
+        # Calculate distances
+        root_dist = torch.norm(current_root_pos - goal_positions, dim=1)
+        com_dist = torch.norm(current_com_pos - goal_com_positions, dim=1)
+
+        # Success requires BOTH root and COM to be within threshold
+        obj_success = (root_dist <= position_threshold) & (com_dist <= position_threshold)
+
+        # --- 2. Gripper Goal Check ---
+        # Target gripper pose: T_ee_goal = T_obj_goal @ T_ee_in_obj,
+        # using the T_ee_in_obj stored at the start of the trajectory.
+        ee_pos_final = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3]
+
+        if hasattr(self, 'T_ee_in_obj') and self.T_ee_in_obj is not None:
+            target_ee_pos_list = []
+            for b in range(B):
+                T_obj_goal = final_goal_matrices[b]
+                T_ee_goal = T_obj_goal @ self.T_ee_in_obj[b]
+                target_ee_pos_list.append(T_ee_goal[:3, 3])
+
+            target_ee_pos = torch.tensor(np.array(target_ee_pos_list), dtype=torch.float32, device=self.sim.device)
+            grip_goal_dist = torch.norm(ee_pos_final - target_ee_pos, dim=1)
+            gripper_success = grip_goal_dist <= gripper_threshold
+        else:
+            # Fallback if T_ee_in_obj is unavailable (should not happen if follow_object_goals ran)
+            print("[WARN] T_ee_in_obj not found, skipping explicit Gripper Goal Check")
+            gripper_success = torch.ones(B, dtype=torch.bool, device=self.sim.device)
+            grip_goal_dist = torch.zeros(B, dtype=torch.float32, device=self.sim.device)
+
+        # --- 3. Holding Check (Object in Gripper) ---
+        # Check that the gripper-to-object-COM distance has remained stable
+        # by comparing the final distance to the distance at grasp time.
+        obj_com_final = self.object_prim.data.root_com_pos_w[:, 0:3]
+        current_grip_com_dist = torch.norm(ee_pos_final - obj_com_final, dim=1)
+
+        if hasattr(self, 'initial_grasp_dist') and self.initial_grasp_dist is not None:
+            grasp_deviation = torch.abs(current_grip_com_dist - self.initial_grasp_dist)
+            holding_success = grasp_deviation <= holding_threshold
+            holding_metric = grasp_deviation
+            holding_metric_name = "Grip-COM Deviation"
+        else:
+            # Fallback: no recorded grasp distance, so use an absolute distance bound instead
+            holding_threshold = 0.15
+            holding_success = current_grip_com_dist <= holding_threshold
+            holding_metric = current_grip_com_dist
+            holding_metric_name = "Grip-COM Dist"
+
+        # Combined success
+        success_mask = obj_success & gripper_success & holding_success
+
+        print("="*50)
+        print(f"TASK VERIFIER: SUCCESS - {success_mask.sum().item()}/{B} environments succeeded")
+        print("="*50 + "\n")
+
+        return success_mask
+
+    def is_grasp_success(self):
+        ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7]
+        obj_w = self.object_prim.data.root_com_pos_w[:, 0:3]
+        dist = torch.norm(obj_w[:, :3] - ee_w[:, :3], dim=1)  # [B]
+        print(f"[INFO] dist: {dist}")
+        return (dist < 0.15).cpu().numpy()
+
+    def viz_object_goals(self, sample_step=1, hold_steps=20):
+        self.reset()
+        self.wait(gripper_open=True, steps=10, record=False)
+        B = self.scene.num_envs
+        env_ids = torch.arange(B, device=self.object_prim.device, dtype=torch.long)
+        goals = self.obj_goal_traj_w
+        t_iter = list(range(0, goals.shape[1], sample_step))
+        t_iter = t_iter + [goals.shape[1]-1] if t_iter[-1] != goals.shape[1]-1 else t_iter
+        t_iter = t_iter[-1:]  # only visualize the final goal
+        for t in t_iter:
+            print(f"[INFO] viz object goal step {t}/{goals.shape[1]}")
+            pos_list, quat_list = [], []
+            for b in range(B):
+                pos, quat = mat_to_pose(goals[b, t])
+                pos_list.append(pos.astype(np.float32))
+                quat_list.append(quat.astype(np.float32))
+            pose = self.object_prim.data.root_state_w[:, :7]
+            # pose = self.object_prim.data.root_com_state_w[:, :7]
+            pose[:, :3] = torch.tensor(np.stack(pos_list), dtype=torch.float32, device=pose.device)
+            pose[:, 3:7] = torch.tensor(np.stack(quat_list), dtype=torch.float32, device=pose.device)
+            self.show_goal(pose[:, :3], pose[:, 3:7])
+
+            for _ in range(hold_steps):
+                self.object_prim.write_root_pose_to_sim(pose, env_ids=env_ids)
+                self.object_prim.write_data_to_sim()
+                self.step()
+
+    # ---------- Helpers ----------
+    def _to_base(self, pos_w: np.ndarray | torch.Tensor, quat_w: np.ndarray | torch.Tensor):
+        """World → robot base frame for all envs."""
+        root = self.robot.data.root_state_w[:, 0:7]  # [B,7]
+        p_w, q_w = self._ensure_batch_pose(pos_w, quat_w)
+        pb, qb = subtract_frame_transforms(
+            root[:, 0:3], root[:, 3:7], p_w, q_w
+        )
+        return pb, qb  # [B,3], [B,4]
+
+    # ---------- Batched execution & lift-check ----------
+    def build_grasp_info(
+        self,
+        grasp_pos_w_batch: np.ndarray,    # (B,3) GraspNet proposal in world frame
+        grasp_quat_w_batch: np.ndarray,   # (B,4) wxyz
+        pregrasp_pos_w_batch: np.ndarray,
+        pregrasp_quat_w_batch: np.ndarray,
+    ) -> dict:
+        B = self.scene.num_envs
+        p_w = np.asarray(grasp_pos_w_batch, dtype=np.float32).reshape(B, 3)
+        q_w = np.asarray(grasp_quat_w_batch, dtype=np.float32).reshape(B, 4)
+        pre_p_w = np.asarray(pregrasp_pos_w_batch, dtype=np.float32).reshape(B, 3)
+        pre_q_w = np.asarray(pregrasp_quat_w_batch, dtype=np.float32).reshape(B, 4)
+
+        origins =
self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + pre_p_w = pre_p_w + origins + p_w = p_w + origins + + pre_pb, pre_qb = self._to_base(pre_p_w, pre_q_w) + pb, qb = self._to_base(p_w, q_w) + + return { + "pre_p_w": pre_p_w, "p_w": p_w, "q_w": q_w, + "pre_p_b": pre_pb, "pre_q_b": pre_qb, + "p_b": pb, "q_b": qb, + } + + + def inference(self, std: float = 0.0) -> list[int]: + """ + Main function of the heuristic manipulation policy. + Physical trial-and-error grasping with approach-axis perturbation: + • Multiple grasp proposals executed in parallel; + • Every attempt does reset → pre → grasp → close → lift → check; + • Early stops when any env succeeds; then re-exec for logging. + """ + B = self.scene.num_envs + + #self.wait(gripper_open=True, steps=10, record = self.record) + # reset and conduct main process: open→pre→grasp→close→follow_object_goals + self.reset() + + cam_p = self.camera.data.pos_w + cam_q = self.camera.data.quat_w_ros + gp_w = torch.as_tensor(np.array(self.grasp_pose_list, dtype=np.float32)[:,:3], dtype=torch.float32, device=self.sim.device) + gq_w = torch.as_tensor(np.array(self.grasp_pose_list, dtype=np.float32)[:,3:7], dtype=torch.float32, device=self.sim.device) + pre_w = torch.as_tensor(np.array(self.pregrasp_pose_list, dtype=np.float32)[:,:3], dtype=torch.float32, device=self.sim.device) + init_manip_object_com = get_initial_com_pose(self.task_cfg.reference_trajectory[-1]) + if init_manip_object_com is not None: + gp_w, pre_w = self.refine_grasp_pose(init_manip_object_com, gp_w, pre_w) + gp_w = torch.as_tensor(gp_w, dtype=torch.float32, device=self.sim.device) + pre_w = torch.as_tensor(pre_w, dtype=torch.float32, device=self.sim.device) + gp_cam, gq_cam = subtract_frame_transforms(cam_p, cam_q, gp_w, gq_w) + pre_cam, pre_qcm = subtract_frame_transforms(cam_p, cam_q, pre_w, gq_w) + self.save_dict["grasp_pose_cam"] = torch.cat([gp_cam, gq_cam], dim=1).unsqueeze(0).cpu().numpy() + self.save_dict["pregrasp_pose_cam"] = torch.cat([pre_cam, pre_qcm], dim=1).unsqueeze(0).cpu().numpy() + + jp = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + self.wait(gripper_open=True, steps=4, record = self.record) + + # pre → grasp + info_all = self.build_grasp_info(gp_w.cpu().numpy(), gq_w.cpu().numpy(), pre_w.cpu().numpy(), gq_w.cpu().numpy()) + jp, success = self.move_to(info_all["pre_p_b"], info_all["pre_q_b"], gripper_open=True, record = self.record) + if torch.any(success==False): return [] + self.save_dict["actions"].append(np.concatenate([info_all["pre_p_b"].cpu().numpy(), info_all["pre_q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1)) + jp = self.wait(gripper_open=True, steps=3, record = self.record) + + jp, success = self.move_to(info_all["p_b"], info_all["q_b"], gripper_open=True, record = self.record) + if torch.any(success==False): return [] + self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1)) + + # close gripper + jp = self.wait(gripper_open=False, steps=50, record = self.record) + self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.ones((B, 1))], axis=1)) + + # object goal following + self.lift_up(height=self.goal_offset[2], gripper_open=False, steps=8) + # if self.task_type == "simple_pick_place" or self.task_type == "simple_pick": + # jp, is_success = self.follow_object_centers(jp, sample_step=1, visualize=True) + # elif self.task_type == "targetted_pick_place": + # jp, is_success = 
self.follow_object_goals(jp, sample_step=1, visualize=True)
+        # else:
+        #     raise ValueError(f"[ERR] Invalid task type: {self.task_type}")
+        jp, is_success = self.follow_object_goals(jp, sample_step=1, visualize=True)
+
+        # optionally tighten with the full verifier: is_success &= self.is_success()
+        # Collect only the successful env ids as a list.
+        is_success = torch.tensor(is_success, dtype=torch.bool, device=self.sim.device)
+        success_env_ids = torch.where(is_success)[0].cpu().numpy().tolist()
+
+        print(f"[INFO] success_env_ids: {success_env_ids}")
+        if self.record:
+            self.save_data(ignore_keys=["segmask", "depth"], env_ids=success_env_ids, export_hdf5=True)
+
+        return success_env_ids
+
+    def run_batch_trajectory(self, traj_cfg_list: List[TrajectoryCfg]):
+        self.traj_cfg_list = traj_cfg_list
+        self.compute_components()
+        self.compute_object_goal_traj()
+
+        return self.inference()
+
+
+# ──────────────────────────── Entry Point ────────────────────────────
+
+def sim_randomize_rollout(keys: list[str], args_cli: argparse.Namespace):
+    for key in keys:
+        # Load config from running_cfg; CLI args override when present.
+        rollout_cfg = get_rollout_config(key)
+        randomizer_cfg = get_randomizer_config(key)
+
+        total_require_traj_num = args_cli.total_num if args_cli.total_num is not None else rollout_cfg.total_num
+        num_envs = args_cli.num_envs if args_cli.num_envs is not None else rollout_cfg.num_envs
+
+        print(f"[INFO] Using config for key '{key}': num_envs={num_envs}, total_num={total_require_traj_num}")
+        print(f"[INFO] Randomizer config: {randomizer_cfg.to_kwargs()}")
+
+        success_trajectory_config_list = []
+        task_json_path = BASE_DIR / "tasks" / key / "task.json"
+        task_cfg = load_task_cfg(task_json_path)
+        randomizer = Randomizer(task_cfg)
+
+        # Use randomizer config from running_cfg
+        randomizer_kwargs = randomizer_cfg.to_kwargs()
+        random_task_cfg_list = randomizer.generate_randomized_scene_cfg(**randomizer_kwargs)
+
+        args_cli.key = key
+        sim_cfgs = load_sim_parameters(BASE_DIR, key)
+
+        data_dir = BASE_DIR / "h5py" / key
+        current_timestep = 0
+        env, _ = make_env(
+            cfgs=sim_cfgs, num_envs=num_envs,
+            device=args_cli.device,
+            bg_simplify=False,
+        )
+        sim, scene = env.sim, env.scene
+
+        my_sim = RandomizeExecution(sim, scene, sim_cfgs=sim_cfgs, data_dir=data_dir, record=True, args_cli=args_cli)
+        my_sim.task_cfg = task_cfg
+        while len(success_trajectory_config_list) < total_require_traj_num:
+            traj_cfg_list = random_task_cfg_list[current_timestep: current_timestep + num_envs]
+            current_timestep += num_envs
+            if len(traj_cfg_list) == 0:
+                print("[WARN] exhausted randomized configs before reaching total_num.")
+                break
+
+            success_env_ids = my_sim.run_batch_trajectory(traj_cfg_list)
+
+            for env_id in success_env_ids:
+                success_trajectory_config_list.append(traj_cfg_list[env_id])
+
+            print(f"[INFO] collected {len(success_trajectory_config_list)}/{total_require_traj_num} successful trajectories")
+
+        # close the environment only after all batches for this key are done
+        env.close()
+
+        add_generated_trajectories(task_cfg, success_trajectory_config_list, task_json_path.parent)
+
+    return success_trajectory_config_list
+
+def main():
+    base_dir = Path.cwd()
+    cfg = yaml.safe_load((base_dir / "config" /
"config.yaml").open("r")) + keys = cfg["keys"] + sim_randomize_rollout(keys, args_cli) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + import traceback + traceback.print_exc() + sys.exit(1) + finally: + simulation_app.close() \ No newline at end of file diff --git a/openreal2sim/simulation/isaaclab/demo/tools/demo_judge.py b/openreal2sim/simulation/isaaclab/demo/tools/demo_judge.py new file mode 100644 index 0000000..e69de29 diff --git a/openreal2sim/simulation/isaaclab/demo/tools/grasp_judge.py b/openreal2sim/simulation/isaaclab/demo/tools/grasp_judge.py new file mode 100644 index 0000000..e69de29 diff --git a/openreal2sim/simulation/isaaclab/sim_base.py b/openreal2sim/simulation/isaaclab/sim_base.py index 987a0f2..1970172 100755 --- a/openreal2sim/simulation/isaaclab/sim_base.py +++ b/openreal2sim/simulation/isaaclab/sim_base.py @@ -3,17 +3,53 @@ import json import os import random +import shutil from pathlib import Path -from typing import Any, Dict, List, Optional - - -import curobo +from typing import Any, Optional, Dict, Sequence, Tuple +from typing import List +import copy +import numpy as np +import torch import imageio +import cv2 +import h5py +import sys +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) +from envs.task_cfg import CameraInfo, TaskCfg, TrajectoryCfg # Isaac Lab import isaaclab.sim as sim_utils -import numpy as np -import torch +from isaaclab.sensors.camera import Camera +from isaaclab.managers import SceneEntityCfg +from isaaclab.controllers import DifferentialIKController, DifferentialIKControllerCfg +from isaaclab.utils.math import subtract_frame_transforms, transform_points, unproject_depth +from isaaclab.devices import Se3Keyboard, Se3SpaceMouse, Se3Gamepad +from isaaclab.markers import VisualizationMarkers, VisualizationMarkersCfg +from isaaclab.utils.assets import ISAAC_NUCLEUS_DIR + + # Curobo imports are done lazily in prepare_curobo() to avoid warp/inspect issues with isaaclab namespace package +# Patch inspect.getfile to handle namespace packages (like isaaclab) +# This is needed because warp's set_module_options() uses inspect.stack() which tries to get file paths +# for all modules in the call stack, including namespace packages that don't have a __file__ attribute +import inspect +_original_getfile = inspect.getfile +def _patched_getfile(object): + """Patched getfile that handles namespace packages.""" + try: + return _original_getfile(object) + except TypeError as e: + if "is a built-in module" in str(e) or "namespace" in str(e).lower(): + # For namespace packages, return a dummy path to avoid errors + # This allows warp's inspect.stack() to work even when isaaclab is in the call stack + if hasattr(object, '__name__'): + return f'' + return '' + raise +inspect.getfile = _patched_getfile + +import curobo from curobo.types.base import TensorDeviceType from curobo.types.math import Pose from curobo.types.robot import JointState, RobotConfig @@ -87,6 +123,10 @@ def __init__( set_physics_props: bool = True, enable_motion_planning: bool = True, debug_level: int = 1, + demo_dir: Optional[Path] = None, + data_dir: Optional[Path] = None, + task_cfg: Optional[TaskCfg] = None, + traj_cfg_list: Optional[List[TrajectoryCfg]] = None, ) -> None: # basic simulation setup self.sim: sim_utils.SimulationContext = sim @@ -102,7 +142,11 @@ def __init__( self.cam_dict = cam_dict self.out_dir: Path = out_dir self.img_folder: str = img_folder - + + + self.data_dir: 
Optional[Path] = data_dir + if self.data_dir is not None: + self.data_dir.mkdir(parents=True, exist_ok=True) # scene entities self.robot = scene["robot"] if robot_pose.ndim == 1: @@ -114,13 +158,19 @@ def __init__( f"robot_pose must be [B,7], got {robot_pose.shape}" ) self.robot_pose = robot_pose.to(self.robot.device).contiguous() - - self.object_prim = scene["object_00"] - self.other_object_prims = [ - scene[key] - for key in scene.keys() - if f"object_" in key and key != "object_00" - ] + self.task_cfg = task_cfg + self.traj_cfg_list = traj_cfg_list + # Get object prim based on selected_object_id + # Default to object_00 for backward compatibility + # Note: selected_object_id is set in subclasses after super().__init__() + # So we use a helper method that can be called later + self._selected_object_id = None # Will be set by subclasses + self.object_prim = scene["object_00"] # Default, will be updated if needed + self._update_object_prim() + + # Get all other object prims (excluding the main object) + self.other_object_prims = [scene[key] for key in scene.keys() + if f"object_" in key and key != "object_00"] self.background_prim = scene["background"] self.camera: Camera = scene["camera"] @@ -198,21 +248,19 @@ def __init__( self.count = 0 self.demo_id = 0 self.save_dict = { - "rgb": [], - "depth": [], - "segmask": [], - "joint_pos": [], - "joint_vel": [], - "actions": [], - "gripper_pos": [], - "gripper_cmd": [], + "rgb": [], "depth": [], "segmask": [], + "joint_pos": [], "joint_vel": [], "actions": [], + "gripper_pos": [], "gripper_cmd": [], "ee_pose_cam": [], + "composed_rgb": [] # composed rgb image with background and foreground } # visualization self.selected_object_id = 0 + self._selected_object_id = 0 self.debug_level = debug_level self.goal_vis_list = [] + if self.debug_level > 0: for b in range(self.num_envs): cfg = VisualizationMarkersCfg( @@ -235,9 +283,27 @@ def __init__( print(f"prepare curobo motion planning: {enable_motion_planning}") self.prepare_curobo() print("curobo motion planning ready.") + + def _update_object_prim(self): + """Update object_prim based on selected_object_id. Called after selected_object_id is set.""" + if self._selected_object_id is None: + return + try: + from sim_env_factory import get_prim_name_from_oid + oid_str = str(self._selected_object_id) + prim_name = get_prim_name_from_oid(oid_str) + if prim_name in self.scene: + self.object_prim = self.scene[prim_name] + # Update other_object_prims + self.other_object_prims = [self.scene[key] for key in self.scene.keys() + if f"object_" in key and key != prim_name] + except (ImportError, ValueError, KeyError) as e: + # Fallback to object_00 if mapping not available + pass # -------- Curobo Motion Planning ---------- def prepare_curobo(self): + setup_curobo_logger("error") # tensor_args = TensorDeviceType() tensor_args = TensorDeviceType(device=self.sim.device, dtype=torch.float32) @@ -291,113 +357,312 @@ def _traj_to_BT7(self, traj): return flat.view(B, T, 7).contiguous() # ---------- Planning / Execution (Single) ---------- + def reinitialize_motion_gen(self): + """ + Reinitialize the motion generation object. + Call this after a crash to restore a clean state. 
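+        Clears the CUDA cache and rebuilds MotionGen from curobo's bundled
+        franka.yml config; returns True on success and False otherwise.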
+ """ + print("[INFO] Reinitializing motion planner...") + try: + # Clear CUDA cache + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Recreate the motion planner + from curobo.types.base import TensorDeviceType + from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig + from curobo.util_file import get_robot_configs_path, join_path, load_yaml + from curobo.types.robot import RobotConfig + import curobo + + tensor_args = TensorDeviceType(device=self.sim.device, dtype=torch.float32) + curobo_path = curobo.__file__.split("/__init__")[0] + robot_file = f"{curobo_path}/content/configs/robot/franka.yml" + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg=robot_file, + world_model=None, + tensor_args=tensor_args, + interpolation_dt=self.sim_dt, + use_cuda_graph=True if self.num_envs == 1 else False, + ) + + self.motion_gen = MotionGen(motion_gen_config) + + if self.num_envs == 1: + self.motion_gen.warmup(enable_graph=True) + + print("[INFO] Motion planner reinitialized successfully") + return True + + except Exception as e: + print(f"[ERROR] Failed to reinitialize motion planner: {e}") + return False + + def motion_planning_single( - self, position, quaternion, max_attempts=1, use_graph=True + self, position, quaternion, max_attempts=1, use_graph=True, max_retries=1 ): """ - single environment planning: prefer plan_single (supports graph / CUDA graph warmup better). - Returns [1, T, 7], returns None on failure. + Single environment planning with automatic recovery from crashes. + Returns None on complete failure to signal restart needed. """ - # current joint position joint_pos0 = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids][ 0:1 - ].contiguous() # [1,7] - start_state = JointState.from_position(joint_pos0) - - # goal (ensure [1,3]/[1,4]) + ].contiguous() + pos_b, quat_b = self._ensure_batch_pose(position, quaternion) pos_b = pos_b[0:1] quat_b = quat_b[0:1] - goal_pose = Pose(position=pos_b, quaternion=quat_b) - - plan_cfg = MotionGenPlanConfig( - max_attempts=max_attempts, enable_graph=use_graph - ) - - result = self.motion_gen.plan_single(start_state, goal_pose, plan_cfg) - - traj = result.get_interpolated_plan() # JointState - - if result.success[0] == True: - T = traj.position.shape[-2] - BT7 = ( - traj.position.to(self.sim.device).to(torch.float32).unsqueeze(0) - ) # [1,T,7] - else: - print(f"[WARN] motion planning failed.") - BT7 = joint_pos0.unsqueeze(1) # [1,1,7] - - return BT7, result.success + + for retry in range(max_retries): + try: + start_state = JointState.from_position(joint_pos0) + goal_pose = Pose(position=pos_b, quaternion=quat_b) + plan_cfg = MotionGenPlanConfig( + max_attempts=max_attempts, enable_graph=use_graph + ) + + result = self.motion_gen.plan_single(start_state, goal_pose, plan_cfg) + + # Check if result is valid + if result is None: + print(f"[ERROR] Motion planning returned None result on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + break + + traj = result.get_interpolated_plan() + + # Check if trajectory is valid + if traj is None: + print(f"[ERROR] Motion planning returned None trajectory on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + break + + if result.success[0] == True: + BT7 = ( + 
traj.position.to(self.sim.device).to(torch.float32).unsqueeze(0) + ) + else: + print(f"[WARN] Motion planning failed.") + BT7 = joint_pos0.unsqueeze(1) + + return BT7, result.success + + except AttributeError as e: + print(f"[ERROR] Motion planner crash on attempt {retry+1}/{max_retries}: {e}") + + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + else: + break + else: + print("[ERROR] Max retries reached") + + except Exception as e: + # Safe error message extraction + try: + error_msg = str(e) + error_type = type(e).__name__ + except: + error_msg = "Unknown error" + error_type = "Exception" + + print(f"[ERROR] Unexpected error: {error_type}: {error_msg}") + + # Check for recoverable errors + is_recoverable = False + try: + is_recoverable = ("cuda graph" in error_msg.lower() or + "NoneType" in error_msg or + "has no len()" in error_msg) + except: + pass + + if retry < max_retries - 1 and is_recoverable: + if self.reinitialize_motion_gen(): + continue + break + + # Complete failure - return dummy trajectory with False success + print("[ERROR] Motion planning failed critically - returning dummy trajectory") + joint_pos0 = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids][0:1].contiguous() + # Return current position as 1-step trajectory with failure + dummy_traj = joint_pos0.unsqueeze(1) # (1, 1, dof) + dummy_success = torch.zeros(1, dtype=torch.bool, device=self.sim.device) + return dummy_traj, dummy_success # ---------- Planning / Execution (Batched) ---------- + + def motion_planning_batch( - self, position, quaternion, max_attempts=1, allow_graph=False + self, position, quaternion, max_attempts=1, allow_graph=False, max_retries=1 ): """ - multi-environment planning: use plan_batch. - Default require_all=True: if any env fails, return None (keep your original semantics). - Returns [B, T, 7]. + Multi-environment planning with automatic recovery from crashes. + Returns None on complete failure to signal restart needed. 
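+        (In practice a dummy one-step trajectory with an all-False success mask is
+        returned on complete failure, so callers can branch on `success` rather
+        than checking for None.)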
""" B = self.scene.num_envs joint_pos = self.robot.data.joint_pos[ :, self.robot_entity_cfg.joint_ids - ].contiguous() # [B,7] - start_state = JointState.from_position(joint_pos) - - pos_b, quat_b = self._ensure_batch_pose(position, quaternion) # [B,3], [B,4] - goal_pose = Pose(position=pos_b, quaternion=quat_b) - - plan_cfg = MotionGenPlanConfig( - max_attempts=max_attempts, enable_graph=allow_graph - ) - - result = self.motion_gen.plan_batch(start_state, goal_pose, plan_cfg) - - try: - paths = result.get_paths() # List[JointState] - T_max = 1 - for i, p in enumerate(paths): - if not result.success[i]: - print(f"[WARN] motion planning failed for env {i}.") - else: - T_max = max(T_max, int(p.position.shape[-2])) - dof = joint_pos.shape[-1] - BT7 = torch.zeros( - (B, T_max, dof), device=self.sim.device, dtype=torch.float32 - ) - for i, p in enumerate(paths): - if result.success[i] == False: - BT7[i, :, :] = ( - joint_pos[i : i + 1, :].unsqueeze(1).repeat(1, T_max, 1) - ) + ].contiguous() + + pos_b, quat_b = self._ensure_batch_pose(position, quaternion) + + for retry in range(max_retries): + try: + # Attempt planning + start_state = JointState.from_position(joint_pos) + goal_pose = Pose(position=pos_b, quaternion=quat_b) + plan_cfg = MotionGenPlanConfig( + max_attempts=max_attempts, enable_graph=allow_graph + ) + + try: + result = self.motion_gen.plan_batch(start_state, goal_pose, plan_cfg) + except Exception as plan_err: + print(f"[ERROR] curobo.plan_batch raised exception: {plan_err}") + raise plan_err + + # Check if result is valid + if result is None: + print(f"[ERROR] Motion planning returned None result on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + break + + # Process results + paths = result.get_paths() + + # Check if paths is valid - use try-except to safely check if it's iterable + paths_valid = False + try: + if paths is not None: + # Try to get length to verify it's iterable and not empty + _ = len(paths) + if len(paths) > 0: + paths_valid = True + except: + pass + + if not paths_valid: + print(f"[ERROR] Motion planning returned invalid paths on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + else: + print("[ERROR] Failed to recover motion planner") + # Skip to next retry iteration or exit loop + continue + + # Double-check paths is still valid (defensive programming) + if paths is None: + print(f"[ERROR] paths became None after validation check") + continue + + T_max = 1 + + # Check if result.success is valid + if result.success is None: + print(f"[WARN] result.success is None. 
+                    # Create a dummy failure tensor
+                    result.success = torch.zeros(B, dtype=torch.bool, device=self.sim.device)
+
+                try:
+                    for i, p in enumerate(paths):
+                        if not result.success[i]:
+                            print(f"[WARN] Motion planning failed for env {i}.")
+                        else:
+                            T_max = max(T_max, int(p.position.shape[-2]))
+                except TypeError as te:
+                    print(f"[ERROR] TypeError when processing paths: {te}")
+                    print(f"[DEBUG] paths type: {type(paths)}, paths value: {paths}")
+                    continue
+
+                dof = joint_pos.shape[-1]
+                BT7 = torch.zeros(
+                    (B, T_max, dof), device=self.sim.device, dtype=torch.float32
+                )
+
+                for i, p in enumerate(paths):
+                    if not result.success[i]:
+                        BT7[i, :, :] = (
+                            joint_pos[i : i + 1, :].unsqueeze(1).repeat(1, T_max, 1)
+                        )
+                    else:
+                        Ti = p.position.shape[-2]
+                        BT7[i, :Ti, :] = p.position.to(self.sim.device).to(torch.float32)
+                        if Ti < T_max:
+                            BT7[i, Ti:, :] = BT7[i, Ti - 1 : Ti, :]
+
+                success = result.success if result.success is not None else torch.zeros(
+                    B, dtype=torch.bool, device=self.sim.device
+                )
+
+                # Success! Return the trajectory
+                return BT7, success
+
+            except AttributeError as e:
+                print(f"[ERROR] Motion planner crash on attempt {retry+1}/{max_retries}: {e}")
+
+                if retry < max_retries - 1:
+                    if self.reinitialize_motion_gen():
+                        print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...")
+                        continue
+                    else:
+                        print("[ERROR] Failed to recover motion planner")
+                        break
                else:
-                    Ti = p.position.shape[-2]
-                    BT7[i, :Ti, :] = p.position.to(self.sim.device).to(torch.float32)
-                    if Ti < T_max:
-                        BT7[i, Ti:, :] = BT7[i, Ti - 1 : Ti, :]
-        except Exception as e:
-            print(f"[WARN] motion planning all failed with exception: {e}")
-            success = torch.zeros(
-                B, dtype=torch.bool, device=self.sim.device
-            )  # set to all false
-            BT7 = joint_pos.unsqueeze(1)  # [B,1,7]
-
-        # check exceptions
-        if result.success is None or result.success.shape[0] != B:
-            print(f"[WARN] motion planning success errors: {result.success}")
-            success = torch.zeros(
-                B, dtype=torch.bool, device=self.sim.device
-            )  # set to all false
-            BT7 = joint_pos.unsqueeze(1)  # [B,1,7]
-        else:
-            success = result.success
-        if BT7.shape[0] != B or BT7.shape[2] != joint_pos.shape[1]:
-            print(
-                f"[WARN] motion planning traj dim mismatch: {BT7.shape} vs {[B, 'T', joint_pos.shape[1]]}"
-            )
-            BT7 = joint_pos.unsqueeze(1)  # [B,1,7]
+                    print("[ERROR] Max retries reached, motion planning failed critically")
+
+            except Exception as e:
+                # Safe error message extraction
+                try:
+                    error_msg = str(e)
+                    error_type = type(e).__name__
+                except Exception:
+                    error_msg = "Unknown error"
+                    error_type = "Exception"
+
+                print(f"[ERROR] Unexpected error in motion planning: {error_type}: {error_msg}")
+
+                # Check for recoverable errors
+                is_recoverable = False
+                try:
+                    is_recoverable = ("cuda graph" in error_msg.lower() or
+                                      "NoneType" in error_msg or
+                                      "has no len()" in error_msg)
+                except Exception:
+                    pass
+
+                if retry < max_retries - 1 and is_recoverable:
+                    if self.reinitialize_motion_gen():
+                        print(f"[INFO] Retrying after error (attempt {retry+2}/{max_retries})...")
+                        continue
+                break
+
+        # If we get here, all retries failed - return a dummy trajectory with all-False success
+        print("[ERROR] Motion planning failed critically - returning dummy trajectory")
+        B = self.scene.num_envs
+        joint_pos = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids].contiguous()
+        dof = joint_pos.shape[-1]
+        # Return current position as a 1-step trajectory with all failures
+        dummy_traj = joint_pos.unsqueeze(1)  # (B, 1, dof)
+        dummy_success = torch.zeros(B, dtype=torch.bool, device=self.sim.device)
+        return dummy_traj, dummy_success
+
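Because the batched planner now degrades to a one-step dummy trajectory instead of returning None, callers can branch on the success mask alone. A minimal consumption sketch; `planner`, the goal tensors, and `apply_joint_targets` are hypothetical stand-ins, not names from this diff:

    # Sketch: plan for all envs, then only advance the ones that succeeded.
    trajs, success = planner.motion_planning_batch(goal_pos, goal_quat, max_retries=3)
    if not success.any():
        print("[WARN] no env produced a plan; skipping this step")
    else:
        ok = success.nonzero(as_tuple=True)[0]
        for t in range(trajs.shape[1]):
            # Failed envs hold their current joints in trajs anyway, so stepping
            # all envs with trajs[:, t] is also safe if masking is inconvenient.
            apply_joint_targets(trajs[ok, t], env_ids=ok)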
-        return BT7, success

    def motion_planning(self, position, quaternion, max_attempts=1):
        if self.scene.num_envs == 1:
@@ -433,98 +698,6 @@ def move_to_motion_planning(
            )
            last = joint_pos_des
        return last, success

-    def compose_real_video(self, env_id: int = 0):
-        """
-        Composite simulated video onto real background using mask-based rendering.
-
-        Args:
-            env_id: Environment ID to process
-        """
-
-        def pad_to_even(frame):
-            """Pad frame to even dimensions for video encoding."""
-            H, W = frame.shape[:2]
-            pad_h = H % 2
-            pad_w = W % 2
-            if pad_h or pad_w:
-                pad = ((0, pad_h), (0, pad_w)) if frame.ndim == 2 else ((0, pad_h), (0, pad_w), (0, 0))
-                frame = np.pad(frame, pad, mode='edge')
-            return frame
-
-        # Construct paths
-        base_path = self.out_dir / self.img_folder
-        demo_path = self._demo_dir() / f"env_{env_id:03d}"
-
-        SIM_VIDEO_PATH = demo_path / "sim_video.mp4"
-        MASK_VIDEO_PATH = demo_path / "mask_video.mp4"
-        REAL_BACKGROUND_PATH = base_path / "simulation" / "background.jpg"
-        OUTPUT_PATH = demo_path / "real_video.mp4"
-
-        # Check if required files exist
-        if not SIM_VIDEO_PATH.exists():
-            print(f"[WARN] Simulated video not found: {SIM_VIDEO_PATH}")
-            return False
-        if not MASK_VIDEO_PATH.exists():
-            print(f"[WARN] Mask video not found: {MASK_VIDEO_PATH}")
-            return False
-        if not REAL_BACKGROUND_PATH.exists():
-            print(f"[WARN] Real background not found: {REAL_BACKGROUND_PATH}")
-            return False
-
-        # Load real background image
-        print(f"[INFO] Loading real background: {REAL_BACKGROUND_PATH}")
-        real_img = np.array(Image.open(REAL_BACKGROUND_PATH))
-
-        H, W, _ = real_img.shape
-        print(f"[INFO] Background size: {W}x{H}")
-
-        # Open videos
-        print(f"[INFO] Loading simulated video: {SIM_VIDEO_PATH}")
-        rgb_reader = imageio.get_reader(SIM_VIDEO_PATH)
-        print(f"[INFO] Loading mask video: {MASK_VIDEO_PATH}")
-        mask_reader = imageio.get_reader(MASK_VIDEO_PATH)
-
-        # Get metadata from original video
-        N = rgb_reader.count_frames()
-        fps = rgb_reader.get_meta_data()['fps']
-        print(f"[INFO] Processing {N} frames at {fps} FPS...")
-
-        composed_images = []
-
-        for i in range(N):
-            # Read frames
-            sim_rgb = rgb_reader.get_data(i)
-            sim_mask = mask_reader.get_data(i)
-
-            # Convert mask to binary (grayscale > 127 = foreground)
-            if sim_mask.ndim == 3:
-                sim_mask = cv2.cvtColor(sim_mask, cv2.COLOR_RGB2GRAY)
-            sim_mask = sim_mask > 127
-
-            # Resize sim frames to match real background if needed
-            if sim_rgb.shape[:2] != (H, W):
-                sim_rgb = cv2.resize(sim_rgb, (W, H))
-                sim_mask = cv2.resize(sim_mask.astype(np.uint8), (W, H)) > 0
-
-            # Compose: real background + simulated foreground (where mask is True)
-            composed = real_img.copy()
-            composed = pad_to_even(composed)
-            composed[sim_mask] = sim_rgb[sim_mask]
-
-            composed_images.append(composed)
-
-            if (i + 1) % 10 == 0:
-                print(f"[INFO] Processed {i + 1}/{N} frames")
-
-        # Save composed video
-        print(f"[INFO] Saving composed video to: {OUTPUT_PATH}")
-        writer = imageio.get_writer(OUTPUT_PATH, fps=fps, macro_block_size=None)
-        for frame in composed_images:
-            writer.append_data(frame)
-        writer.close()
-
-        print(f"[INFO] Done! Saved {len(composed_images)} frames to {OUTPUT_PATH} at {fps} FPS")
-        return True
    # ---------- Visualization ----------
    def show_goal(self, pos, quat):
        """
@@ -745,6 +918,7 @@ def reset(self, env_ids=None):
            env_origins_robot = env_origins.to(self.robot.device)  # (M,3)
            robot_pose_world = rp_local.clone()
            robot_pose_world[:, :3] = env_origins_robot + robot_pose_world[:, :3]
+            #print(f"[INFO] robot_pose_world: {robot_pose_world}")
            self.robot.write_root_pose_to_sim(robot_pose_world, env_ids=env_ids_t)
            self.robot.write_root_velocity_to_sim(
                torch.zeros((M, 6), device=self.robot.device, dtype=torch.float32),
@@ -854,98 +1028,268 @@ def record_data(self, obs: Dict[str, torch.Tensor]):
        self.save_dict["gripper_pos"].append(obs["gripper_pos"].cpu().numpy())  # [B,3]
        self.save_dict["gripper_cmd"].append(obs["gripper_cmd"].cpu().numpy())  # [B,1]
        self.save_dict["joint_vel"].append(obs["joint_vel"].cpu().numpy())
-
+        self.save_dict["ee_pose_cam"].append(obs["ee_pose_cam"].cpu().numpy())
+
    def clear_data(self):
        for key in self.save_dict.keys():
            self.save_dict[key] = []
-
-    def _demo_dir(self) -> Path:
-        return self.out_dir / self.img_folder / "demos" / f"demo_{self.demo_id}"
-
+
    def _env_dir(self, base: Path, b: int) -> Path:
        d = base / f"env_{b:03d}"
        d.mkdir(parents=True, exist_ok=True)
        return d
+
+    def _get_next_demo_dir(self, base: Path) -> Path:
+        already_existing_num = len(list(base.iterdir()))
+        return base / f"demo_{already_existing_num:03d}.mp4"

-    def save_data(self, ignore_keys: List[str] = []):
-        save_root = self._demo_dir()
-        save_root.mkdir(parents=True, exist_ok=True)
-
+    def save_data(self, ignore_keys: List[str] = [], env_ids: Optional[List[int]] = None, export_hdf5: bool = False, save_other_things: bool = False):
        stacked = {k: np.array(v) for k, v in self.save_dict.items()}
+        if env_ids is None:
+            env_ids = self._all_env_ids.cpu().numpy()
+        video_dir = self.out_dir / self.img_folder / "videos"
+        video_dir.mkdir(parents=True, exist_ok=True)
+        composed_rgb = copy.deepcopy(stacked["rgb"])
+        self.save_dict["composed_rgb"] = composed_rgb
+        hdf5_names = []
+        video_paths = []
+        for b in env_ids:
+            demo_path = self._get_next_demo_dir(video_dir)
+            hdf5_names.append(demo_path.name.replace(".mp4", ""))
+            if save_other_things:
+                demo_dir = self.out_dir / self.img_folder / "demos"
+                # _get_next_demo_dir returns a *.mp4 path; strip the suffix to get
+                # a folder name (Path.replace() would rename on disk, so use
+                # with_suffix instead).
+                demo_dir = self._get_next_demo_dir(demo_dir).with_suffix("")
+                env_dir = self._env_dir(demo_dir, b)
+                for key, arr in stacked.items():
+                    if key in ignore_keys:  # skip the keys for storage
+                        continue
+                    if key == "rgb":
+                        video_path = env_dir / "sim_video.mp4"
+                        writer = imageio.get_writer(
+                            video_path, fps=50, macro_block_size=None
+                        )
+                        for t in range(arr.shape[0]):
+                            writer.append_data(arr[t, b])
+                        writer.close()
+
+                    elif key == "segmask":
+                        video_path = env_dir / "mask_video.mp4"
+                        writer = imageio.get_writer(
+                            video_path, fps=50, macro_block_size=None
+                        )
+                        for t in range(arr.shape[0]):
+                            writer.append_data((arr[t, b].astype(np.uint8) * 255))
+                        writer.close()
+                    elif key == "depth":
+                        depth_seq = arr[:, b]
+                        flat = depth_seq[depth_seq > 0]
+                        max_depth = np.percentile(flat, 99) if flat.size > 0 else 1.0
+                        depth_norm = np.clip(depth_seq / max_depth * 255.0, 0, 255).astype(
+                            np.uint8
+                        )
+                        video_path = env_dir / "depth_video.mp4"
+                        writer = imageio.get_writer(
+                            video_path, fps=50, macro_block_size=None
+                        )
+                        for t in range(depth_norm.shape[0]):
+                            writer.append_data(depth_norm[t])
+                        writer.close()
+                        np.save(env_dir / f"{key}.npy", depth_seq)
+                    elif key != "composed_rgb":
+                        np.save(env_dir / f"{key}.npy", arr[:, b])
+
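+            # Compose the final demo clip: overlay each simulated frame onto the
+            # real background wherever the segmentation mask is set, so the video
+            # keeps real-scene context around the rendered robot and objects.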
+            writer = imageio.get_writer(demo_path, fps=50, macro_block_size=None)
+            rgb = np.array(self.save_dict["rgb"])
+            mask = np.array(self.save_dict["segmask"])
+            bg_rgb_path = self.task_cfg.background_cfg.background_rgb_path
+            self.bg_rgb = imageio.imread(bg_rgb_path)
+            for t in range(rgb.shape[0]):
+                composed = self.convert_real(mask[t, b], self.bg_rgb, rgb[t, b])
+                self.save_dict["composed_rgb"][t, b] = composed
+                writer.append_data(composed)
+            writer.close()
+            video_paths.append(str(demo_path))
+            print(f"[INFO]: Demonstration is saved at: {demo_path}")
+        if export_hdf5:
+            self.export_batch_data_to_hdf5(hdf5_names, video_paths)
+
+    def export_batch_data_to_hdf5(self, hdf5_names: List[str], video_paths: List[str]) -> int:
+        """Export buffered trajectories to RoboTwin-style HDF5 episodes."""
+        if self.data_dir is not None:
+            target_root = self.data_dir
+        else:
+            # _demo_dir() no longer exists (removed above); keep the export
+            # rooted under the image folder instead.
+            target_root = self.out_dir / self.img_folder / "hdf5"
+        data_dir = Path(target_root)
+        data_dir.mkdir(parents=True, exist_ok=True)
-        for b in range(self.num_envs):
-            env_dir = self._env_dir(save_root, b)
-            for key, arr in stacked.items():
-                if key in ignore_keys:  # skip the keys for storage
-                    continue
-                if key == "rgb":
-                    video_path = env_dir / "sim_video.mp4"
-                    writer = imageio.get_writer(
-                        video_path, fps=50, macro_block_size=None
+
+        num_envs = len(hdf5_names)
+        stacked = {k: np.array(v) for k, v in self.save_dict.items()}
+
+        episode_names = []
+        for idx, name in enumerate(hdf5_names):
+            name = str(name)
+            episode_names.append(name.replace("demo_", "episode_"))
+
+        camera_params = self._get_camera_parameters()
+
+        for env_idx, (episode_name, video_path) in enumerate(zip(episode_names, video_paths)):
+            hdf5_path = data_dir / f"{episode_name}.hdf5"
+            hdf5_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with h5py.File(hdf5_path, "w") as f:
+                obs_grp = f.create_group("observation")
+                camera_group_name = "head_camera" if camera_params is not None else "camera"
+                cam_grp = obs_grp.create_group(camera_group_name)
+                if camera_params is not None:
+                    intrinsics, extrinsics, resolution = camera_params
+                    cam_grp.create_dataset("intrinsics", data=intrinsics)
+                    cam_grp.create_dataset("extrinsics", data=extrinsics)
+                    cam_grp.attrs["resolution"] = resolution
+
+                # Read video frames and encode using JPEG compression (RoboTwin-style)
+                video_reader = imageio.get_reader(video_path)
+                rgb_frames = []
+                for frame in video_reader:
+                    rgb_frames.append(frame)
+                video_reader.close()
+                rgb_frames = np.array(rgb_frames)  # (T, H, W, 3)
+
+                # Encode frames using JPEG compression (similar to RoboTwin's images_encoding).
+                # Note: frames are RGB while cv2 assumes BGR; a cv2 decode returns
+                # the bytes in the same channel order they are stored here.
+                encode_data = []
+                max_len = 0
+                for i in range(len(rgb_frames)):
+                    success, encoded_image = cv2.imencode(".jpg", rgb_frames[i])
+                    if not success:
+                        raise RuntimeError(f"Failed to encode frame {i}")
+                    jpeg_data = encoded_image.tobytes()
+                    encode_data.append(jpeg_data)
+                    max_len = max(max_len, len(jpeg_data))
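+                # HDF5 fixed-length byte datasets require equal-length entries, so
+                # every JPEG blob below is NUL-padded to the longest frame; readers
+                # can strip the padding (JPEG decoders stop at the end-of-image
+                # marker anyway).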
"joint_vel", data=stacked["joint_vel"][:, env_idx].astype(np.float32) ) - for t in range(arr.shape[0]): - writer.append_data(arr[t, b]) - writer.close() - elif key == "segmask": - video_path = env_dir / "mask_video.mp4" - writer = imageio.get_writer( - video_path, fps=50, macro_block_size=None + if "gripper_cmd" in stacked: + joint_grp.create_dataset( + "gripper_cmd", data=stacked["gripper_cmd"][:, env_idx].astype(np.float32) ) - for t in range(arr.shape[0]): - writer.append_data((arr[t, b].astype(np.uint8) * 255)) - writer.close() - elif key == "depth": - depth_seq = arr[:, b] - flat = depth_seq[depth_seq > 0] - max_depth = np.percentile(flat, 99) if flat.size > 0 else 1.0 - depth_norm = np.clip(depth_seq / max_depth * 255.0, 0, 255).astype( - np.uint8 + if len(joint_grp.keys()) == 0: + del f["joint_action"] + + if "gripper_pos" in stacked: + endpose_grp = f.create_group("endpose") + endpose_grp.create_dataset( + "gripper_pos", data=stacked["gripper_pos"][:, env_idx].astype(np.float32) ) - video_path = env_dir / "depth_video.mp4" - writer = imageio.get_writer( - video_path, fps=50, macro_block_size=None + if "gripper_cmd" in stacked: + endpose_grp.create_dataset( + "gripper_cmd", data=stacked["gripper_cmd"][:, env_idx].astype(np.float32) + ) + + if "actions" in stacked: + f.create_dataset( + "actions", data=stacked["actions"][:, env_idx].astype(np.float32) ) - for t in range(depth_norm.shape[0]): - writer.append_data(depth_norm[t]) - writer.close() - np.save(env_dir / f"{key}.npy", depth_seq) - else: - np.save(env_dir / f"{key}.npy", arr[:, b]) - json.dump(self.sim_cfgs, open(env_dir / "config.json", "w"), indent=2) - - print("[INFO]: Demonstration is saved at: ", save_root) - - # Compose real videos for all environments - print("\n[INFO] Composing real videos with background...") - for b in range(self.num_envs): - print(f"[INFO] Processing environment {b}/{self.num_envs}...") - success = self.compose_real_video(env_id=b) - if success: - print(f"[INFO] Real video composed successfully for env {b}") + + if "ee_pose_cam" in stacked: + endpose_grp = f.create_group("ee_pose_cam") + endpose_grp.create_dataset( + "ee_pose_cam", data=stacked["ee_pose_cam"][:, env_idx].astype(np.float32) + ) + extras_grp = f.create_group("extras") + if self.task_cfg is not None: + extras_grp.create_dataset("task_desc", data=self.task_cfg.task_desc) + if self.traj_cfg_list is not None: + traj_i = self.traj_cfg_list[env_idx] + traj_grp = extras_grp.create_group("traj") + traj_grp.create_dataset("robot_pose", data=traj_i.robot_pose) + traj_grp.create_dataset("pregrasp_pose", data=traj_i.pregrasp_pose) + traj_grp.create_dataset("grasp_pose", data=traj_i.grasp_pose) + + frame_count = stacked["rgb"].shape[0] + meta_grp = f.create_group("meta") + meta_grp.attrs["env_index"] = int(env_idx) + meta_grp.attrs["frame_dt"] = float(self.sim_dt) + meta_grp.attrs["frame_count"] = int(frame_count) + meta_grp.attrs["source"] = "OpenReal2Sim" + meta_grp.attrs["episode_name"] = episode_name + meta_grp.create_dataset("frame_indices", data=np.arange(frame_count, dtype=np.int32)) + + print(f"[INFO]: Exported {num_envs} HDF5 episodes to {data_dir}") + return num_envs + + + + def convert_real(self,segmask, bg_rgb, fg_rgb): + segmask_2d = segmask[..., 0] + composed = bg_rgb.copy() + composed[segmask_2d] = fg_rgb[segmask_2d] + return composed + + def _quat_to_rot(self, quat: Sequence[float]) -> np.ndarray: + w, x, y, z = quat + rot = np.array( + [ + [1 - 2 * (y ** 2 + z ** 2), 2 * (x * y - z * w), 2 * (x * z + y * w)], + [2 * (x * y + z * 
+    def convert_real(self, segmask, bg_rgb, fg_rgb):
+        segmask_2d = segmask[..., 0]
+        composed = bg_rgb.copy()
+        composed[segmask_2d] = fg_rgb[segmask_2d]
+        return composed
+
+    def _quat_to_rot(self, quat: Sequence[float]) -> np.ndarray:
+        w, x, y, z = quat
+        rot = np.array(
+            [
+                [1 - 2 * (y ** 2 + z ** 2), 2 * (x * y - z * w), 2 * (x * z + y * w)],
+                [2 * (x * y + z * w), 1 - 2 * (x ** 2 + z ** 2), 2 * (y * z - x * w)],
+                [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x ** 2 + y ** 2)],
+            ],
+            dtype=np.float32,
+        )
+        return rot
+
+    def _get_camera_parameters(self) -> Optional[Tuple[np.ndarray, np.ndarray, Tuple[int, int]]]:
+        if self.task_cfg is None:
+            return None
+        camera_info = getattr(self.task_cfg, "camera_info", None)
+        if camera_info is None:
+            return None
+
+        intrinsics = np.array(
+            [
+                [camera_info.fx, 0.0, camera_info.cx],
+                [0.0, camera_info.fy, camera_info.cy],
+                [0.0, 0.0, 1.0],
+            ],
+            dtype=np.float32,
+        )
+
+        if getattr(camera_info, "camera_opencv_to_world", None) is not None:
+            extrinsics = np.array(camera_info.camera_opencv_to_world, dtype=np.float32)
+        else:
+            extrinsics = np.eye(4, dtype=np.float32)
+            if getattr(camera_info, "camera_heading_wxyz", None) is not None:
+                rot = self._quat_to_rot(camera_info.camera_heading_wxyz)
            else:
-                print(f"[WARN] Failed to compose real video for env {b}")
-
-        demo_root = self.out_dir / "all_demos"
-        demo_root.mkdir(parents=True, exist_ok=True)
-        total_demo_id = get_next_demo_id(demo_root)
-        demo_save_path = demo_root / f"demo_{total_demo_id}"
-        demo_save_path.mkdir(parents=True, exist_ok=True)
-        meta_info = {
-            "path": str(save_root),
-            "fps": 50,
-        }
-        with open(demo_save_path / "meta_info.json", "w") as f:
-            json.dump(meta_info, f)
-        os.system(f"cp -r {save_root}/* {demo_save_path}")
-        print("[INFO]: Demonstration is saved at: ", demo_save_path)
-
-    def delete_data(self):
-        save_path = self._demo_dir()
-        failure_root = self.out_dir / self.img_folder / "demos_failures"
-        failure_root.mkdir(parents=True, exist_ok=True)
-        fail_demo_id = get_next_demo_id(failure_root)
-        failure_path = failure_root / f"demo_{fail_demo_id}"
-        os.system(f"mv {save_path} {failure_path}")
-        for key in self.save_dict.keys():
-            self.save_dict[key] = []
-        print("[INFO]: Clear up the folder: ", save_path)
+                rot = np.eye(3, dtype=np.float32)
+            extrinsics[:3, :3] = rot
+            if getattr(camera_info, "camera_position", None) is not None:
+                extrinsics[:3, 3] = np.array(camera_info.camera_position, dtype=np.float32)
+        resolution = (
+            int(camera_info.width),
+            int(camera_info.height),
+        )
+        return intrinsics, extrinsics, resolution
diff --git a/openreal2sim/simulation/isaaclab/sim_env_factory.py b/openreal2sim/simulation/isaaclab/sim_env_factory.py
index 073668b..51523b9 100755
--- a/openreal2sim/simulation/isaaclab/sim_env_factory.py
+++ b/openreal2sim/simulation/isaaclab/sim_env_factory.py
@@ -53,6 +53,8 @@ class SceneCtx:
    obj_physics: List[Dict] = None
    use_ground_plane: bool = False
    ground_z: Optional[float] = None
+    oid_to_index: Optional[Dict[str, int]] = None  # Maps oid (str) to prim index
+    index_to_oid: Optional[Dict[int, str]] = None  # Maps prim index to oid (str)

_SCENE_CTX: Optional[SceneCtx] = None

@@ -264,7 +266,23 @@ def init_scene_from_scene_dict(
        If robot pose not provided, sample one with robot_placement_candidates_v2().
    """
    cam_dict = cfgs["cam_cfg"]
-    obj_paths = [o["usd"] for o in scene["objects"].values()]
+
+    # Sort objects by oid to ensure consistent ordering
+    # scene["objects"] keys are oid strings (e.g., "1", "2", ...)
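+    # (int() is used as the sort key because lexicographic string order would
+    # put "10" before "2".)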
+    sorted_objects = sorted(scene["objects"].items(), key=lambda x: int(x[0]))
+
+    # Build oid <-> index mapping
+    oid_to_index = {}
+    index_to_oid = {}
+    obj_paths = []
+    obj_physics_list = []
+
+    for index, (oid_str, obj_data) in enumerate(sorted_objects):
+        oid_to_index[oid_str] = index
+        index_to_oid[index] = oid_str
+        obj_paths.append(obj_data["usd"])
+        obj_physics_list.append(obj_data.get("physics", None))
+
    background_path = scene["background"]["usd"]

    # overwrite physics
@@ -272,9 +290,9 @@ def init_scene_from_scene_dict(
    obj_physics = cfgs["physics_cfg"]["obj_physics"]
    bg_physics = cfgs["physics_cfg"]["bg_physics"]
    if obj_physics is None:
-        obj_physics = [o.get("physics", None) for o in scene["objects"].values()]
+        obj_physics = obj_physics_list  # Use physics from sorted objects
    elif isinstance(obj_physics, dict):
-        obj_physics = [obj_physics for _ in scene["objects"].values()]
+        obj_physics = [obj_physics for _ in range(len(obj_paths))]
    elif isinstance(obj_physics, list):
        assert len(obj_physics) == len(scene["objects"]), (
            "obj_physics must be a list of the same length as scene['objects'] if provided."
@@ -310,6 +328,8 @@ def init_scene_from_scene_dict(
        obj_physics=list(obj_physics),
        use_ground_plane=use_ground_plane,
        ground_z=ground_z,
+        oid_to_index=oid_to_index,
+        index_to_oid=index_to_oid,
    )

    return {
@@ -395,6 +415,51 @@ def load_scene_json(key: str) -> dict:
    return json.load(open(scene_path))


+def get_prim_name_from_oid(oid: str | int) -> str:
+    """
+    Get the prim name (e.g., "object_00") from object id.
+
+    Args:
+        oid: Object ID as string (e.g., "1") or integer (e.g., 1)
+
+    Returns:
+        Prim name string (e.g., "object_00")
+    """
+    assert _SCENE_CTX is not None, "init_scene_from_scene_dict must be called first."
+    oid_str = str(oid)
+    if _SCENE_CTX.oid_to_index is None:
+        raise ValueError("oid_to_index mapping not available. Scene may not have been initialized from scene dict.")
+    index = _SCENE_CTX.oid_to_index.get(oid_str)
+    if index is None:
+        raise ValueError(f"OID '{oid_str}' not found in scene. Available OIDs: {list(_SCENE_CTX.oid_to_index.keys())}")
+    return f"object_{index:02d}"
+
+
+def get_oid_from_prim_name(prim_name: str) -> str:
+    """
+    Get the object id from prim name (e.g., "object_00" -> "1").
+
+    Args:
+        prim_name: Prim name string (e.g., "object_00")
+
+    Returns:
+        OID as string (e.g., "1")
+    """
+    assert _SCENE_CTX is not None, "init_scene_from_scene_dict must be called first."
+    if not prim_name.startswith("object_"):
+        raise ValueError(f"Invalid prim name format: {prim_name}")
+    try:
+        index = int(prim_name.split("_")[1])
+    except (IndexError, ValueError):
+        raise ValueError(f"Invalid prim name format: {prim_name}")
+    if _SCENE_CTX.index_to_oid is None:
+        raise ValueError("index_to_oid mapping not available. Scene may not have been initialized from scene dict.")
+    oid = _SCENE_CTX.index_to_oid.get(index)
+    if oid is None:
+        raise ValueError(f"Index {index} not found in scene. Available indices: {list(_SCENE_CTX.index_to_oid.keys())}")
+    return oid
+
+
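The two helpers form an inverse pair over the mapping built in init_scene_from_scene_dict; the expected round-trip (with illustrative oids) looks like:

    # After init_scene_from_scene_dict() has seen objects {"1": ..., "2": ...}:
    assert get_prim_name_from_oid("1") == "object_00"
    assert get_prim_name_from_oid(2) == "object_01"    # int oids are accepted too
    assert get_oid_from_prim_name("object_01") == "2"  # inverse direction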
def make_env(
    cfgs: dict,
    num_envs: int = 1,
diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py b/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py
index 49b2f50..2e98438 100755
--- a/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py
+++ b/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py
@@ -9,7 +9,7 @@ AppLauncher.add_app_launcher_args(parser)
args_isaaclab = parser.parse_args()

# Force GUI unless you toggle via IsaacLab flags
-args_isaaclab.headless = False
+args_isaaclab.headless = True

app_launcher = AppLauncher(args_isaaclab)
simulation_app = app_launcher.app
@@ -378,9 +378,11 @@ def count_rigid_api(usd_path):
########################################################
## Per-key conversion (batch)
########################################################
-def run_conversion_for_key(key: str):
+from typing import Optional
+def run_conversion_for_key(key: str, args_cli: Optional[argparse.Namespace] = None):
    global args
-
+    if args_cli is None:
+        args_cli = args_isaaclab
    scene_json = out_dir / key / "simulation" / "scene.json"
    assert scene_json.exists(), f"[{key}] scene.json not found: {scene_json}"
    scene_dict = json.load(open(scene_json, "r"))
@@ -401,7 +403,7 @@ def run_conversion_for_key(key: str):
    args = Args(input=input_list, output=output_list,
                make_instanceable=False, collision_approximation="convexDecomposition",
                mass=1.0, disable_gravity=False, kinematic_enabled=False,
-                headless=args_isaaclab.headless, exit_on_finish=True)
+                headless=args_cli.headless, exit_on_finish=True)

    # Run the original main conversion logic for this key
    log.info(f"[{key}] converting {len(args.input)} assets...")
@@ -552,6 +554,14 @@ def run_conversion(scene_dict: dict):
        log.info(f'Rigid body count: {count_rigid_api(output_path)}')
        log.info(f"Saved USD file to {os.path.abspath(output_path)}")

+
+def usd_conversion(keys, args_cli):
+    for key in keys:
+        log.info(f"\n========== [USD Convert] key: {key} ==========")
+        run_conversion_for_key(key, args_cli)
+    #simulation_app.close()
+    log.info("USD conversion completed.")
+
########################################################
## Batch entry
########################################################
diff --git a/openreal2sim/simulation/isaaclab/sim_trajectory_replay.py b/openreal2sim/simulation/isaaclab/sim_trajectory_replay.py
index 1ed673c..33e5996 100755
--- a/openreal2sim/simulation/isaaclab/sim_trajectory_replay.py
+++ b/openreal2sim/simulation/isaaclab/sim_trajectory_replay.py
@@ -23,7 +23,7 @@ AppLauncher.add_app_launcher_args(parser)
args_cli = parser.parse_args()

args_cli.enable_cameras = True
-args_cli.headless = False  # headless mode for batch execution
+args_cli.headless = True  # headless mode for batch execution
app_launcher = AppLauncher(vars(args_cli))
simulation_app = app_launcher.app
diff --git a/openreal2sim/simulation/isaaclab/sim_utils/grasp_group_utils.py b/openreal2sim/simulation/isaaclab/sim_utils/grasp_group_utils.py
new file mode 100755
index 0000000..950d8f5
--- /dev/null
+++ b/openreal2sim/simulation/isaaclab/sim_utils/grasp_group_utils.py
@@ -0,0 +1,71 @@
+"""
+Light-weight GraspGroup fallback.
+
+If graspnetAPI is installed we reuse its GraspGroup. Otherwise we provide a very
+small compatible subset that can load npy files saved by graspnetAPI and expose
+rotation/translation information for the manipulation scripts.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Sequence, Union
+
+import numpy as np
+
+try:  # pragma: no cover - only executed when dependency exists.
+    from graspnetAPI.grasp import GraspGroup as _ExternalGraspGroup  # type: ignore
+
+    GraspGroup = _ExternalGraspGroup
+except ImportError:
+
+    @dataclass
+    class SimpleGrasp:
+        rotation_matrix: np.ndarray  # (3, 3)
+        translation: np.ndarray  # (3,)
+
+    class GraspGroup:
+        """Minimal GraspGroup replacement supporting npy loading and indexing."""
+
+        def __init__(self, array: np.ndarray | None = None):
+            if array is None:
+                self._array = np.zeros((0, 17), dtype=np.float32)
+            else:
+                self._array = np.asarray(array, dtype=np.float32)
+
+        @property
+        def grasp_group_array(self) -> np.ndarray:
+            return self._array
+
+        def __len__(self) -> int:
+            return len(self._array)
+
+        def __getitem__(
+            self, index: Union[int, slice, Sequence[int], np.ndarray]
+        ) -> Union[SimpleGrasp, "GraspGroup"]:
+            if isinstance(index, int):
+                row = self._array[index]
+                return SimpleGrasp(
+                    rotation_matrix=row[4:13].reshape(3, 3),
+                    translation=row[13:16],
+                )
+            if isinstance(index, slice):
+                return GraspGroup(self._array[index])
+            if isinstance(index, (list, tuple, np.ndarray)):
+                return GraspGroup(self._array[index])
+            raise TypeError(f"Unsupported index type {type(index)!r} for GraspGroup")
+
+        def from_npy(self, npy_file_path: str) -> "GraspGroup":
+            data = np.load(npy_file_path)
+            if data.ndim == 1:
+                data = data.reshape(1, -1)
+            self._array = data.astype(np.float32, copy=False)
+            return self
+
+        # convenience for legacy code expecting classmethod behaviour
+        @classmethod
+        def load(cls, npy_file_path: str) -> "GraspGroup":
+            return cls().from_npy(npy_file_path)
+
+__all__ = ["GraspGroup"]
+
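Either implementation behind this alias is used the same way by the manipulation scripts. A usage sketch, assuming the graspnetAPI (N, 17) array layout with the score in column 0; the .npy path is a placeholder:

    import numpy as np
    from openreal2sim.simulation.isaaclab.sim_utils.grasp_group_utils import GraspGroup

    gg = GraspGroup().from_npy("grasps.npy")     # (N, 17) array saved by graspnetAPI
    print(len(gg))
    g = gg[0]                                    # single grasp
    R, t = g.rotation_matrix, g.translation      # (3, 3) rotation, (3,) translation
    best5 = gg[np.argsort(-gg.grasp_group_array[:, 0])[:5]]  # top-5 by score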
+""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Sequence, Union + +import numpy as np + +try: # pragma: no cover - only executed when dependency exists. + from graspnetAPI.grasp import GraspGroup as _ExternalGraspGroup # type: ignore + + GraspGroup = _ExternalGraspGroup +except ImportError: + + @dataclass + class SimpleGrasp: + rotation_matrix: np.ndarray # (3, 3) + translation: np.ndarray # (3,) + + class GraspGroup: + """Minimal GraspGroup replacement supporting npy loading and indexing.""" + + def __init__(self, array: np.ndarray | None = None): + if array is None: + self._array = np.zeros((0, 17), dtype=np.float32) + else: + self._array = np.asarray(array, dtype=np.float32) + + @property + def grasp_group_array(self) -> np.ndarray: + return self._array + + def __len__(self) -> int: + return len(self._array) + + def __getitem__( + self, index: Union[int, slice, Sequence[int], np.ndarray] + ) -> Union[SimpleGrasp, "GraspGroup"]: + if isinstance(index, int): + row = self._array[index] + return SimpleGrasp( + rotation_matrix=row[4:13].reshape(3, 3), + translation=row[13:16], + ) + if isinstance(index, slice): + return GraspGroup(self._array[index]) + if isinstance(index, (list, tuple, np.ndarray)): + return GraspGroup(self._array[index]) + raise TypeError(f"Unsupported index type {type(index)!r} for GraspGroup") + + def from_npy(self, npy_file_path: str) -> "GraspGroup": + data = np.load(npy_file_path) + if data.ndim == 1: + data = data.reshape(1, -1) + self._array = data.astype(np.float32, copy=False) + return self + + # convenience for legacy code expecting classmethod behaviour + @classmethod + def load(cls, npy_file_path: str) -> "GraspGroup": + return cls().from_npy(npy_file_path) + +__all__ = ["GraspGroup"] + + diff --git a/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py b/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py index 2ddd68b..bbe83cf 100755 --- a/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py +++ b/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py @@ -11,7 +11,7 @@ default_config = { "physics": "default", "extrinsics": None, - "goal_offset": 0, + "goal_offset": 0.05, "grasp_idx": -1, "grasp_pre": None, "grasp_delta": None, @@ -38,6 +38,8 @@ def load_sim_parameters(basedir, key) -> dict: exp_config = compose_configs(key, exp_config) # cam configs + manip_object_id = scene_json["manipulated_oid"] + object_center = scene_json["objects"][manip_object_id]["object_center"] cam_cfg = { "width": int(scene_json["camera"]["width"]), "height": int(scene_json["camera"]["height"]), @@ -50,7 +52,7 @@ def load_sim_parameters(basedir, key) -> dict: "move_to": list(scene_json["camera"]["camera_position"]), "scene_min": list(scene_json["aabb"]["scene_min"]), "scene_max": list(scene_json["aabb"]["scene_max"]), - "object_center":list(scene_json["objects"]["1"]["object_center"]), + "object_center":list(object_center), }, } @@ -67,23 +69,33 @@ def load_sim_parameters(basedir, key) -> dict: } # demo configs - goal_offset = exp_config.get("goal_offset", 0) + + goal_offset = exp_config.get("goal_offset", 0.05) grasp_idx = exp_config.get("grasp_idx", -1) grasp_pre = exp_config.get("grasp_pre", None) grasp_delta = exp_config.get("grasp_delta", None) - traj_key = exp_config.get("traj_key", "fdpose_trajs") # "fdpose_trajs", "simple_trajs", "hybrid_trajs" - manip_object_id = exp_config.get("manip_object_id", "1") - traj_path = scene_json["objects"][manip_object_id][traj_key] - grasp_path = 
scene_json["objects"][manip_object_id].get("grasps", None) + manip_object_id = scene_json["manipulated_oid"] + traj_key = exp_config.get("traj_key", "fdpose_trajs") + if scene_json["objects"][manip_object_id].get("final_trajs", None) is not None: + traj_path = scene_json["objects"][manip_object_id]["final_trajs"] + else: + traj_path = scene_json["objects"][manip_object_id]["trajs"][traj_key] + if scene_json["objects"][manip_object_id].get("rescored_grasps", None) is not None: + grasp_path = scene_json["objects"][manip_object_id]["rescored_grasps"] + else: + grasp_path = scene_json["objects"][manip_object_id].get("grasps", None) + task_type = scene_json["task_type"] + final_gripper_closed = scene_json["gripper_closed"] demo_cfg = { "manip_object_id": manip_object_id, - "traj_key": traj_key, "traj_path": traj_path, "goal_offset": goal_offset, "grasp_idx": grasp_idx, "grasp_pre": grasp_pre, "grasp_delta": grasp_delta, "grasp_path": grasp_path, + "final_gripper_closed": final_gripper_closed, + "task_type": task_type, } # physics configs diff --git a/r2sVLA/algos/DP/.gitignore b/r2sVLA/algos/DP/.gitignore new file mode 100644 index 0000000..0dd75ff --- /dev/null +++ b/r2sVLA/algos/DP/.gitignore @@ -0,0 +1,2 @@ +data/* +checkpoints/* \ No newline at end of file diff --git a/r2sVLA/algos/DP/__init__.py b/r2sVLA/algos/DP/__init__.py new file mode 100644 index 0000000..d4b6770 --- /dev/null +++ b/r2sVLA/algos/DP/__init__.py @@ -0,0 +1 @@ +from .deploy_policy import * diff --git a/r2sVLA/algos/DP/deploy_policy.py b/r2sVLA/algos/DP/deploy_policy.py new file mode 100644 index 0000000..6e9e0c3 --- /dev/null +++ b/r2sVLA/algos/DP/deploy_policy.py @@ -0,0 +1,51 @@ +import numpy as np +from .dp_model import DP +import yaml + +def encode_obs(observation): + head_cam = (np.moveaxis(observation["observation"]["head_camera"]["rgb"], -1, 0) / 255) + left_cam = (np.moveaxis(observation["observation"]["left_camera"]["rgb"], -1, 0) / 255) + right_cam = (np.moveaxis(observation["observation"]["right_camera"]["rgb"], -1, 0) / 255) + obs = dict( + head_cam=head_cam, + left_cam=left_cam, + right_cam=right_cam, + ) + obs["agent_pos"] = observation["joint_action"]["vector"] + return obs + + +def get_model(usr_args): + ckpt_file = f"./policy/DP/checkpoints/{usr_args['task_name']}-{usr_args['ckpt_setting']}-{usr_args['expert_data_num']}-{usr_args['seed']}/{usr_args['checkpoint_num']}.ckpt" + action_dim = usr_args['left_arm_dim'] + usr_args['right_arm_dim'] + 2 # 2 gripper + + load_config_path = f'./policy/DP/diffusion_policy/config/robot_dp_{action_dim}.yaml' + with open(load_config_path, "r", encoding="utf-8") as f: + model_training_config = yaml.safe_load(f) + + n_obs_steps = model_training_config['n_obs_steps'] + n_action_steps = model_training_config['n_action_steps'] + + return DP(ckpt_file, n_obs_steps=n_obs_steps, n_action_steps=n_action_steps) + + +def eval(TASK_ENV, model, observation): + """ + TASK_ENV: Task Environment Class, you can use this class to interact with the environment + model: The model from 'get_model()' function + observation: The observation about the environment + """ + obs = encode_obs(observation) + instruction = TASK_ENV.get_instruction() + + # ======== Get Action ======== + actions = model.get_action(obs) + + for action in actions: + TASK_ENV.take_action(action) + observation = TASK_ENV.get_obs() + obs = encode_obs(observation) + model.update_obs(obs) + +def reset_model(model): + model.reset_obs() diff --git a/r2sVLA/algos/DP/deploy_policy.yml b/r2sVLA/algos/DP/deploy_policy.yml new file 
diff --git a/r2sVLA/algos/DP/deploy_policy.yml b/r2sVLA/algos/DP/deploy_policy.yml
new file mode 100644
index 0000000..14f39b1
--- /dev/null
+++ b/r2sVLA/algos/DP/deploy_policy.yml
@@ -0,0 +1,11 @@
+# Basic experiment configuration
+policy_name: DP
+task_name: null
+task_config: null
+ckpt_setting: null
+seed: null
+instruction_type: unseen
+
+expert_data_num: null
+checkpoint_num: 600
+head_camera_type: D435
\ No newline at end of file
diff --git a/r2sVLA/algos/DP/deploy_policy_double_env.py b/r2sVLA/algos/DP/deploy_policy_double_env.py
new file mode 100644
index 0000000..762163a
--- /dev/null
+++ b/r2sVLA/algos/DP/deploy_policy_double_env.py
@@ -0,0 +1,36 @@
+import numpy as np
+try:
+    from .dp_model import DP
+except Exception:  # optional dependency; tolerate any import failure
+    pass
+
+def encode_obs(observation):
+    head_cam = (np.moveaxis(observation["observation"]["head_camera"]["rgb"], -1, 0) / 255)
+    left_cam = (np.moveaxis(observation["observation"]["left_camera"]["rgb"], -1, 0) / 255)
+    right_cam = (np.moveaxis(observation["observation"]["right_camera"]["rgb"], -1, 0) / 255)
+    obs = dict(
+        head_cam=head_cam,
+        left_cam=left_cam,
+        right_cam=right_cam,
+    )
+    obs["agent_pos"] = observation["joint_action"]["vector"]
+    return obs
+
+
+def get_model(usr_args):
+    ckpt_file = f"./policy/DP/checkpoints/{usr_args['task_name']}-{usr_args['ckpt_setting']}-{usr_args['expert_data_num']}-{usr_args['seed']}/{usr_args['checkpoint_num']}.ckpt"
+    return DP(ckpt_file)
+
+
+def eval(TASK_ENV, model, observation):
+    obs = encode_obs(observation)
+    instruction = TASK_ENV.get_instruction()
+
+    # ======== Get Action ========
+    actions = model.call(func_name='get_action', obs=obs)
+
+    for action in actions:
+        TASK_ENV.take_action(action)
+        observation = TASK_ENV.get_obs()
+        obs = encode_obs(observation)
+        model.call(func_name='update_obs', obs=obs)
diff --git a/r2sVLA/algos/DP/diffusion_policy/__init__.py b/r2sVLA/algos/DP/diffusion_policy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/r2sVLA/algos/DP/diffusion_policy/common/checkpoint_util.py b/r2sVLA/algos/DP/diffusion_policy/common/checkpoint_util.py
new file mode 100644
index 0000000..f3ce00a
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/common/checkpoint_util.py
@@ -0,0 +1,61 @@
+from typing import Optional, Dict
+import os
+
+
+class TopKCheckpointManager:
+
+    def __init__(
+        self,
+        save_dir,
+        monitor_key: str,
+        mode="min",
+        k=1,
+        format_str="epoch={epoch:03d}-train_loss={train_loss:.3f}.ckpt",
+    ):
+        assert mode in ["max", "min"]
+        assert k >= 0
+
+        self.save_dir = save_dir
+        self.monitor_key = monitor_key
+        self.mode = mode
+        self.k = k
+        self.format_str = format_str
+        self.path_value_map = dict()
+
+    def get_ckpt_path(self, data: Dict[str, float]) -> Optional[str]:
+        if self.k == 0:
+            return None
+
+        value = data[self.monitor_key]
+        ckpt_path = os.path.join(self.save_dir, self.format_str.format(**data))
+
+        if len(self.path_value_map) < self.k:
+            # under-capacity
+            self.path_value_map[ckpt_path] = value
+            return ckpt_path
+
+        # at capacity
+        sorted_map = sorted(self.path_value_map.items(), key=lambda x: x[1])
+        min_path, min_value = sorted_map[0]
+        max_path, max_value = sorted_map[-1]
+
+        delete_path = None
+        if self.mode == "max":
+            if value > min_value:
+                delete_path = min_path
+        else:
+            if value < max_value:
+                delete_path = max_path
+
+        if delete_path is None:
+            return None
+        else:
+            del self.path_value_map[delete_path]
+            self.path_value_map[ckpt_path] = value
+
+            if not os.path.exists(self.save_dir):
+                os.mkdir(self.save_dir)
+
+            if os.path.exists(delete_path):
+                os.remove(delete_path)
+            return ckpt_path
diff --git
a/r2sVLA/algos/DP/diffusion_policy/common/cv2_util.py b/r2sVLA/algos/DP/diffusion_policy/common/cv2_util.py new file mode 100644 index 0000000..469ac0f --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/cv2_util.py @@ -0,0 +1,150 @@ +from typing import Tuple +import math +import cv2 +import numpy as np + + +def draw_reticle(img, u, v, label_color): + """ + Draws a reticle (cross-hair) on the image at the given position on top of + the original image. + @param img (In/Out) uint8 3 channel image + @param u X coordinate (width) + @param v Y coordinate (height) + @param label_color tuple of 3 ints for RGB color used for drawing. + """ + # Cast to int. + u = int(u) + v = int(v) + + white = (255, 255, 255) + cv2.circle(img, (u, v), 10, label_color, 1) + cv2.circle(img, (u, v), 11, white, 1) + cv2.circle(img, (u, v), 12, label_color, 1) + cv2.line(img, (u, v + 1), (u, v + 3), white, 1) + cv2.line(img, (u + 1, v), (u + 3, v), white, 1) + cv2.line(img, (u, v - 1), (u, v - 3), white, 1) + cv2.line(img, (u - 1, v), (u - 3, v), white, 1) + + +def draw_text( + img, + *, + text, + uv_top_left, + color=(255, 255, 255), + fontScale=0.5, + thickness=1, + fontFace=cv2.FONT_HERSHEY_SIMPLEX, + outline_color=(0, 0, 0), + line_spacing=1.5, +): + """ + Draws multiline with an outline. + """ + assert isinstance(text, str) + + uv_top_left = np.array(uv_top_left, dtype=float) + assert uv_top_left.shape == (2, ) + + for line in text.splitlines(): + (w, h), _ = cv2.getTextSize( + text=line, + fontFace=fontFace, + fontScale=fontScale, + thickness=thickness, + ) + uv_bottom_left_i = uv_top_left + [0, h] + org = tuple(uv_bottom_left_i.astype(int)) + + if outline_color is not None: + cv2.putText( + img, + text=line, + org=org, + fontFace=fontFace, + fontScale=fontScale, + color=outline_color, + thickness=thickness * 3, + lineType=cv2.LINE_AA, + ) + cv2.putText( + img, + text=line, + org=org, + fontFace=fontFace, + fontScale=fontScale, + color=color, + thickness=thickness, + lineType=cv2.LINE_AA, + ) + + uv_top_left += [0, h * line_spacing] + + +def get_image_transform( + input_res: Tuple[int, int] = (1280, 720), + output_res: Tuple[int, int] = (640, 480), + bgr_to_rgb: bool = False, +): + + iw, ih = input_res + ow, oh = output_res + rw, rh = None, None + interp_method = cv2.INTER_AREA + + if (iw / ih) >= (ow / oh): + # input is wider + rh = oh + rw = math.ceil(rh / ih * iw) + if oh > ih: + interp_method = cv2.INTER_LINEAR + else: + rw = ow + rh = math.ceil(rw / iw * ih) + if ow > iw: + interp_method = cv2.INTER_LINEAR + + w_slice_start = (rw - ow) // 2 + w_slice = slice(w_slice_start, w_slice_start + ow) + h_slice_start = (rh - oh) // 2 + h_slice = slice(h_slice_start, h_slice_start + oh) + c_slice = slice(None) + if bgr_to_rgb: + c_slice = slice(None, None, -1) + + def transform(img: np.ndarray): + assert img.shape == ((ih, iw, 3)) + # resize + img = cv2.resize(img, (rw, rh), interpolation=interp_method) + # crop + img = img[h_slice, w_slice, c_slice] + return img + + return transform + + +def optimal_row_cols(n_cameras, in_wh_ratio, max_resolution=(1920, 1080)): + out_w, out_h = max_resolution + out_wh_ratio = out_w / out_h + + n_rows = np.arange(n_cameras, dtype=np.int64) + 1 + n_cols = np.ceil(n_cameras / n_rows).astype(np.int64) + cat_wh_ratio = in_wh_ratio * (n_cols / n_rows) + ratio_diff = np.abs(out_wh_ratio - cat_wh_ratio) + best_idx = np.argmin(ratio_diff) + best_n_row = n_rows[best_idx] + best_n_col = n_cols[best_idx] + best_cat_wh_ratio = cat_wh_ratio[best_idx] + + rw, rh = None, None + if 
best_cat_wh_ratio >= out_wh_ratio: + # cat is wider + rw = math.floor(out_w / best_n_col) + rh = math.floor(rw / in_wh_ratio) + else: + rh = math.floor(out_h / best_n_row) + rw = math.floor(rh * in_wh_ratio) + + # crop_resolution = (rw, rh) + return rw, rh, best_n_col, best_n_row diff --git a/r2sVLA/algos/DP/diffusion_policy/common/env_util.py b/r2sVLA/algos/DP/diffusion_policy/common/env_util.py new file mode 100644 index 0000000..30622fa --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/env_util.py @@ -0,0 +1,28 @@ +import cv2 +import numpy as np + + +def render_env_video(env, states, actions=None): + observations = states + imgs = list() + for i in range(len(observations)): + state = observations[i] + env.set_state(state) + if i == 0: + env.set_state(state) + img = env.render() + # draw action + if actions is not None: + action = actions[i] + coord = (action / 512 * 96).astype(np.int32) + cv2.drawMarker( + img, + coord, + color=(255, 0, 0), + markerType=cv2.MARKER_CROSS, + markerSize=8, + thickness=1, + ) + imgs.append(img) + imgs = np.array(imgs) + return imgs diff --git a/r2sVLA/algos/DP/diffusion_policy/common/json_logger.py b/r2sVLA/algos/DP/diffusion_policy/common/json_logger.py new file mode 100644 index 0000000..f4bcab7 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/json_logger.py @@ -0,0 +1,115 @@ +from typing import Optional, Callable, Any, Sequence +import os +import copy +import json +import numbers +import pandas as pd + + +def read_json_log(path: str, required_keys: Sequence[str] = tuple(), **kwargs) -> pd.DataFrame: + """ + Read json-per-line file, with potentially incomplete lines. + kwargs passed to pd.read_json + """ + lines = list() + with open(path, "r") as f: + while True: + # one json per line + line = f.readline() + if len(line) == 0: + # EOF + break + elif not line.endswith("\n"): + # incomplete line + break + is_relevant = False + for k in required_keys: + if k in line: + is_relevant = True + break + if is_relevant: + lines.append(line) + if len(lines) < 1: + return pd.DataFrame() + json_buf = (f'[{",".join([line for line in (line.strip() for line in lines) if line])}]') + df = pd.read_json(json_buf, **kwargs) + return df + + +class JsonLogger: + + def __init__(self, path: str, filter_fn: Optional[Callable[[str, Any], bool]] = None): + if filter_fn is None: + filter_fn = lambda k, v: isinstance(v, numbers.Number) + + # default to append mode + self.path = path + self.filter_fn = filter_fn + self.file = None + self.last_log = None + + def start(self): + # use line buffering + try: + self.file = file = open(self.path, "r+", buffering=1) + except FileNotFoundError: + self.file = file = open(self.path, "w+", buffering=1) + + # Move the pointer (similar to a cursor in a text editor) to the end of the file + pos = file.seek(0, os.SEEK_END) + + # Read each character in the file one at a time from the last + # character going backwards, searching for a newline character + # If we find a new line, exit the search + while pos > 0 and file.read(1) != "\n": + pos -= 1 + file.seek(pos, os.SEEK_SET) + # now the file pointer is at one past the last '\n' + # and pos is at the last '\n'. 
+ last_line_end = file.tell() + + # find the start of second last line + pos = max(0, pos - 1) + file.seek(pos, os.SEEK_SET) + while pos > 0 and file.read(1) != "\n": + pos -= 1 + file.seek(pos, os.SEEK_SET) + # now the file pointer is at one past the second last '\n' + last_line_start = file.tell() + + if last_line_start < last_line_end: + # has last line of json + last_line = file.readline() + self.last_log = json.loads(last_line) + + # remove the last incomplete line + file.seek(last_line_end) + file.truncate() + + def stop(self): + self.file.close() + self.file = None + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + + def log(self, data: dict): + filtered_data = dict(filter(lambda x: self.filter_fn(*x), data.items())) + # save current as last log + self.last_log = filtered_data + for k, v in filtered_data.items(): + if isinstance(v, numbers.Integral): + filtered_data[k] = int(v) + elif isinstance(v, numbers.Number): + filtered_data[k] = float(v) + buf = json.dumps(filtered_data) + # ensure one line per json + buf = buf.replace("\n", "") + "\n" + self.file.write(buf) + + def get_last_log(self): + return copy.deepcopy(self.last_log) diff --git a/r2sVLA/algos/DP/diffusion_policy/common/nested_dict_util.py b/r2sVLA/algos/DP/diffusion_policy/common/nested_dict_util.py new file mode 100644 index 0000000..013bd0b --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/nested_dict_util.py @@ -0,0 +1,34 @@ +import functools + + +def nested_dict_map(f, x): + """ + Map f over all leaf of nested dict x + """ + + if not isinstance(x, dict): + return f(x) + y = dict() + for key, value in x.items(): + y[key] = nested_dict_map(f, value) + return y + + +def nested_dict_reduce(f, x): + """ + Map f over all values of nested dict x, and reduce to a single value + """ + if not isinstance(x, dict): + return x + + reduced_values = list() + for value in x.values(): + reduced_values.append(nested_dict_reduce(f, value)) + y = functools.reduce(f, reduced_values) + return y + + +def nested_dict_check(f, x): + bool_dict = nested_dict_map(f, x) + result = nested_dict_reduce(lambda x, y: x and y, bool_dict) + return result diff --git a/r2sVLA/algos/DP/diffusion_policy/common/normalize_util.py b/r2sVLA/algos/DP/diffusion_policy/common/normalize_util.py new file mode 100644 index 0000000..0b0e3f1 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/normalize_util.py @@ -0,0 +1,197 @@ +from diffusion_policy.model.common.normalizer import SingleFieldLinearNormalizer +from diffusion_policy.common.pytorch_util import ( + dict_apply, + dict_apply_reduce, + dict_apply_split, +) +import numpy as np + + +def get_range_normalizer_from_stat(stat, output_max=1, output_min=-1, range_eps=1e-7): + # -1, 1 normalization + input_max = stat["max"] + input_min = stat["min"] + input_range = input_max - input_min + ignore_dim = input_range < range_eps + input_range[ignore_dim] = output_max - output_min + scale = (output_max - output_min) / input_range + offset = output_min - scale * input_min + offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim] + + return SingleFieldLinearNormalizer.create_manual(scale=scale, offset=offset, input_stats_dict=stat) + + +def get_image_range_normalizer(): + scale = np.array([2], dtype=np.float32) + offset = np.array([-1], dtype=np.float32) + stat = { + "min": np.array([0], dtype=np.float32), + "max": np.array([1], dtype=np.float32), + "mean": np.array([0.5], dtype=np.float32), + "std": 
np.array([np.sqrt(1 / 12)], dtype=np.float32), + } + return SingleFieldLinearNormalizer.create_manual(scale=scale, offset=offset, input_stats_dict=stat) + + +def get_identity_normalizer_from_stat(stat): + scale = np.ones_like(stat["min"]) + offset = np.zeros_like(stat["min"]) + return SingleFieldLinearNormalizer.create_manual(scale=scale, offset=offset, input_stats_dict=stat) + + +def robomimic_abs_action_normalizer_from_stat(stat, rotation_transformer): + result = dict_apply_split(stat, lambda x: {"pos": x[..., :3], "rot": x[..., 3:6], "gripper": x[..., 6:]}) + + def get_pos_param_info(stat, output_max=1, output_min=-1, range_eps=1e-7): + # -1, 1 normalization + input_max = stat["max"] + input_min = stat["min"] + input_range = input_max - input_min + ignore_dim = input_range < range_eps + input_range[ignore_dim] = output_max - output_min + scale = (output_max - output_min) / input_range + offset = output_min - scale * input_min + offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim] + + return {"scale": scale, "offset": offset}, stat + + def get_rot_param_info(stat): + example = rotation_transformer.forward(stat["mean"]) + scale = np.ones_like(example) + offset = np.zeros_like(example) + info = { + "max": np.ones_like(example), + "min": np.full_like(example, -1), + "mean": np.zeros_like(example), + "std": np.ones_like(example), + } + return {"scale": scale, "offset": offset}, info + + def get_gripper_param_info(stat): + example = stat["max"] + scale = np.ones_like(example) + offset = np.zeros_like(example) + info = { + "max": np.ones_like(example), + "min": np.full_like(example, -1), + "mean": np.zeros_like(example), + "std": np.ones_like(example), + } + return {"scale": scale, "offset": offset}, info + + pos_param, pos_info = get_pos_param_info(result["pos"]) + rot_param, rot_info = get_rot_param_info(result["rot"]) + gripper_param, gripper_info = get_gripper_param_info(result["gripper"]) + + param = dict_apply_reduce([pos_param, rot_param, gripper_param], lambda x: np.concatenate(x, axis=-1)) + info = dict_apply_reduce([pos_info, rot_info, gripper_info], lambda x: np.concatenate(x, axis=-1)) + + return SingleFieldLinearNormalizer.create_manual(scale=param["scale"], + offset=param["offset"], + input_stats_dict=info) + + +def robomimic_abs_action_only_normalizer_from_stat(stat): + result = dict_apply_split(stat, lambda x: {"pos": x[..., :3], "other": x[..., 3:]}) + + def get_pos_param_info(stat, output_max=1, output_min=-1, range_eps=1e-7): + # -1, 1 normalization + input_max = stat["max"] + input_min = stat["min"] + input_range = input_max - input_min + ignore_dim = input_range < range_eps + input_range[ignore_dim] = output_max - output_min + scale = (output_max - output_min) / input_range + offset = output_min - scale * input_min + offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim] + + return {"scale": scale, "offset": offset}, stat + + def get_other_param_info(stat): + example = stat["max"] + scale = np.ones_like(example) + offset = np.zeros_like(example) + info = { + "max": np.ones_like(example), + "min": np.full_like(example, -1), + "mean": np.zeros_like(example), + "std": np.ones_like(example), + } + return {"scale": scale, "offset": offset}, info + + pos_param, pos_info = get_pos_param_info(result["pos"]) + other_param, other_info = get_other_param_info(result["other"]) + + param = dict_apply_reduce([pos_param, other_param], lambda x: np.concatenate(x, axis=-1)) + info = dict_apply_reduce([pos_info, other_info], lambda x: 
np.concatenate(x, axis=-1)) + + return SingleFieldLinearNormalizer.create_manual(scale=param["scale"], + offset=param["offset"], + input_stats_dict=info) + + +def robomimic_abs_action_only_dual_arm_normalizer_from_stat(stat): + Da = stat["max"].shape[-1] + Dah = Da // 2 + result = dict_apply_split( + stat, + lambda x: { + "pos0": x[..., :3], + "other0": x[..., 3:Dah], + "pos1": x[..., Dah:Dah + 3], + "other1": x[..., Dah + 3:], + }, + ) + + def get_pos_param_info(stat, output_max=1, output_min=-1, range_eps=1e-7): + # -1, 1 normalization + input_max = stat["max"] + input_min = stat["min"] + input_range = input_max - input_min + ignore_dim = input_range < range_eps + input_range[ignore_dim] = output_max - output_min + scale = (output_max - output_min) / input_range + offset = output_min - scale * input_min + offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim] + + return {"scale": scale, "offset": offset}, stat + + def get_other_param_info(stat): + example = stat["max"] + scale = np.ones_like(example) + offset = np.zeros_like(example) + info = { + "max": np.ones_like(example), + "min": np.full_like(example, -1), + "mean": np.zeros_like(example), + "std": np.ones_like(example), + } + return {"scale": scale, "offset": offset}, info + + pos0_param, pos0_info = get_pos_param_info(result["pos0"]) + pos1_param, pos1_info = get_pos_param_info(result["pos1"]) + other0_param, other0_info = get_other_param_info(result["other0"]) + other1_param, other1_info = get_other_param_info(result["other1"]) + + param = dict_apply_reduce( + [pos0_param, other0_param, pos1_param, other1_param], + lambda x: np.concatenate(x, axis=-1), + ) + info = dict_apply_reduce( + [pos0_info, other0_info, pos1_info, other1_info], + lambda x: np.concatenate(x, axis=-1), + ) + + return SingleFieldLinearNormalizer.create_manual(scale=param["scale"], + offset=param["offset"], + input_stats_dict=info) + + +def array_to_stats(arr: np.ndarray): + stat = { + "min": np.min(arr, axis=0), + "max": np.max(arr, axis=0), + "mean": np.mean(arr, axis=0), + "std": np.std(arr, axis=0), + } + return stat diff --git a/r2sVLA/algos/DP/diffusion_policy/common/pose_trajectory_interpolator.py b/r2sVLA/algos/DP/diffusion_policy/common/pose_trajectory_interpolator.py new file mode 100644 index 0000000..091aca9 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/pose_trajectory_interpolator.py @@ -0,0 +1,211 @@ +from typing import Union +import numbers +import numpy as np +import scipy.interpolate as si +import scipy.spatial.transform as st + + +def rotation_distance(a: st.Rotation, b: st.Rotation) -> float: + return (b * a.inv()).magnitude() + + +def pose_distance(start_pose, end_pose): + start_pose = np.array(start_pose) + end_pose = np.array(end_pose) + start_pos = start_pose[:3] + end_pos = end_pose[:3] + start_rot = st.Rotation.from_rotvec(start_pose[3:]) + end_rot = st.Rotation.from_rotvec(end_pose[3:]) + pos_dist = np.linalg.norm(end_pos - start_pos) + rot_dist = rotation_distance(start_rot, end_rot) + return pos_dist, rot_dist + + +class PoseTrajectoryInterpolator: + + def __init__(self, times: np.ndarray, poses: np.ndarray): + assert len(times) >= 1 + assert len(poses) == len(times) + if not isinstance(times, np.ndarray): + times = np.array(times) + if not isinstance(poses, np.ndarray): + poses = np.array(poses) + + if len(times) == 1: + # special treatment for single step interpolation + self.single_step = True + self._times = times + self._poses = poses + else: + self.single_step = False + assert np.all(times[1:] 
>= times[:-1]) + + pos = poses[:, :3] + rot = st.Rotation.from_rotvec(poses[:, 3:]) + + self.pos_interp = si.interp1d(times, pos, axis=0, assume_sorted=True) + self.rot_interp = st.Slerp(times, rot) + + @property + def times(self) -> np.ndarray: + if self.single_step: + return self._times + else: + return self.pos_interp.x + + @property + def poses(self) -> np.ndarray: + if self.single_step: + return self._poses + else: + n = len(self.times) + poses = np.zeros((n, 6)) + poses[:, :3] = self.pos_interp.y + poses[:, 3:] = self.rot_interp(self.times).as_rotvec() + return poses + + def trim(self, start_t: float, end_t: float) -> "PoseTrajectoryInterpolator": + assert start_t <= end_t + times = self.times + should_keep = (start_t < times) & (times < end_t) + keep_times = times[should_keep] + all_times = np.concatenate([[start_t], keep_times, [end_t]]) + # remove duplicates, Slerp requires strictly increasing x + all_times = np.unique(all_times) + # interpolate + all_poses = self(all_times) + return PoseTrajectoryInterpolator(times=all_times, poses=all_poses) + + def drive_to_waypoint(self, + pose, + time, + curr_time, + max_pos_speed=np.inf, + max_rot_speed=np.inf) -> "PoseTrajectoryInterpolator": + assert max_pos_speed > 0 + assert max_rot_speed > 0 + time = max(time, curr_time) + + curr_pose = self(curr_time) + pos_dist, rot_dist = pose_distance(curr_pose, pose) + pos_min_duration = pos_dist / max_pos_speed + rot_min_duration = rot_dist / max_rot_speed + duration = time - curr_time + duration = max(duration, max(pos_min_duration, rot_min_duration)) + assert duration >= 0 + last_waypoint_time = curr_time + duration + + # insert new pose + trimmed_interp = self.trim(curr_time, curr_time) + times = np.append(trimmed_interp.times, [last_waypoint_time], axis=0) + poses = np.append(trimmed_interp.poses, [pose], axis=0) + + # create new interpolator + final_interp = PoseTrajectoryInterpolator(times, poses) + return final_interp + + def schedule_waypoint( + self, + pose, + time, + max_pos_speed=np.inf, + max_rot_speed=np.inf, + curr_time=None, + last_waypoint_time=None, + ) -> "PoseTrajectoryInterpolator": + assert max_pos_speed > 0 + assert max_rot_speed > 0 + if last_waypoint_time is not None: + assert curr_time is not None + + # trim current interpolator to between curr_time and last_waypoint_time + start_time = self.times[0] + end_time = self.times[-1] + assert start_time <= end_time + + if curr_time is not None: + if time <= curr_time: + # if insert time is earlier than current time + # no effect should be done to the interpolator + return self + # now, curr_time < time + start_time = max(curr_time, start_time) + + if last_waypoint_time is not None: + # if last_waypoint_time is earlier than start_time + # use start_time + if time <= last_waypoint_time: + end_time = curr_time + else: + end_time = max(last_waypoint_time, curr_time) + else: + end_time = curr_time + + end_time = min(end_time, time) + start_time = min(start_time, end_time) + # end time should be the latest of all times except time + # after this we can assume order (proven by zhenjia, due to the 2 min operations) + + # Constraints: + # start_time <= end_time <= time (proven by zhenjia) + # curr_time <= start_time (proven by zhenjia) + # curr_time <= time (proven by zhenjia) + + # time can't change + # last_waypoint_time can't change + # curr_time can't change + assert start_time <= end_time + assert end_time <= time + if last_waypoint_time is not None: + if time <= last_waypoint_time: + assert end_time == curr_time + else: + assert 
end_time == max(last_waypoint_time, curr_time) + + if curr_time is not None: + assert curr_time <= start_time + assert curr_time <= time + + trimmed_interp = self.trim(start_time, end_time) + # after this, all waypoints in trimmed_interp is within start_time and end_time + # and is earlier than time + + # determine speed + duration = time - end_time + end_pose = trimmed_interp(end_time) + pos_dist, rot_dist = pose_distance(pose, end_pose) + pos_min_duration = pos_dist / max_pos_speed + rot_min_duration = rot_dist / max_rot_speed + duration = max(duration, max(pos_min_duration, rot_min_duration)) + assert duration >= 0 + last_waypoint_time = end_time + duration + + # insert new pose + times = np.append(trimmed_interp.times, [last_waypoint_time], axis=0) + poses = np.append(trimmed_interp.poses, [pose], axis=0) + + # create new interpolator + final_interp = PoseTrajectoryInterpolator(times, poses) + return final_interp + + def __call__(self, t: Union[numbers.Number, np.ndarray]) -> np.ndarray: + is_single = False + if isinstance(t, numbers.Number): + is_single = True + t = np.array([t]) + + pose = np.zeros((len(t), 6)) + if self.single_step: + pose[:] = self._poses[0] + else: + start_time = self.times[0] + end_time = self.times[-1] + t = np.clip(t, start_time, end_time) + + pose = np.zeros((len(t), 6)) + pose[:, :3] = self.pos_interp(t) + pose[:, 3:] = self.rot_interp(t).as_rotvec() + + if is_single: + pose = pose[0] + return pose diff --git a/r2sVLA/algos/DP/diffusion_policy/common/precise_sleep.py b/r2sVLA/algos/DP/diffusion_policy/common/precise_sleep.py new file mode 100644 index 0000000..04d783e --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/precise_sleep.py @@ -0,0 +1,27 @@ +import time + + +def precise_sleep(dt: float, slack_time: float = 0.001, time_func=time.monotonic): + """ + Use hybrid of time.sleep and spinning to minimize jitter. + Sleep dt - slack_time seconds first, then spin for the rest. + """ + t_start = time_func() + if dt > slack_time: + time.sleep(dt - slack_time) + t_end = t_start + dt + while time_func() < t_end: + pass + return + + +def precise_wait(t_end: float, slack_time: float = 0.001, time_func=time.monotonic): + t_start = time_func() + t_wait = t_end - t_start + if t_wait > 0: + t_sleep = t_wait - slack_time + if t_sleep > 0: + time.sleep(t_sleep) + while time_func() < t_end: + pass + return diff --git a/r2sVLA/algos/DP/diffusion_policy/common/pymunk_override.py b/r2sVLA/algos/DP/diffusion_policy/common/pymunk_override.py new file mode 100644 index 0000000..1c85f86 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/pymunk_override.py @@ -0,0 +1,246 @@ +# ---------------------------------------------------------------------------- +# pymunk +# Copyright (c) 2007-2016 Victor Blomqvist +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ---------------------------------------------------------------------------- +"""This submodule contains helper functions to help with quick prototyping +using pymunk together with pygame. + +Intended to help with debugging and prototyping, not for actual production use +in a full application. The methods contained in this module is opinionated +about your coordinate system and not in any way optimized. +""" + +__docformat__ = "reStructuredText" + +__all__ = [ + "DrawOptions", + "get_mouse_pos", + "to_pygame", + "from_pygame", + "lighten", + "positive_y_is_up", +] + +from typing import List, Sequence, Tuple + +import pygame + +import numpy as np + +import pymunk +from pymunk.space_debug_draw_options import SpaceDebugColor +from pymunk.vec2d import Vec2d + +positive_y_is_up: bool = False +"""Make increasing values of y point upwards. + +When True:: + + y + ^ + | . (3, 3) + | + | . (2, 2) + | + +------ > x + +When False:: + + +------ > x + | + | . (2, 2) + | + | . (3, 3) + v + y + +""" + + +class DrawOptions(pymunk.SpaceDebugDrawOptions): + + def __init__(self, surface: pygame.Surface) -> None: + """Draw a pymunk.Space on a pygame.Surface object. + + Typical usage:: + + >>> import pymunk + >>> surface = pygame.Surface((10,10)) + >>> space = pymunk.Space() + >>> options = pymunk.pygame_util.DrawOptions(surface) + >>> space.debug_draw(options) + + You can control the color of a shape by setting shape.color to the color + you want it drawn in:: + + >>> c = pymunk.Circle(None, 10) + >>> c.color = pygame.Color("pink") + + See pygame_util.demo.py for a full example + + Since pygame uses a coordinate system where y points down (in contrast + to many other cases), you either have to make the physics simulation + with Pymunk also behave in that way, or flip everything when you draw. + + The easiest is probably to just make the simulation behave the same + way as Pygame does. In that way all coordinates used are in the same + orientation and easy to reason about:: + + >>> space = pymunk.Space() + >>> space.gravity = (0, -1000) + >>> body = pymunk.Body() + >>> body.position = (0, 0) # will be positioned in the top left corner + >>> space.debug_draw(options) + + To flip the drawing its possible to set the module property + :py:data:`positive_y_is_up` to True. 
Then the pygame drawing will flip + the simulation upside down before drawing:: + + >>> positive_y_is_up = True + >>> body = pymunk.Body() + >>> body.position = (0, 0) + >>> # Body will be position in bottom left corner + + :Parameters: + surface : pygame.Surface + Surface that the objects will be drawn on + """ + self.surface = surface + super(DrawOptions, self).__init__() + + def draw_circle( + self, + pos: Vec2d, + angle: float, + radius: float, + outline_color: SpaceDebugColor, + fill_color: SpaceDebugColor, + ) -> None: + p = to_pygame(pos, self.surface) + + pygame.draw.circle(self.surface, fill_color.as_int(), p, round(radius), 0) + pygame.draw.circle(self.surface, light_color(fill_color).as_int(), p, round(radius - 4), 0) + + circle_edge = pos + Vec2d(radius, 0).rotated(angle) + p2 = to_pygame(circle_edge, self.surface) + line_r = 2 if radius > 20 else 1 + # pygame.draw.lines(self.surface, outline_color.as_int(), False, [p, p2], line_r) + + def draw_segment(self, a: Vec2d, b: Vec2d, color: SpaceDebugColor) -> None: + p1 = to_pygame(a, self.surface) + p2 = to_pygame(b, self.surface) + + pygame.draw.aalines(self.surface, color.as_int(), False, [p1, p2]) + + def draw_fat_segment( + self, + a: Tuple[float, float], + b: Tuple[float, float], + radius: float, + outline_color: SpaceDebugColor, + fill_color: SpaceDebugColor, + ) -> None: + p1 = to_pygame(a, self.surface) + p2 = to_pygame(b, self.surface) + + r = round(max(1, radius * 2)) + pygame.draw.lines(self.surface, fill_color.as_int(), False, [p1, p2], r) + if r > 2: + orthog = [abs(p2[1] - p1[1]), abs(p2[0] - p1[0])] + if orthog[0] == 0 and orthog[1] == 0: + return + scale = radius / (orthog[0] * orthog[0] + orthog[1] * orthog[1])**0.5 + orthog[0] = round(orthog[0] * scale) + orthog[1] = round(orthog[1] * scale) + points = [ + (p1[0] - orthog[0], p1[1] - orthog[1]), + (p1[0] + orthog[0], p1[1] + orthog[1]), + (p2[0] + orthog[0], p2[1] + orthog[1]), + (p2[0] - orthog[0], p2[1] - orthog[1]), + ] + pygame.draw.polygon(self.surface, fill_color.as_int(), points) + pygame.draw.circle( + self.surface, + fill_color.as_int(), + (round(p1[0]), round(p1[1])), + round(radius), + ) + pygame.draw.circle( + self.surface, + fill_color.as_int(), + (round(p2[0]), round(p2[1])), + round(radius), + ) + + def draw_polygon( + self, + verts: Sequence[Tuple[float, float]], + radius: float, + outline_color: SpaceDebugColor, + fill_color: SpaceDebugColor, + ) -> None: + ps = [to_pygame(v, self.surface) for v in verts] + ps += [ps[0]] + + radius = 2 + pygame.draw.polygon(self.surface, light_color(fill_color).as_int(), ps) + + if radius > 0: + for i in range(len(verts)): + a = verts[i] + b = verts[(i + 1) % len(verts)] + self.draw_fat_segment(a, b, radius, fill_color, fill_color) + + def draw_dot(self, size: float, pos: Tuple[float, float], color: SpaceDebugColor) -> None: + p = to_pygame(pos, self.surface) + pygame.draw.circle(self.surface, color.as_int(), p, round(size), 0) + + +def get_mouse_pos(surface: pygame.Surface) -> Tuple[int, int]: + """Get position of the mouse pointer in pymunk coordinates.""" + p = pygame.mouse.get_pos() + return from_pygame(p, surface) + + +def to_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]: + """Convenience method to convert pymunk coordinates to pygame surface + local coordinates. + + Note that in case positive_y_is_up is False, this function won't actually do + anything except converting the point to integers. 
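+
+    Illustrative sketch (hypothetical 100x100 surface)::
+
+        >>> surface = pygame.Surface((100, 100))
+        >>> to_pygame((10.0, 20.0), surface)  # positive_y_is_up is False
+        (10, 20)
+
+    With positive_y_is_up set to True, the same call would flip the y
+    coordinate to ``surface.get_height() - 20 == 80``.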
+ """ + if positive_y_is_up: + return round(p[0]), surface.get_height() - round(p[1]) + else: + return round(p[0]), round(p[1]) + + +def from_pygame(p: Tuple[float, float], surface: pygame.Surface) -> Tuple[int, int]: + """Convenience method to convert pygame surface local coordinates to + pymunk coordinates + """ + return to_pygame(p, surface) + + +def light_color(color: SpaceDebugColor): + color = np.minimum(1.2 * np.float32([color.r, color.g, color.b, color.a]), np.float32([255])) + color = SpaceDebugColor(r=color[0], g=color[1], b=color[2], a=color[3]) + return color diff --git a/r2sVLA/algos/DP/diffusion_policy/common/pymunk_util.py b/r2sVLA/algos/DP/diffusion_policy/common/pymunk_util.py new file mode 100644 index 0000000..5f279e6 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/pymunk_util.py @@ -0,0 +1,51 @@ +import pygame +import pymunk +import pymunk.pygame_util +import numpy as np + +COLLTYPE_DEFAULT = 0 +COLLTYPE_MOUSE = 1 +COLLTYPE_BALL = 2 + + +def get_body_type(static=False): + body_type = pymunk.Body.DYNAMIC + if static: + body_type = pymunk.Body.STATIC + return body_type + + +def create_rectangle(space, pos_x, pos_y, width, height, density=3, static=False): + body = pymunk.Body(body_type=get_body_type(static)) + body.position = (pos_x, pos_y) + shape = pymunk.Poly.create_box(body, (width, height)) + shape.density = density + space.add(body, shape) + return body, shape + + +def create_rectangle_bb(space, left, bottom, right, top, **kwargs): + pos_x = (left + right) / 2 + pos_y = (top + bottom) / 2 + height = top - bottom + width = right - left + return create_rectangle(space, pos_x, pos_y, width, height, **kwargs) + + +def create_circle(space, pos_x, pos_y, radius, density=3, static=False): + body = pymunk.Body(body_type=get_body_type(static)) + body.position = (pos_x, pos_y) + shape = pymunk.Circle(body, radius=radius) + shape.density = density + shape.collision_type = COLLTYPE_BALL + space.add(body, shape) + return body, shape + + +def get_body_state(body): + state = np.zeros(6, dtype=np.float32) + state[:2] = body.position + state[2] = body.angle + state[3:5] = body.velocity + state[5] = body.angular_velocity + return state diff --git a/r2sVLA/algos/DP/diffusion_policy/common/pytorch_util.py b/r2sVLA/algos/DP/diffusion_policy/common/pytorch_util.py new file mode 100644 index 0000000..a4fe51a --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/pytorch_util.py @@ -0,0 +1,81 @@ +from typing import Dict, Callable, List +import collections +import torch +import torch.nn as nn + + +def dict_apply(x: Dict[str, torch.Tensor], func: Callable[[torch.Tensor], torch.Tensor]) -> Dict[str, torch.Tensor]: + result = dict() + for key, value in x.items(): + if isinstance(value, dict): + result[key] = dict_apply(value, func) + else: + result[key] = func(value) + return result + + +def pad_remaining_dims(x, target): + assert x.shape == target.shape[:len(x.shape)] + return x.reshape(x.shape + (1, ) * (len(target.shape) - len(x.shape))) + + +def dict_apply_split( + x: Dict[str, torch.Tensor], + split_func: Callable[[torch.Tensor], Dict[str, torch.Tensor]], +) -> Dict[str, torch.Tensor]: + results = collections.defaultdict(dict) + for key, value in x.items(): + result = split_func(value) + for k, v in result.items(): + results[k][key] = v + return results + + +def dict_apply_reduce( + x: List[Dict[str, torch.Tensor]], + reduce_func: Callable[[List[torch.Tensor]], torch.Tensor], +) -> Dict[str, torch.Tensor]: + result = dict() + for key in x[0].keys(): + result[key] = 
reduce_func([x_[key] for x_ in x]) + return result + + +def replace_submodules( + root_module: nn.Module, + predicate: Callable[[nn.Module], bool], + func: Callable[[nn.Module], nn.Module], +) -> nn.Module: + """ + predicate: Return true if the module is to be replaced. + func: Return new module to use. + """ + if predicate(root_module): + return func(root_module) + + bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)] + for *parent, k in bn_list: + parent_module = root_module + if len(parent) > 0: + parent_module = root_module.get_submodule(".".join(parent)) + if isinstance(parent_module, nn.Sequential): + src_module = parent_module[int(k)] + else: + src_module = getattr(parent_module, k) + tgt_module = func(src_module) + if isinstance(parent_module, nn.Sequential): + parent_module[int(k)] = tgt_module + else: + setattr(parent_module, k, tgt_module) + # verify that all BN are replaced + bn_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)] + assert len(bn_list) == 0 + return root_module + + +def optimizer_to(optimizer, device): + for state in optimizer.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.to(device=device) + return optimizer diff --git a/r2sVLA/algos/DP/diffusion_policy/common/replay_buffer.py b/r2sVLA/algos/DP/diffusion_policy/common/replay_buffer.py new file mode 100644 index 0000000..344a540 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/replay_buffer.py @@ -0,0 +1,622 @@ +from typing import Union, Dict, Optional +import os +import math +import numbers +import zarr +import numcodecs +import numpy as np +from functools import cached_property + + +def check_chunks_compatible(chunks: tuple, shape: tuple): + assert len(shape) == len(chunks) + for c in chunks: + assert isinstance(c, numbers.Integral) + assert c > 0 + + +def rechunk_recompress_array(group, name, chunks=None, chunk_length=None, compressor=None, tmp_key="_temp"): + old_arr = group[name] + if chunks is None: + if chunk_length is not None: + chunks = (chunk_length, ) + old_arr.chunks[1:] + else: + chunks = old_arr.chunks + check_chunks_compatible(chunks, old_arr.shape) + + if compressor is None: + compressor = old_arr.compressor + + if (chunks == old_arr.chunks) and (compressor == old_arr.compressor): + # no change + return old_arr + + # rechunk recompress + group.move(name, tmp_key) + old_arr = group[tmp_key] + n_copied, n_skipped, n_bytes_copied = zarr.copy( + source=old_arr, + dest=group, + name=name, + chunks=chunks, + compressor=compressor, + ) + del group[tmp_key] + arr = group[name] + return arr + + +def get_optimal_chunks(shape, dtype, target_chunk_bytes=2e6, max_chunk_length=None): + """ + Common shapes + T,D + T,N,D + T,H,W,C + T,N,H,W,C + """ + itemsize = np.dtype(dtype).itemsize + # reversed + rshape = list(shape[::-1]) + if max_chunk_length is not None: + rshape[-1] = int(max_chunk_length) + split_idx = len(shape) - 1 + for i in range(len(shape) - 1): + this_chunk_bytes = itemsize * np.prod(rshape[:i]) + next_chunk_bytes = itemsize * np.prod(rshape[:i + 1]) + if (this_chunk_bytes <= target_chunk_bytes and next_chunk_bytes > target_chunk_bytes): + split_idx = i + + rchunks = rshape[:split_idx] + item_chunk_bytes = itemsize * np.prod(rshape[:split_idx]) + this_max_chunk_length = rshape[split_idx] + next_chunk_length = min(this_max_chunk_length, math.ceil(target_chunk_bytes / item_chunk_bytes)) + rchunks.append(next_chunk_length) + len_diff = len(shape) 
- len(rchunks) + rchunks.extend([1] * len_diff) + chunks = tuple(rchunks[::-1]) + # print(np.prod(chunks) * itemsize / target_chunk_bytes) + return chunks + + +class ReplayBuffer: + """ + Zarr-based temporal datastructure. + Assumes first dimension to be time. Only chunk in time dimension. + """ + + def __init__(self, root: Union[zarr.Group, Dict[str, dict]]): + """ + Dummy constructor. Use copy_from* and create_from* class methods instead. + """ + assert "data" in root + assert "meta" in root + assert "episode_ends" in root["meta"] + for key, value in root["data"].items(): + assert value.shape[0] == root["meta"]["episode_ends"][-1] + self.root = root + + # ============= create constructors =============== + @classmethod + def create_empty_zarr(cls, storage=None, root=None): + if root is None: + if storage is None: + storage = zarr.MemoryStore() + root = zarr.group(store=storage) + data = root.require_group("data", overwrite=False) + meta = root.require_group("meta", overwrite=False) + if "episode_ends" not in meta: + episode_ends = meta.zeros( + "episode_ends", + shape=(0, ), + dtype=np.int64, + compressor=None, + overwrite=False, + ) + return cls(root=root) + + @classmethod + def create_empty_numpy(cls): + root = { + "data": dict(), + "meta": { + "episode_ends": np.zeros((0, ), dtype=np.int64) + }, + } + return cls(root=root) + + @classmethod + def create_from_group(cls, group, **kwargs): + if "data" not in group: + # create from stratch + buffer = cls.create_empty_zarr(root=group, **kwargs) + else: + # already exist + buffer = cls(root=group, **kwargs) + return buffer + + @classmethod + def create_from_path(cls, zarr_path, mode="r", **kwargs): + """ + Open a on-disk zarr directly (for dataset larger than memory). + Slower. + """ + group = zarr.open(os.path.expanduser(zarr_path), mode) + return cls.create_from_group(group, **kwargs) + + # ============= copy constructors =============== + @classmethod + def copy_from_store( + cls, + src_store, + store=None, + keys=None, + chunks: Dict[str, tuple] = dict(), + compressors: Union[dict, str, numcodecs.abc.Codec] = dict(), + if_exists="replace", + **kwargs, + ): + """ + Load to memory. 
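+
+        Illustrative sketch (the path and keys here are hypothetical)::
+
+            >>> src = zarr.open("demos.zarr", "r")
+            >>> buf = ReplayBuffer.copy_from_store(
+            ...     src.store, keys=["img", "action"])  # store=None -> in-memory numpy backend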
+ """ + src_root = zarr.group(src_store) + root = None + if store is None: + # numpy backend + meta = dict() + for key, value in src_root["meta"].items(): + if len(value.shape) == 0: + meta[key] = np.array(value) + else: + meta[key] = value[:] + + if keys is None: + keys = src_root["data"].keys() + data = dict() + for key in keys: + arr = src_root["data"][key] + data[key] = arr[:] + + root = {"meta": meta, "data": data} + else: + root = zarr.group(store=store) + # copy without recompression + n_copied, n_skipped, n_bytes_copied = zarr.copy_store( + source=src_store, + dest=store, + source_path="/meta", + dest_path="/meta", + if_exists=if_exists, + ) + data_group = root.create_group("data", overwrite=True) + if keys is None: + keys = src_root["data"].keys() + for key in keys: + value = src_root["data"][key] + cks = cls._resolve_array_chunks(chunks=chunks, key=key, array=value) + cpr = cls._resolve_array_compressor(compressors=compressors, key=key, array=value) + if cks == value.chunks and cpr == value.compressor: + # copy without recompression + this_path = "/data/" + key + n_copied, n_skipped, n_bytes_copied = zarr.copy_store( + source=src_store, + dest=store, + source_path=this_path, + dest_path=this_path, + if_exists=if_exists, + ) + else: + # copy with recompression + n_copied, n_skipped, n_bytes_copied = zarr.copy( + source=value, + dest=data_group, + name=key, + chunks=cks, + compressor=cpr, + if_exists=if_exists, + ) + buffer = cls(root=root) + return buffer + + @classmethod + def copy_from_path( + cls, + zarr_path, + backend=None, + store=None, + keys=None, + chunks: Dict[str, tuple] = dict(), + compressors: Union[dict, str, numcodecs.abc.Codec] = dict(), + if_exists="replace", + **kwargs, + ): + """ + Copy a on-disk zarr to in-memory compressed. 
+ Recommended + """ + if backend == "numpy": + print("backend argument is deprecated!") + store = None + group = zarr.open(os.path.expanduser(zarr_path), "r") + return cls.copy_from_store( + src_store=group.store, + store=store, + keys=keys, + chunks=chunks, + compressors=compressors, + if_exists=if_exists, + **kwargs, + ) + + # ============= save methods =============== + def save_to_store( + self, + store, + chunks: Optional[Dict[str, tuple]] = dict(), + compressors: Union[str, numcodecs.abc.Codec, dict] = dict(), + if_exists="replace", + **kwargs, + ): + + root = zarr.group(store) + if self.backend == "zarr": + # recompression free copy + n_copied, n_skipped, n_bytes_copied = zarr.copy_store( + source=self.root.store, + dest=store, + source_path="/meta", + dest_path="/meta", + if_exists=if_exists, + ) + else: + meta_group = root.create_group("meta", overwrite=True) + # save meta, no chunking + for key, value in self.root["meta"].items(): + _ = meta_group.array(name=key, data=value, shape=value.shape, chunks=value.shape) + + # save data, chunk + data_group = root.create_group("data", overwrite=True) + for key, value in self.root["data"].items(): + cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value) + cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value) + if isinstance(value, zarr.Array): + if cks == value.chunks and cpr == value.compressor: + # copy without recompression + this_path = "/data/" + key + n_copied, n_skipped, n_bytes_copied = zarr.copy_store( + source=self.root.store, + dest=store, + source_path=this_path, + dest_path=this_path, + if_exists=if_exists, + ) + else: + # copy with recompression + n_copied, n_skipped, n_bytes_copied = zarr.copy( + source=value, + dest=data_group, + name=key, + chunks=cks, + compressor=cpr, + if_exists=if_exists, + ) + else: + # numpy + _ = data_group.array(name=key, data=value, chunks=cks, compressor=cpr) + return store + + def save_to_path( + self, + zarr_path, + chunks: Optional[Dict[str, tuple]] = dict(), + compressors: Union[str, numcodecs.abc.Codec, dict] = dict(), + if_exists="replace", + **kwargs, + ): + store = zarr.DirectoryStore(os.path.expanduser(zarr_path)) + return self.save_to_store(store, chunks=chunks, compressors=compressors, if_exists=if_exists, **kwargs) + + @staticmethod + def resolve_compressor(compressor="default"): + if compressor == "default": + compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.NOSHUFFLE) + elif compressor == "disk": + compressor = numcodecs.Blosc("zstd", clevel=5, shuffle=numcodecs.Blosc.BITSHUFFLE) + return compressor + + @classmethod + def _resolve_array_compressor(cls, compressors: Union[dict, str, numcodecs.abc.Codec], key, array): + # allows compressor to be explicitly set to None + cpr = "nil" + if isinstance(compressors, dict): + if key in compressors: + cpr = cls.resolve_compressor(compressors[key]) + elif isinstance(array, zarr.Array): + cpr = array.compressor + else: + cpr = cls.resolve_compressor(compressors) + # backup default + if cpr == "nil": + cpr = cls.resolve_compressor("default") + return cpr + + @classmethod + def _resolve_array_chunks(cls, chunks: Union[dict, tuple], key, array): + cks = None + if isinstance(chunks, dict): + if key in chunks: + cks = chunks[key] + elif isinstance(array, zarr.Array): + cks = array.chunks + elif isinstance(chunks, tuple): + cks = chunks + else: + raise TypeError(f"Unsupported chunks type {type(chunks)}") + # backup default + if cks is None: + cks = 
get_optimal_chunks(shape=array.shape, dtype=array.dtype) + # check + check_chunks_compatible(chunks=cks, shape=array.shape) + return cks + + # ============= properties ================= + @cached_property + def data(self): + return self.root["data"] + + @cached_property + def meta(self): + return self.root["meta"] + + def update_meta(self, data): + # sanitize data + np_data = dict() + for key, value in data.items(): + if isinstance(value, np.ndarray): + np_data[key] = value + else: + arr = np.array(value) + if arr.dtype == object: + raise TypeError(f"Invalid value type {type(value)}") + np_data[key] = arr + + meta_group = self.meta + if self.backend == "zarr": + for key, value in np_data.items(): + _ = meta_group.array( + name=key, + data=value, + shape=value.shape, + chunks=value.shape, + overwrite=True, + ) + else: + meta_group.update(np_data) + + return meta_group + + @property + def episode_ends(self): + return self.meta["episode_ends"] + + def get_episode_idxs(self): + import numba + + numba.jit(nopython=True) + + def _get_episode_idxs(episode_ends): + result = np.zeros((episode_ends[-1], ), dtype=np.int64) + for i in range(len(episode_ends)): + start = 0 + if i > 0: + start = episode_ends[i - 1] + end = episode_ends[i] + for idx in range(start, end): + result[idx] = i + return result + + return _get_episode_idxs(self.episode_ends) + + @property + def backend(self): + backend = "numpy" + if isinstance(self.root, zarr.Group): + backend = "zarr" + return backend + + # =========== dict-like API ============== + def __repr__(self) -> str: + if self.backend == "zarr": + return str(self.root.tree()) + else: + return super().__repr__() + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + def __getitem__(self, key): + return self.data[key] + + def __contains__(self, key): + return key in self.data + + # =========== our API ============== + @property + def n_steps(self): + if len(self.episode_ends) == 0: + return 0 + return self.episode_ends[-1] + + @property + def n_episodes(self): + return len(self.episode_ends) + + @property + def chunk_size(self): + if self.backend == "zarr": + return next(iter(self.data.arrays()))[-1].chunks[0] + return None + + @property + def episode_lengths(self): + ends = self.episode_ends[:] + ends = np.insert(ends, 0, 0) + lengths = np.diff(ends) + return lengths + + def add_episode( + self, + data: Dict[str, np.ndarray], + chunks: Optional[Dict[str, tuple]] = dict(), + compressors: Union[str, numcodecs.abc.Codec, dict] = dict(), + ): + assert len(data) > 0 + is_zarr = self.backend == "zarr" + + curr_len = self.n_steps + episode_length = None + for key, value in data.items(): + assert len(value.shape) >= 1 + if episode_length is None: + episode_length = len(value) + else: + assert episode_length == len(value) + new_len = curr_len + episode_length + + for key, value in data.items(): + new_shape = (new_len, ) + value.shape[1:] + # create array + if key not in self.data: + if is_zarr: + cks = self._resolve_array_chunks(chunks=chunks, key=key, array=value) + cpr = self._resolve_array_compressor(compressors=compressors, key=key, array=value) + arr = self.data.zeros( + name=key, + shape=new_shape, + chunks=cks, + dtype=value.dtype, + compressor=cpr, + ) + else: + # copy data to prevent modify + arr = np.zeros(shape=new_shape, dtype=value.dtype) + self.data[key] = arr + else: + arr = self.data[key] + assert value.shape[1:] == arr.shape[1:] + # same method for both zarr and numpy 
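+                # (zarr.Array.resize takes only the new shape, while
+                # np.ndarray.resize additionally needs refcheck=False to
+                # resize an array that may be referenced elsewhere)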
+ if is_zarr: + arr.resize(new_shape) + else: + arr.resize(new_shape, refcheck=False) + # copy data + arr[-value.shape[0]:] = value + + # append to episode ends + episode_ends = self.episode_ends + if is_zarr: + episode_ends.resize(episode_ends.shape[0] + 1) + else: + episode_ends.resize(episode_ends.shape[0] + 1, refcheck=False) + episode_ends[-1] = new_len + + # rechunk + if is_zarr: + if episode_ends.chunks[0] < episode_ends.shape[0]: + rechunk_recompress_array( + self.meta, + "episode_ends", + chunk_length=int(episode_ends.shape[0] * 1.5), + ) + + def drop_episode(self): + is_zarr = self.backend == "zarr" + episode_ends = self.episode_ends[:].copy() + assert len(episode_ends) > 0 + start_idx = 0 + if len(episode_ends) > 1: + start_idx = episode_ends[-2] + for key, value in self.data.items(): + new_shape = (start_idx, ) + value.shape[1:] + if is_zarr: + value.resize(new_shape) + else: + value.resize(new_shape, refcheck=False) + if is_zarr: + self.episode_ends.resize(len(episode_ends) - 1) + else: + self.episode_ends.resize(len(episode_ends) - 1, refcheck=False) + + def pop_episode(self): + assert self.n_episodes > 0 + episode = self.get_episode(self.n_episodes - 1, copy=True) + self.drop_episode() + return episode + + def extend(self, data): + self.add_episode(data) + + def get_episode(self, idx, copy=False): + idx = list(range(len(self.episode_ends)))[idx] + start_idx = 0 + if idx > 0: + start_idx = self.episode_ends[idx - 1] + end_idx = self.episode_ends[idx] + result = self.get_steps_slice(start_idx, end_idx, copy=copy) + return result + + def get_episode_slice(self, idx): + start_idx = 0 + if idx > 0: + start_idx = self.episode_ends[idx - 1] + end_idx = self.episode_ends[idx] + return slice(start_idx, end_idx) + + def get_steps_slice(self, start, stop, step=None, copy=False): + _slice = slice(start, stop, step) + + result = dict() + for key, value in self.data.items(): + x = value[_slice] + if copy and isinstance(value, np.ndarray): + x = x.copy() + result[key] = x + return result + + # =========== chunking ============= + def get_chunks(self) -> dict: + assert self.backend == "zarr" + chunks = dict() + for key, value in self.data.items(): + chunks[key] = value.chunks + return chunks + + def set_chunks(self, chunks: dict): + assert self.backend == "zarr" + for key, value in chunks.items(): + if key in self.data: + arr = self.data[key] + if value != arr.chunks: + check_chunks_compatible(chunks=value, shape=arr.shape) + rechunk_recompress_array(self.data, key, chunks=value) + + def get_compressors(self) -> dict: + assert self.backend == "zarr" + compressors = dict() + for key, value in self.data.items(): + compressors[key] = value.compressor + return compressors + + def set_compressors(self, compressors: dict): + assert self.backend == "zarr" + for key, value in compressors.items(): + if key in self.data: + arr = self.data[key] + compressor = self.resolve_compressor(value) + if compressor != arr.compressor: + rechunk_recompress_array(self.data, key, compressor=compressor) diff --git a/r2sVLA/algos/DP/diffusion_policy/common/robomimic_config_util.py b/r2sVLA/algos/DP/diffusion_policy/common/robomimic_config_util.py new file mode 100644 index 0000000..633f4f8 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/robomimic_config_util.py @@ -0,0 +1,41 @@ +from omegaconf import OmegaConf +from robomimic.config import config_factory +import robomimic.scripts.generate_paper_configs as gpc +from robomimic.scripts.generate_paper_configs import ( + modify_config_for_default_image_exp, + 
modify_config_for_default_low_dim_exp,
+    modify_config_for_dataset,
+)
+
+
+def get_robomimic_config(algo_name="bc_rnn", hdf5_type="low_dim", task_name="square", dataset_type="ph"):
+    base_dataset_dir = "/tmp/null"
+    filter_key = None
+
+    # decide whether to use low-dim or image training defaults
+    modifier_for_obs = modify_config_for_default_image_exp
+    if hdf5_type in ["low_dim", "low_dim_sparse", "low_dim_dense"]:
+        modifier_for_obs = modify_config_for_default_low_dim_exp
+
+    algo_config_name = "bc" if algo_name == "bc_rnn" else algo_name
+    config = config_factory(algo_name=algo_config_name)
+    # turn into default config for observation modalities (e.g. low-dim or rgb)
+    config = modifier_for_obs(config)
+    # add in config based on the dataset
+    config = modify_config_for_dataset(
+        config=config,
+        task_name=task_name,
+        dataset_type=dataset_type,
+        hdf5_type=hdf5_type,
+        base_dataset_dir=base_dataset_dir,
+        filter_key=filter_key,
+    )
+    # add in algo hyperparameters based on the dataset
+    algo_config_modifier = getattr(gpc, f"modify_{algo_name}_config_for_dataset")
+    config = algo_config_modifier(
+        config=config,
+        task_name=task_name,
+        dataset_type=dataset_type,
+        hdf5_type=hdf5_type,
+    )
+    return config
diff --git a/r2sVLA/algos/DP/diffusion_policy/common/robomimic_util.py b/r2sVLA/algos/DP/diffusion_policy/common/robomimic_util.py
new file mode 100644
index 0000000..c655172
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/common/robomimic_util.py
@@ -0,0 +1,170 @@
+import numpy as np
+import copy
+
+import h5py
+import robomimic.utils.obs_utils as ObsUtils
+import robomimic.utils.file_utils as FileUtils
+import robomimic.utils.env_utils as EnvUtils
+from scipy.spatial.transform import Rotation
+
+from robomimic.config import config_factory
+
+
+class RobomimicAbsoluteActionConverter:
+
+    def __init__(self, dataset_path, algo_name="bc"):
+        # default BC config
+        config = config_factory(algo_name=algo_name)
+
+        # read config to set up metadata for observation modalities (e.g. detecting rgb observations);
+        # must be run before creating the dataset
+        ObsUtils.initialize_obs_utils_with_config(config)
+
+        env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path)
+        abs_env_meta = copy.deepcopy(env_meta)
+        abs_env_meta["env_kwargs"]["controller_configs"]["control_delta"] = False
+
+        env = EnvUtils.create_env_from_metadata(
+            env_meta=env_meta,
+            render=False,
+            render_offscreen=False,
+            use_image_obs=False,
+        )
+        assert len(env.env.robots) in (1, 2)
+
+        abs_env = EnvUtils.create_env_from_metadata(
+            env_meta=abs_env_meta,
+            render=False,
+            render_offscreen=False,
+            use_image_obs=False,
+        )
+        assert not abs_env.env.robots[0].controller.use_delta
+
+        self.env = env
+        self.abs_env = abs_env
+        self.file = h5py.File(dataset_path, "r")
+
+    def __len__(self):
+        return len(self.file["data"])
+
+    def convert_actions(self, states: np.ndarray, actions: np.ndarray) -> np.ndarray:
+        """
+        Given a state and delta-action sequence, generate the equivalent goal
+        position and orientation for each step, keeping the original gripper
+        action intact.
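+
+        Shape sketch (illustrative, single-arm case): states is (N, D_state)
+        and actions is (N, 7); the result is absolute actions of shape (N, 7),
+        laid out per step as [goal_pos(3), goal_rotvec(3), gripper(1)].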
+ """ + # in case of multi robot + # reshape (N,14) to (N,2,7) + # or (N,7) to (N,1,7) + stacked_actions = actions.reshape(*actions.shape[:-1], -1, 7) + + env = self.env + # generate abs actions + action_goal_pos = np.zeros(stacked_actions.shape[:-1] + (3, ), dtype=stacked_actions.dtype) + action_goal_ori = np.zeros(stacked_actions.shape[:-1] + (3, ), dtype=stacked_actions.dtype) + action_gripper = stacked_actions[..., [-1]] + for i in range(len(states)): + _ = env.reset_to({"states": states[i]}) + + # taken from robot_env.py L#454 + for idx, robot in enumerate(env.env.robots): + # run controller goal generator + robot.control(stacked_actions[i, idx], policy_step=True) + + # read pos and ori from robots + controller = robot.controller + action_goal_pos[i, idx] = controller.goal_pos + action_goal_ori[i, idx] = Rotation.from_matrix(controller.goal_ori).as_rotvec() + + stacked_abs_actions = np.concatenate([action_goal_pos, action_goal_ori, action_gripper], axis=-1) + abs_actions = stacked_abs_actions.reshape(actions.shape) + return abs_actions + + def convert_idx(self, idx): + file = self.file + demo = file[f"data/demo_{idx}"] + # input + states = demo["states"][:] + actions = demo["actions"][:] + + # generate abs actions + abs_actions = self.convert_actions(states, actions) + return abs_actions + + def convert_and_eval_idx(self, idx): + env = self.env + abs_env = self.abs_env + file = self.file + # first step have high error for some reason, not representative + eval_skip_steps = 1 + + demo = file[f"data/demo_{idx}"] + # input + states = demo["states"][:] + actions = demo["actions"][:] + + # generate abs actions + abs_actions = self.convert_actions(states, actions) + + # verify + robot0_eef_pos = demo["obs"]["robot0_eef_pos"][:] + robot0_eef_quat = demo["obs"]["robot0_eef_quat"][:] + + delta_error_info = self.evaluate_rollout_error( + env, + states, + actions, + robot0_eef_pos, + robot0_eef_quat, + metric_skip_steps=eval_skip_steps, + ) + abs_error_info = self.evaluate_rollout_error( + abs_env, + states, + abs_actions, + robot0_eef_pos, + robot0_eef_quat, + metric_skip_steps=eval_skip_steps, + ) + + info = {"delta_max_error": delta_error_info, "abs_max_error": abs_error_info} + return abs_actions, info + + @staticmethod + def evaluate_rollout_error(env, states, actions, robot0_eef_pos, robot0_eef_quat, metric_skip_steps=1): + # first step have high error for some reason, not representative + + # evaluate abs actions + rollout_next_states = list() + rollout_next_eef_pos = list() + rollout_next_eef_quat = list() + obs = env.reset_to({"states": states[0]}) + for i in range(len(states)): + obs = env.reset_to({"states": states[i]}) + obs, reward, done, info = env.step(actions[i]) + obs = env.get_observation() + rollout_next_states.append(env.get_state()["states"]) + rollout_next_eef_pos.append(obs["robot0_eef_pos"]) + rollout_next_eef_quat.append(obs["robot0_eef_quat"]) + rollout_next_states = np.array(rollout_next_states) + rollout_next_eef_pos = np.array(rollout_next_eef_pos) + rollout_next_eef_quat = np.array(rollout_next_eef_quat) + + next_state_diff = states[1:] - rollout_next_states[:-1] + max_next_state_diff = np.max(np.abs(next_state_diff[metric_skip_steps:])) + + next_eef_pos_diff = robot0_eef_pos[1:] - rollout_next_eef_pos[:-1] + next_eef_pos_dist = np.linalg.norm(next_eef_pos_diff, axis=-1) + max_next_eef_pos_dist = next_eef_pos_dist[metric_skip_steps:].max() + + next_eef_rot_diff = (Rotation.from_quat(robot0_eef_quat[1:]) * + Rotation.from_quat(rollout_next_eef_quat[:-1]).inv()) + 
next_eef_rot_dist = next_eef_rot_diff.magnitude() + max_next_eef_rot_dist = next_eef_rot_dist[metric_skip_steps:].max() + + info = { + "state": max_next_state_diff, + "pos": max_next_eef_pos_dist, + "rot": max_next_eef_rot_dist, + } + return info diff --git a/r2sVLA/algos/DP/diffusion_policy/common/sampler.py b/r2sVLA/algos/DP/diffusion_policy/common/sampler.py new file mode 100644 index 0000000..8511617 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/common/sampler.py @@ -0,0 +1,164 @@ +from typing import Optional +import numpy as np +import numba +from diffusion_policy.common.replay_buffer import ReplayBuffer + + +@numba.jit(nopython=True) +def create_indices( + episode_ends: np.ndarray, + sequence_length: int, + episode_mask: np.ndarray, + pad_before: int = 0, + pad_after: int = 0, + debug: bool = True, +) -> np.ndarray: + episode_mask.shape == episode_ends.shape + pad_before = min(max(pad_before, 0), sequence_length - 1) + pad_after = min(max(pad_after, 0), sequence_length - 1) + + indices = list() + for i in range(len(episode_ends)): + if not episode_mask[i]: + # skip episode + continue + start_idx = 0 + if i > 0: + start_idx = episode_ends[i - 1] + end_idx = episode_ends[i] + episode_length = end_idx - start_idx + + min_start = -pad_before + max_start = episode_length - sequence_length + pad_after + + # range stops one idx before end + for idx in range(min_start, max_start + 1): + buffer_start_idx = max(idx, 0) + start_idx + buffer_end_idx = min(idx + sequence_length, episode_length) + start_idx + start_offset = buffer_start_idx - (idx + start_idx) + end_offset = (idx + sequence_length + start_idx) - buffer_end_idx + sample_start_idx = 0 + start_offset + sample_end_idx = sequence_length - end_offset + if debug: + assert start_offset >= 0 + assert end_offset >= 0 + assert (sample_end_idx - sample_start_idx) == (buffer_end_idx - buffer_start_idx) + indices.append([buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx]) + indices = np.array(indices) + return indices + + +def get_val_mask(n_episodes, val_ratio, seed=0): + val_mask = np.zeros(n_episodes, dtype=bool) + if val_ratio <= 0: + return val_mask + + # have at least 1 episode for validation, and at least 1 episode for train + n_val = min(max(1, round(n_episodes * val_ratio)), n_episodes - 1) + rng = np.random.default_rng(seed=seed) + # val_idxs = rng.choice(n_episodes, size=n_val, replace=False) + val_idxs = -1 + val_mask[val_idxs] = True + return val_mask + + +def downsample_mask(mask, max_n, seed=0): + # subsample training data + train_mask = mask + if (max_n is not None) and (np.sum(train_mask) > max_n): + n_train = int(max_n) + curr_train_idxs = np.nonzero(train_mask)[0] + rng = np.random.default_rng(seed=seed) + train_idxs_idx = rng.choice(len(curr_train_idxs), size=n_train, replace=False) + train_idxs = curr_train_idxs[train_idxs_idx] + train_mask = np.zeros_like(train_mask) + train_mask[train_idxs] = True + assert np.sum(train_mask) == n_train + return train_mask + + +class SequenceSampler: + + def __init__( + self, + replay_buffer: ReplayBuffer, + sequence_length: int, + pad_before: int = 0, + pad_after: int = 0, + keys=None, + key_first_k=dict(), + episode_mask: Optional[np.ndarray] = None, + ): + """ + key_first_k: dict str: int + Only take first k data from these keys (to improve perf) + """ + + super().__init__() + assert sequence_length >= 1 + if keys is None: + keys = list(replay_buffer.keys()) + + episode_ends = replay_buffer.episode_ends[:] + if episode_mask is None: + episode_mask = 
np.ones(episode_ends.shape, dtype=bool)
+
+        if np.any(episode_mask):
+            indices = create_indices(
+                episode_ends,
+                sequence_length=sequence_length,
+                pad_before=pad_before,
+                pad_after=pad_after,
+                episode_mask=episode_mask,
+            )
+        else:
+            indices = np.zeros((0, 4), dtype=np.int64)
+
+        # (buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx)
+        self.indices = indices
+        self.keys = list(keys)  # prevent OmegaConf list performance problem
+        self.sequence_length = sequence_length
+        self.replay_buffer = replay_buffer
+        self.key_first_k = key_first_k
+
+    def __len__(self):
+        return len(self.indices)
+
+    def sample_sequence(self, idx):
+        buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx = (self.indices[idx])
+        result = dict()
+        for key in self.keys:
+            input_arr = self.replay_buffer[key]
+            # performance optimization, avoid small allocation if possible
+            if key not in self.key_first_k:
+                sample = input_arr[buffer_start_idx:buffer_end_idx]
+            else:
+                # performance optimization, only load used obs steps
+                n_data = buffer_end_idx - buffer_start_idx
+                k_data = min(self.key_first_k[key], n_data)
+                # fill value with NaN to catch bugs
+                # the non-loaded region should never be used
+                sample = np.full(
+                    (n_data, ) + input_arr.shape[1:],
+                    fill_value=np.nan,
+                    dtype=input_arr.dtype,
+                )
+                try:
+                    sample[:k_data] = input_arr[buffer_start_idx:buffer_start_idx + k_data]
+                except Exception as e:
+                    import pdb

+                    pdb.set_trace()
+            data = sample
+            if (sample_start_idx > 0) or (sample_end_idx < self.sequence_length):
+                data = np.zeros(
+                    shape=(self.sequence_length, ) + input_arr.shape[1:],
+                    dtype=input_arr.dtype,
+                )
+                if sample_start_idx > 0:
+                    data[:sample_start_idx] = sample[0]
+                if sample_end_idx < self.sequence_length:
+                    data[sample_end_idx:] = sample[-1]
+                data[sample_start_idx:sample_end_idx] = sample
+            result[key] = data
+        return result
diff --git a/r2sVLA/algos/DP/diffusion_policy/common/timestamp_accumulator.py b/r2sVLA/algos/DP/diffusion_policy/common/timestamp_accumulator.py
new file mode 100644
index 0000000..b3be297
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/common/timestamp_accumulator.py
@@ -0,0 +1,220 @@
+from typing import List, Tuple, Optional, Dict
+import math
+import numpy as np
+
+
+def get_accumulate_timestamp_idxs(
+    timestamps: List[float],
+    start_time: float,
+    dt: float,
+    eps: float = 1e-5,
+    next_global_idx: Optional[int] = 0,
+    allow_negative=False,
+) -> Tuple[List[int], List[int], int]:
+    """
+    For each dt window, choose the first timestamp in the window.
+    Assumes timestamps are sorted. One timestamp might be chosen multiple times due to dropped frames.
+    next_global_idx should start at 0 normally; subsequent calls should pass in the returned next_global_idx.
+    However, when overwriting previous values is desired, set next_global_idx to None.
+
+    Returns:
+        local_idxs: which index in the given timestamps array to choose from
+        global_idxs: the global index of each chosen timestamp
+        next_global_idx: used for the next call.
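+
+    Illustrative sketch (assumes dt=0.1 and evenly spaced timestamps)::
+
+        >>> get_accumulate_timestamp_idxs(
+        ...     [0.0, 0.1, 0.2], start_time=0.0, dt=0.1)
+        ([0, 1, 2], [0, 1, 2], 3)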
+ """ + local_idxs = list() + global_idxs = list() + for local_idx, ts in enumerate(timestamps): + # add eps * dt to timestamps so that when ts == start_time + k * dt + # is always recorded as kth element (avoiding floating point errors) + global_idx = math.floor((ts - start_time) / dt + eps) + if (not allow_negative) and (global_idx < 0): + continue + if next_global_idx is None: + next_global_idx = global_idx + + n_repeats = max(0, global_idx - next_global_idx + 1) + for i in range(n_repeats): + local_idxs.append(local_idx) + global_idxs.append(next_global_idx + i) + next_global_idx += n_repeats + return local_idxs, global_idxs, next_global_idx + + +def align_timestamps( + timestamps: List[float], + target_global_idxs: List[int], + start_time: float, + dt: float, + eps: float = 1e-5, +): + if isinstance(target_global_idxs, np.ndarray): + target_global_idxs = target_global_idxs.tolist() + assert len(target_global_idxs) > 0 + + local_idxs, global_idxs, _ = get_accumulate_timestamp_idxs( + timestamps=timestamps, + start_time=start_time, + dt=dt, + eps=eps, + next_global_idx=target_global_idxs[0], + allow_negative=True, + ) + if len(global_idxs) > len(target_global_idxs): + # if more steps available, truncate + global_idxs = global_idxs[:len(target_global_idxs)] + local_idxs = local_idxs[:len(target_global_idxs)] + + if len(global_idxs) == 0: + import pdb + + pdb.set_trace() + + for i in range(len(target_global_idxs) - len(global_idxs)): + # if missing, repeat + local_idxs.append(len(timestamps) - 1) + global_idxs.append(global_idxs[-1] + 1) + assert global_idxs == target_global_idxs + assert len(local_idxs) == len(global_idxs) + return local_idxs + + +class TimestampObsAccumulator: + + def __init__(self, start_time: float, dt: float, eps: float = 1e-5): + self.start_time = start_time + self.dt = dt + self.eps = eps + self.obs_buffer = dict() + self.timestamp_buffer = None + self.next_global_idx = 0 + + def __len__(self): + return self.next_global_idx + + @property + def data(self): + if self.timestamp_buffer is None: + return dict() + result = dict() + for key, value in self.obs_buffer.items(): + result[key] = value[:len(self)] + return result + + @property + def actual_timestamps(self): + if self.timestamp_buffer is None: + return np.array([]) + return self.timestamp_buffer[:len(self)] + + @property + def timestamps(self): + if self.timestamp_buffer is None: + return np.array([]) + return self.start_time + np.arange(len(self)) * self.dt + + def put(self, data: Dict[str, np.ndarray], timestamps: np.ndarray): + """ + data: + key: T,* + """ + + local_idxs, global_idxs, self.next_global_idx = get_accumulate_timestamp_idxs( + timestamps=timestamps, + start_time=self.start_time, + dt=self.dt, + eps=self.eps, + next_global_idx=self.next_global_idx, + ) + + if len(global_idxs) > 0: + if self.timestamp_buffer is None: + # first allocation + self.obs_buffer = dict() + for key, value in data.items(): + self.obs_buffer[key] = np.zeros_like(value) + self.timestamp_buffer = np.zeros((len(timestamps), ), dtype=np.float64) + + this_max_size = global_idxs[-1] + 1 + if this_max_size > len(self.timestamp_buffer): + # reallocate + new_size = max(this_max_size, len(self.timestamp_buffer) * 2) + for key in list(self.obs_buffer.keys()): + new_shape = (new_size, ) + self.obs_buffer[key].shape[1:] + self.obs_buffer[key] = np.resize(self.obs_buffer[key], new_shape) + self.timestamp_buffer = np.resize(self.timestamp_buffer, (new_size)) + + # write data + for key, value in self.obs_buffer.items(): + value[global_idxs] 
= data[key][local_idxs] + self.timestamp_buffer[global_idxs] = timestamps[local_idxs] + + +class TimestampActionAccumulator: + + def __init__(self, start_time: float, dt: float, eps: float = 1e-5): + """ + Different from Obs accumulator, the action accumulator + allows overwriting previous values. + """ + self.start_time = start_time + self.dt = dt + self.eps = eps + self.action_buffer = None + self.timestamp_buffer = None + self.size = 0 + + def __len__(self): + return self.size + + @property + def actions(self): + if self.action_buffer is None: + return np.array([]) + return self.action_buffer[:len(self)] + + @property + def actual_timestamps(self): + if self.timestamp_buffer is None: + return np.array([]) + return self.timestamp_buffer[:len(self)] + + @property + def timestamps(self): + if self.timestamp_buffer is None: + return np.array([]) + return self.start_time + np.arange(len(self)) * self.dt + + def put(self, actions: np.ndarray, timestamps: np.ndarray): + """ + Note: timestamps is the time when the action will be issued, + not when the action will be completed (target_timestamp) + """ + + local_idxs, global_idxs, _ = get_accumulate_timestamp_idxs( + timestamps=timestamps, + start_time=self.start_time, + dt=self.dt, + eps=self.eps, + # allows overwriting previous actions + next_global_idx=None, + ) + + if len(global_idxs) > 0: + if self.timestamp_buffer is None: + # first allocation + self.action_buffer = np.zeros_like(actions) + self.timestamp_buffer = np.zeros((len(actions), ), dtype=np.float64) + + this_max_size = global_idxs[-1] + 1 + if this_max_size > len(self.timestamp_buffer): + # reallocate + new_size = max(this_max_size, len(self.timestamp_buffer) * 2) + new_shape = (new_size, ) + self.action_buffer.shape[1:] + self.action_buffer = np.resize(self.action_buffer, new_shape) + self.timestamp_buffer = np.resize(self.timestamp_buffer, (new_size, )) + + # potentially rewrite old data (as expected) + self.action_buffer[global_idxs] = actions[local_idxs] + self.timestamp_buffer[global_idxs] = timestamps[local_idxs] + self.size = max(self.size, this_max_size) diff --git a/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_14.yaml b/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_14.yaml new file mode 100644 index 0000000..7f8cde9 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_14.yaml @@ -0,0 +1,155 @@ +defaults: + - _self_ + - task: default_task_14 + +name: robot_${task.name} +_target_: diffusion_policy.workspace.robotworkspace.RobotWorkspace + +task_name: ${task.name} +shape_meta: ${task.shape_meta} +exp_name: "default" + +horizon: 8 +n_obs_steps: 3 +n_action_steps: 6 +n_latency_steps: 0 +dataset_obs_steps: ${n_obs_steps} +past_action_visible: False +keypoint_visible_rate: 1.0 +obs_as_global_cond: True + +policy: + _target_: diffusion_policy.policy.diffusion_unet_image_policy.DiffusionUnetImagePolicy + + shape_meta: ${shape_meta} + + noise_scheduler: + _target_: diffusers.schedulers.scheduling_ddpm.DDPMScheduler + num_train_timesteps: 100 + beta_start: 0.0001 + beta_end: 0.02 + beta_schedule: squaredcos_cap_v2 + variance_type: fixed_small # Yilun's paper uses fixed_small_log instead, but easy to cause Nan + clip_sample: True # required when predict_epsilon=False + prediction_type: epsilon # or sample + + obs_encoder: + _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder + shape_meta: ${shape_meta} + rgb_model: + _target_: diffusion_policy.model.vision.model_getter.get_resnet + name: resnet18 + weights: null + resize_shape: 
null + crop_shape: null + # constant center crop + random_crop: True + use_group_norm: True + share_rgb_model: False + imagenet_norm: True + + horizon: ${horizon} + n_action_steps: ${eval:'${n_action_steps}+${n_latency_steps}'} + n_obs_steps: ${n_obs_steps} + num_inference_steps: 100 + obs_as_global_cond: ${obs_as_global_cond} + # crop_shape: null + diffusion_step_embed_dim: 128 + # down_dims: [512, 1024, 2048] + down_dims: [256, 512, 1024] + kernel_size: 5 + n_groups: 8 + cond_predict_scale: True + + # scheduler.step params + # predict_epsilon: True + +ema: + _target_: diffusion_policy.model.diffusion.ema_model.EMAModel + update_after_step: 0 + inv_gamma: 1.0 + power: 0.75 + min_value: 0.0 + max_value: 0.9999 + +dataloader: + batch_size: 128 + num_workers: 0 + shuffle: True + pin_memory: True + persistent_workers: False + +val_dataloader: + batch_size: 128 + num_workers: 0 + shuffle: False + pin_memory: True + persistent_workers: False + +optimizer: + _target_: torch.optim.AdamW + lr: 1.0e-4 + betas: [0.95, 0.999] + eps: 1.0e-8 + weight_decay: 1.0e-6 + +training: + device: "cuda:0" + seed: 42 + debug: False + resume: True + # optimization + lr_scheduler: cosine + lr_warmup_steps: 500 + num_epochs: 600 + gradient_accumulate_every: 1 + # EMA destroys performance when used with BatchNorm + # replace BatchNorm with GroupNorm. + use_ema: True + freeze_encoder: False + # training loop control + # in epochs + rollout_every: 50 + checkpoint_every: 300 + val_every: 1 + sample_every: 5 + # steps per epoch + max_train_steps: null + max_val_steps: null + # misc + tqdm_interval_sec: 1.0 + +logging: + project: diffusion_policy_debug + resume: True + mode: online + name: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + tags: ["${name}", "${task_name}", "${exp_name}"] + id: null + group: null + +checkpoint: + topk: + monitor_key: test_mean_score + mode: max + k: 5 + format_str: 'epoch={epoch:04d}-test_mean_score={test_mean_score:.3f}.ckpt' + save_last_ckpt: True + save_last_snapshot: False + +multi_run: + run_dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + wandb_name_base: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + +hydra: + job: + override_dirname: ${name} + run: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + sweep: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + subdir: ${hydra.job.num} + +setting: null +expert_data_num: null +head_camera_type: null \ No newline at end of file diff --git a/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_16.yaml b/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_16.yaml new file mode 100644 index 0000000..4ea76f5 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_16.yaml @@ -0,0 +1,155 @@ +defaults: + - _self_ + - task: default_task_16 + +name: robot_${task.name} +_target_: diffusion_policy.workspace.robotworkspace.RobotWorkspace + +task_name: ${task.name} +shape_meta: ${task.shape_meta} +exp_name: "default" + +horizon: 8 +n_obs_steps: 3 +n_action_steps: 6 +n_latency_steps: 0 +dataset_obs_steps: ${n_obs_steps} +past_action_visible: False +keypoint_visible_rate: 1.0 +obs_as_global_cond: True + +policy: + _target_: diffusion_policy.policy.diffusion_unet_image_policy.DiffusionUnetImagePolicy + + shape_meta: ${shape_meta} + + noise_scheduler: + _target_: diffusers.schedulers.scheduling_ddpm.DDPMScheduler + num_train_timesteps: 100 + beta_start: 0.0001 + beta_end: 0.02 + beta_schedule: squaredcos_cap_v2 + variance_type: fixed_small # Yilun's paper uses fixed_small_log 
instead, but easy to cause Nan + clip_sample: True # required when predict_epsilon=False + prediction_type: epsilon # or sample + + obs_encoder: + _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder + shape_meta: ${shape_meta} + rgb_model: + _target_: diffusion_policy.model.vision.model_getter.get_resnet + name: resnet18 + weights: null + resize_shape: null + crop_shape: null + # constant center crop + random_crop: True + use_group_norm: True + share_rgb_model: False + imagenet_norm: True + + horizon: ${horizon} + n_action_steps: ${eval:'${n_action_steps}+${n_latency_steps}'} + n_obs_steps: ${n_obs_steps} + num_inference_steps: 100 + obs_as_global_cond: ${obs_as_global_cond} + # crop_shape: null + diffusion_step_embed_dim: 128 + # down_dims: [512, 1024, 2048] + down_dims: [256, 512, 1024] + kernel_size: 5 + n_groups: 8 + cond_predict_scale: True + + # scheduler.step params + # predict_epsilon: True + +ema: + _target_: diffusion_policy.model.diffusion.ema_model.EMAModel + update_after_step: 0 + inv_gamma: 1.0 + power: 0.75 + min_value: 0.0 + max_value: 0.9999 + +dataloader: + batch_size: 128 + num_workers: 0 + shuffle: True + pin_memory: True + persistent_workers: False + +val_dataloader: + batch_size: 128 + num_workers: 0 + shuffle: False + pin_memory: True + persistent_workers: False + +optimizer: + _target_: torch.optim.AdamW + lr: 1.0e-4 + betas: [0.95, 0.999] + eps: 1.0e-8 + weight_decay: 1.0e-6 + +training: + device: "cuda:0" + seed: 42 + debug: False + resume: True + # optimization + lr_scheduler: cosine + lr_warmup_steps: 500 + num_epochs: 600 + gradient_accumulate_every: 1 + # EMA destroys performance when used with BatchNorm + # replace BatchNorm with GroupNorm. + use_ema: True + freeze_encoder: False + # training loop control + # in epochs + rollout_every: 50 + checkpoint_every: 300 + val_every: 1 + sample_every: 5 + # steps per epoch + max_train_steps: null + max_val_steps: null + # misc + tqdm_interval_sec: 1.0 + +logging: + project: diffusion_policy_debug + resume: True + mode: online + name: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + tags: ["${name}", "${task_name}", "${exp_name}"] + id: null + group: null + +checkpoint: + topk: + monitor_key: test_mean_score + mode: max + k: 5 + format_str: 'epoch={epoch:04d}-test_mean_score={test_mean_score:.3f}.ckpt' + save_last_ckpt: True + save_last_snapshot: False + +multi_run: + run_dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + wandb_name_base: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + +hydra: + job: + override_dirname: ${name} + run: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + sweep: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + subdir: ${hydra.job.num} + +setting: null +expert_data_num: null +head_camera_type: null \ No newline at end of file diff --git a/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_8.yaml b/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_8.yaml new file mode 100644 index 0000000..4da1254 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/config/robot_dp_8.yaml @@ -0,0 +1,156 @@ +defaults: + - _self_ + - task: default_task_8 + +name: robot_${task.name} +_target_: diffusion_policy.workspace.robotworkspace.RobotWorkspace + +task_name: ${task.name} +shape_meta: ${task.shape_meta} +exp_name: "default" + +horizon: 8 +n_obs_steps: 3 +n_action_steps: 6 +n_latency_steps: 0 +dataset_obs_steps: ${n_obs_steps} +past_action_visible: False +keypoint_visible_rate: 1.0 +obs_as_global_cond: True + 
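+# Receding-horizon reading of the settings above (an illustrative note, not
+# extra config): the policy conditions on the last n_obs_steps=3 observations,
+# predicts horizon=8 future actions, and executes the first n_action_steps=6
+# before re-planning (about 0.6 s per re-plan at the task's 10 fps).
+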
+policy:
+ _target_: diffusion_policy.policy.diffusion_unet_image_policy.DiffusionUnetImagePolicy
+
+ shape_meta: ${shape_meta}
+
+ noise_scheduler:
+ _target_: diffusers.schedulers.scheduling_ddpm.DDPMScheduler
+ num_train_timesteps: 100
+ beta_start: 0.0001
+ beta_end: 0.02
+ beta_schedule: squaredcos_cap_v2
+ variance_type: fixed_small # Yilun's paper uses fixed_small_log instead, but it can easily cause NaN
+ clip_sample: True # required when predict_epsilon=False
+ prediction_type: epsilon # or sample
+
+ obs_encoder:
+ _target_: diffusion_policy.model.vision.multi_image_obs_encoder.MultiImageObsEncoder
+ shape_meta: ${shape_meta}
+ rgb_model:
+ _target_: diffusion_policy.model.vision.model_getter.get_resnet
+ name: resnet18
+ weights: null
+ resize_shape: null
+ crop_shape: null
+ # constant center crop
+ random_crop: True
+ use_group_norm: True
+ share_rgb_model: False
+ imagenet_norm: True
+
+ horizon: ${horizon}
+ n_action_steps: ${eval:'${n_action_steps}+${n_latency_steps}'}
+ n_obs_steps: ${n_obs_steps}
+ num_inference_steps: 100
+ obs_as_global_cond: ${obs_as_global_cond}
+ # crop_shape: null
+ diffusion_step_embed_dim: 128
+ # down_dims: [512, 1024, 2048]
+ down_dims: [256, 512, 1024]
+ kernel_size: 5
+ n_groups: 8
+ cond_predict_scale: True
+
+ # scheduler.step params
+ # predict_epsilon: True
+
+ema:
+ _target_: diffusion_policy.model.diffusion.ema_model.EMAModel
+ update_after_step: 0
+ inv_gamma: 1.0
+ power: 0.75
+ min_value: 0.0
+ max_value: 0.9999
+
+dataloader:
+ batch_size: 128
+ num_workers: 0
+ shuffle: True
+ pin_memory: True
+ persistent_workers: False
+
+val_dataloader:
+ batch_size: 128
+ num_workers: 0
+ shuffle: False
+ pin_memory: True
+ persistent_workers: False
+
+optimizer:
+ _target_: torch.optim.AdamW
+ lr: 1.0e-4
+ betas: [0.95, 0.999]
+ eps: 1.0e-8
+ weight_decay: 1.0e-6
+
+training:
+ device: "cuda:0"
+ seed: 42
+ debug: False
+ resume: True
+ # optimization
+ lr_scheduler: cosine
+ lr_warmup_steps: 500
+ num_epochs: 600
+ gradient_accumulate_every: 1
+ # EMA destroys performance when used with BatchNorm
+ # replace BatchNorm with GroupNorm.
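+ # (safe here: the obs_encoder above sets use_group_norm: True)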
+ use_ema: True + freeze_encoder: False + # training loop control + # in epochs + rollout_every: 50 + checkpoint_every: 300 + val_every: 1 + sample_every: 5 + # steps per epoch + max_train_steps: null + max_val_steps: null + # misc + tqdm_interval_sec: 1.0 + +logging: + project: diffusion_policy_debug + resume: True + mode: online + name: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + tags: ["${name}", "${task_name}", "${exp_name}"] + id: null + group: null + +checkpoint: + topk: + monitor_key: test_mean_score + mode: max + k: 5 + format_str: 'epoch={epoch:04d}-test_mean_score={test_mean_score:.3f}.ckpt' + save_last_ckpt: True + save_last_snapshot: False + +multi_run: + run_dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + wandb_name_base: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + +hydra: + job: + override_dirname: ${name} + run: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + sweep: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + subdir: ${hydra.job.num} + +setting: null +expert_data_num: null +head_camera_type: null + diff --git a/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_14.yaml b/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_14.yaml new file mode 100644 index 0000000..c9d5a48 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_14.yaml @@ -0,0 +1,50 @@ +name: task_config + +image_shape: &image_shape [3, -1, -1] +shape_meta: &shape_meta + # acceptable types: rgb, low_dim + obs: + head_cam: + shape: *image_shape + type: rgb + # front_cam: + # shape: *image_shape + # type: rgb + # left_cam: + # shape: *image_shape + # type: rgb + # right_cam: + # shape: *image_shape + # type: rgb + agent_pos: + shape: [14] + type: low_dim + action: + shape: [14] + +env_runner: + _target_: diffusion_policy.env_runner.pusht_image_runner.PushTImageRunner + n_train: 6 + n_train_vis: 2 + train_start_seed: 0 + n_test: 50 + n_test_vis: 4 + legacy_test: True + test_start_seed: 100000 + max_steps: 300 + n_obs_steps: ${n_obs_steps} + n_action_steps: ${n_action_steps} + fps: 10 + past_action: ${past_action_visible} + n_envs: null + +dataset: + _target_: diffusion_policy.dataset.robot_image_dataset.RobotImageDataset + zarr_path: data/useless.zarr + batch_size: ${dataloader.batch_size} + horizon: ${horizon} + pad_before: ${eval:'${n_obs_steps}-1'} + pad_after: ${eval:'${n_action_steps}-1'} + seed: 42 + val_ratio: 0.02 + max_train_episodes: null diff --git a/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_16.yaml b/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_16.yaml new file mode 100644 index 0000000..6dd619a --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_16.yaml @@ -0,0 +1,50 @@ +name: task_config + +image_shape: &image_shape [3, -1, -1] +shape_meta: &shape_meta + # acceptable types: rgb, low_dim + obs: + head_cam: + shape: *image_shape + type: rgb + # front_cam: + # shape: *image_shape + # type: rgb + # left_cam: + # shape: *image_shape + # type: rgb + # right_cam: + # shape: *image_shape + # type: rgb + agent_pos: + shape: [16] + type: low_dim + action: + shape: [16] + +env_runner: + _target_: diffusion_policy.env_runner.pusht_image_runner.PushTImageRunner + n_train: 6 + n_train_vis: 2 + train_start_seed: 0 + n_test: 50 + n_test_vis: 4 + legacy_test: True + test_start_seed: 100000 + max_steps: 300 + n_obs_steps: ${n_obs_steps} + n_action_steps: ${n_action_steps} + fps: 10 + past_action: ${past_action_visible} + n_envs: null + 
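+# note: pad_before/pad_after below extend each episode by n_obs_steps-1 and
+# n_action_steps-1 frames respectively, so sequence windows that touch an
+# episode boundary can still be sampled.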
+dataset: + _target_: diffusion_policy.dataset.robot_image_dataset.RobotImageDataset + zarr_path: data/useless.zarr + batch_size: ${dataloader.batch_size} + horizon: ${horizon} + pad_before: ${eval:'${n_obs_steps}-1'} + pad_after: ${eval:'${n_action_steps}-1'} + seed: 42 + val_ratio: 0.02 + max_train_episodes: null diff --git a/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_8.yaml b/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_8.yaml new file mode 100644 index 0000000..9129a0a --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/config/task/default_task_8.yaml @@ -0,0 +1,50 @@ +name: task_config + +image_shape: &image_shape [3, -1, -1] +shape_meta: &shape_meta + # acceptable types: rgb, low_dim + obs: + head_cam: + shape: *image_shape + type: rgb + # front_cam: + # shape: *image_shape + # type: rgb + # left_cam: + # shape: *image_shape + # type: rgb + # right_cam: + # shape: *image_shape + # type: rgb + agent_pos: + shape: [8] # Franka: 7 joints + 1 normalized gripper + type: low_dim + action: + shape: [8] # Franka: 7 joints + 1 normalized gripper + +env_runner: + _target_: diffusion_policy.env_runner.pusht_image_runner.PushTImageRunner + n_train: 6 + n_train_vis: 2 + train_start_seed: 0 + n_test: 50 + n_test_vis: 4 + legacy_test: True + test_start_seed: 100000 + max_steps: 300 + n_obs_steps: ${n_obs_steps} + n_action_steps: ${n_action_steps} + fps: 10 + past_action: ${past_action_visible} + n_envs: null + +dataset: + _target_: diffusion_policy.dataset.robot_image_dataset.RobotImageDataset + zarr_path: data/useless.zarr + batch_size: ${dataloader.batch_size} + horizon: ${horizon} + pad_before: ${eval:'${n_obs_steps}-1'} + pad_after: ${eval:'${n_action_steps}-1'} + seed: 42 + val_ratio: 0.02 + max_train_episodes: null diff --git a/r2sVLA/algos/DP/diffusion_policy/dataset/base_dataset.py b/r2sVLA/algos/DP/diffusion_policy/dataset/base_dataset.py new file mode 100644 index 0000000..4ec3386 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/dataset/base_dataset.py @@ -0,0 +1,54 @@ +from typing import Dict + +import torch +import torch.nn +from diffusion_policy.model.common.normalizer import LinearNormalizer + + +class BaseLowdimDataset(torch.utils.data.Dataset): + + def get_validation_dataset(self) -> "BaseLowdimDataset": + # return an empty dataset by default + return BaseLowdimDataset() + + def get_normalizer(self, **kwargs) -> LinearNormalizer: + raise NotImplementedError() + + def get_all_actions(self) -> torch.Tensor: + raise NotImplementedError() + + def __len__(self) -> int: + return 0 + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """ + output: + obs: T, Do + action: T, Da + """ + raise NotImplementedError() + + +class BaseImageDataset(torch.utils.data.Dataset): + + def get_validation_dataset(self) -> "BaseLowdimDataset": + # return an empty dataset by default + return BaseImageDataset() + + def get_normalizer(self, **kwargs) -> LinearNormalizer: + raise NotImplementedError() + + def get_all_actions(self) -> torch.Tensor: + raise NotImplementedError() + + def __len__(self) -> int: + return 0 + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """ + output: + obs: + key: T, * + action: T, Da + """ + raise NotImplementedError() diff --git a/r2sVLA/algos/DP/diffusion_policy/dataset/robot_image_dataset.py b/r2sVLA/algos/DP/diffusion_policy/dataset/robot_image_dataset.py new file mode 100644 index 0000000..d935680 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/dataset/robot_image_dataset.py @@ -0,0 +1,185 @@ +from typing 
import Dict +import numba +import torch +import numpy as np +import copy +from diffusion_policy.common.pytorch_util import dict_apply +from diffusion_policy.common.replay_buffer import ReplayBuffer +from diffusion_policy.common.sampler import ( + SequenceSampler, + get_val_mask, + downsample_mask, +) +from diffusion_policy.model.common.normalizer import LinearNormalizer +from diffusion_policy.dataset.base_dataset import BaseImageDataset +from diffusion_policy.common.normalize_util import get_image_range_normalizer +import pdb + + +class RobotImageDataset(BaseImageDataset): + + def __init__( + self, + zarr_path, + horizon=1, + pad_before=0, + pad_after=0, + seed=42, + val_ratio=0.0, + batch_size=128, + max_train_episodes=None, + ): + + super().__init__() + self.replay_buffer = ReplayBuffer.copy_from_path( + zarr_path, + # keys=['head_camera', 'front_camera', 'left_camera', 'right_camera', 'state', 'action'], + keys=["head_camera", "state", "action"], + ) + + val_mask = get_val_mask(n_episodes=self.replay_buffer.n_episodes, val_ratio=val_ratio, seed=seed) + train_mask = ~val_mask + train_mask = downsample_mask(mask=train_mask, max_n=max_train_episodes, seed=seed) + + self.sampler = SequenceSampler( + replay_buffer=self.replay_buffer, + sequence_length=horizon, + pad_before=pad_before, + pad_after=pad_after, + episode_mask=train_mask, + ) + self.train_mask = train_mask + self.horizon = horizon + self.pad_before = pad_before + self.pad_after = pad_after + + self.batch_size = batch_size + sequence_length = self.sampler.sequence_length + self.buffers = { + k: np.zeros((batch_size, sequence_length, *v.shape[1:]), dtype=v.dtype) + for k, v in self.sampler.replay_buffer.items() + } + self.buffers_torch = {k: torch.from_numpy(v) for k, v in self.buffers.items()} + for v in self.buffers_torch.values(): + v.pin_memory() + + def get_validation_dataset(self): + val_set = copy.copy(self) + val_set.sampler = SequenceSampler( + replay_buffer=self.replay_buffer, + sequence_length=self.horizon, + pad_before=self.pad_before, + pad_after=self.pad_after, + episode_mask=~self.train_mask, + ) + val_set.train_mask = ~self.train_mask + return val_set + + def get_normalizer(self, mode="limits", **kwargs): + data = { + "action": self.replay_buffer["action"], + "agent_pos": self.replay_buffer["state"], + } + normalizer = LinearNormalizer() + normalizer.fit(data=data, last_n_dims=1, mode=mode, **kwargs) + normalizer["head_cam"] = get_image_range_normalizer() + normalizer["front_cam"] = get_image_range_normalizer() + normalizer["left_cam"] = get_image_range_normalizer() + normalizer["right_cam"] = get_image_range_normalizer() + return normalizer + + def __len__(self) -> int: + return len(self.sampler) + + def _sample_to_data(self, sample): + agent_pos = sample["state"].astype(np.float32) # (agent_posx2, block_posex3) + head_cam = np.moveaxis(sample["head_camera"], -1, 1) / 255 + # front_cam = np.moveaxis(sample['front_camera'],-1,1)/255 + # left_cam = np.moveaxis(sample['left_camera'],-1,1)/255 + # right_cam = np.moveaxis(sample['right_camera'],-1,1)/255 + + data = { + "obs": { + "head_cam": head_cam, # T, 3, H, W + # 'front_cam': front_cam, # T, 3, H, W + # 'left_cam': left_cam, # T, 3, H, W + # 'right_cam': right_cam, # T, 3, H, W + "agent_pos": agent_pos, # T, D + }, + "action": sample["action"].astype(np.float32), # T, D + } + return data + + def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + if isinstance(idx, slice): + raise NotImplementedError # Specialized + elif isinstance(idx, int): + sample = 
self.sampler.sample_sequence(idx) + sample = dict_apply(sample, torch.from_numpy) + return sample + elif isinstance(idx, np.ndarray): + assert len(idx) == self.batch_size + for k, v in self.sampler.replay_buffer.items(): + batch_sample_sequence( + self.buffers[k], + v, + self.sampler.indices, + idx, + self.sampler.sequence_length, + ) + return self.buffers_torch + else: + raise ValueError(idx) + + def postprocess(self, samples, device): + agent_pos = samples["state"].to(device, non_blocking=True) + head_cam = samples["head_camera"].to(device, non_blocking=True) / 255.0 + # front_cam = samples['front_camera'].to(device, non_blocking=True) / 255.0 + # left_cam = samples['left_camera'].to(device, non_blocking=True) / 255.0 + # right_cam = samples['right_camera'].to(device, non_blocking=True) / 255.0 + action = samples["action"].to(device, non_blocking=True) + return { + "obs": { + "head_cam": head_cam, # B, T, 3, H, W + # 'front_cam': front_cam, # B, T, 3, H, W + # 'left_cam': left_cam, # B, T, 3, H, W + # 'right_cam': right_cam, # B, T, 3, H, W + "agent_pos": agent_pos, # B, T, D + }, + "action": action, # B, T, D + } + + +def _batch_sample_sequence( + data: np.ndarray, + input_arr: np.ndarray, + indices: np.ndarray, + idx: np.ndarray, + sequence_length: int, +): + for i in numba.prange(len(idx)): + buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx = indices[idx[i]] + data[i, sample_start_idx:sample_end_idx] = input_arr[buffer_start_idx:buffer_end_idx] + if sample_start_idx > 0: + data[i, :sample_start_idx] = data[i, sample_start_idx] + if sample_end_idx < sequence_length: + data[i, sample_end_idx:] = data[i, sample_end_idx - 1] + + +_batch_sample_sequence_sequential = numba.jit(_batch_sample_sequence, nopython=True, parallel=False) +_batch_sample_sequence_parallel = numba.jit(_batch_sample_sequence, nopython=True, parallel=True) + + +def batch_sample_sequence( + data: np.ndarray, + input_arr: np.ndarray, + indices: np.ndarray, + idx: np.ndarray, + sequence_length: int, +): + batch_size = len(idx) + assert data.shape == (batch_size, sequence_length, *input_arr.shape[1:]) + if batch_size >= 16 and data.nbytes // batch_size >= 2**16: + _batch_sample_sequence_parallel(data, input_arr, indices, idx, sequence_length) + else: + _batch_sample_sequence_sequential(data, input_arr, indices, idx, sequence_length) diff --git a/r2sVLA/algos/DP/diffusion_policy/env_runner/dp_runner.py b/r2sVLA/algos/DP/diffusion_policy/env_runner/dp_runner.py new file mode 100644 index 0000000..96ecd02 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/env_runner/dp_runner.py @@ -0,0 +1,102 @@ +import torch +import os +import numpy as np +import hydra +from pathlib import Path +from collections import deque + +import yaml +from datetime import datetime +import importlib +import dill +from argparse import ArgumentParser +from diffusion_policy.common.pytorch_util import dict_apply +from diffusion_policy.policy.base_image_policy import BaseImagePolicy + + +class DPRunner: + + def __init__( + self, + eval_episodes=20, + max_steps=300, + n_obs_steps=3, + n_action_steps=8, + fps=10, + crf=22, + tqdm_interval_sec=5.0, + task_name=None, + ): + self.task_name = task_name + self.eval_episodes = eval_episodes + self.fps = fps + self.crf = crf + self.n_obs_steps = n_obs_steps + self.n_action_steps = n_action_steps + self.max_steps = max_steps + self.tqdm_interval_sec = tqdm_interval_sec + + self.obs = deque(maxlen=n_obs_steps + 1) + self.env = None + + def stack_last_n_obs(self, all_obs, n_steps): + assert 
len(all_obs) > 0
+ all_obs = list(all_obs)
+ if isinstance(all_obs[0], np.ndarray):
+ result = np.zeros((n_steps, ) + all_obs[-1].shape, dtype=all_obs[-1].dtype)
+ start_idx = -min(n_steps, len(all_obs))
+ result[start_idx:] = np.array(all_obs[start_idx:])
+ if n_steps > len(all_obs):
+ # pad
+ result[:start_idx] = result[start_idx]
+ elif isinstance(all_obs[0], torch.Tensor):
+ result = torch.zeros((n_steps, ) + all_obs[-1].shape, dtype=all_obs[-1].dtype)
+ start_idx = -min(n_steps, len(all_obs))
+ result[start_idx:] = torch.stack(all_obs[start_idx:])
+ if n_steps > len(all_obs):
+ # pad
+ result[:start_idx] = result[start_idx]
+ else:
+ raise RuntimeError(f"Unsupported obs type {type(all_obs[0])}")
+ return result
+
+ def reset_obs(self):
+ self.obs.clear()
+
+ def update_obs(self, current_obs):
+ self.obs.append(current_obs)
+
+ def get_n_steps_obs(self):
+ assert len(self.obs) > 0, "no observation is recorded, please update obs first"
+
+ result = dict()
+ for key in self.obs[0].keys():
+ result[key] = self.stack_last_n_obs([obs[key] for obs in self.obs], self.n_obs_steps)
+
+ return result
+
+ def get_action(self, policy: BaseImagePolicy, observation=None):
+ device, dtype = policy.device, policy.dtype
+ if observation is not None:
+ self.obs.append(observation) # update
+ obs = self.get_n_steps_obs()
+
+ # create obs dict
+ np_obs_dict = dict(obs)
+ # device transfer
+ obs_dict = dict_apply(np_obs_dict, lambda x: torch.from_numpy(x).to(device=device))
+ # run policy
+ with torch.no_grad():
+ obs_dict_input = {} # flush unused keys
+ obs_dict_input["head_cam"] = obs_dict["head_cam"].unsqueeze(0)
+ # side cameras are disabled in the task configs above; enable these
+ # lines only if the corresponding keys are recorded into the obs deque
+ # obs_dict_input['front_cam'] = obs_dict['front_cam'].unsqueeze(0)
+ # obs_dict_input["left_cam"] = obs_dict["left_cam"].unsqueeze(0)
+ # obs_dict_input["right_cam"] = obs_dict["right_cam"].unsqueeze(0)
+ obs_dict_input["agent_pos"] = obs_dict["agent_pos"].unsqueeze(0)
+
+ action_dict = policy.predict_action(obs_dict_input)
+
+ # device_transfer
+ np_action_dict = dict_apply(action_dict, lambda x: x.detach().to("cpu").numpy())
+ action = np_action_dict["action"].squeeze(0)[:self.n_action_steps]
+ return action
diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/action_ae/__init__.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/action_ae/__init__.py
new file mode 100644
index 0000000..de289fd
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/action_ae/__init__.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+import abc
+
+from typing import Optional, Union
+
+import diffusion_policy.model.bet.utils as utils
+
+
+class AbstractActionAE(utils.SaveModule, abc.ABC):
+
+ @abc.abstractmethod
+ def fit_model(
+ self,
+ input_dataloader: DataLoader,
+ eval_dataloader: DataLoader,
+ obs_encoding_net: Optional[nn.Module] = None,
+ ) -> None:
+ pass
+
+ @abc.abstractmethod
+ def encode_into_latent(
+ self,
+ input_action: torch.Tensor,
+ input_rep: Optional[torch.Tensor],
+ ) -> torch.Tensor:
+ """
+ Given the input action, discretize it.
+
+ Inputs:
+ input_action (shape: ... x action_dim): The input action to discretize. This can be in a batch,
+ and it is generally assumed that the last dimension is the action dimension.
+
+ Outputs:
+ discretized_action (shape: ... x num_tokens): The discretized action.
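+
+ For example, an input of shape (batch, T, action_dim) maps to an output of shape (batch, T, num_tokens).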
+ """ + raise NotImplementedError + + @abc.abstractmethod + def decode_actions( + self, + latent_action_batch: Optional[torch.Tensor], + input_rep_batch: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Given a discretized action, convert it to a continuous action. + + Inputs: + latent_action_batch (shape: ... x num_tokens): The discretized action + generated by the discretizer. + + Outputs: + continuous_action (shape: ... x action_dim): The continuous action. + """ + raise NotImplementedError + + @property + @abc.abstractmethod + def num_latents(self) -> Union[int, float]: + """ + Number of possible latents for this generator, useful for state priors that use softmax. + """ + return float("inf") diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/action_ae/discretizers/k_means.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/action_ae/discretizers/k_means.py new file mode 100644 index 0000000..b4e1783 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/action_ae/discretizers/k_means.py @@ -0,0 +1,136 @@ +import torch +import numpy as np + +import tqdm + +from typing import Optional, Tuple, Union +from diffusion_policy.model.common.dict_of_tensor_mixin import DictOfTensorMixin + + +class KMeansDiscretizer(DictOfTensorMixin): + """ + Simplified and modified version of KMeans algorithm from sklearn. + """ + + def __init__( + self, + action_dim: int, + num_bins: int = 100, + predict_offsets: bool = False, + ): + super().__init__() + self.n_bins = num_bins + self.action_dim = action_dim + self.predict_offsets = predict_offsets + + def fit_discretizer(self, input_actions: torch.Tensor) -> None: + assert (self.action_dim == input_actions.shape[-1] + ), f"Input action dimension {self.action_dim} does not match fitted model {input_actions.shape[-1]}" + + flattened_actions = input_actions.view(-1, self.action_dim) + cluster_centers = KMeansDiscretizer._kmeans(flattened_actions, ncluster=self.n_bins) + self.params_dict["bin_centers"] = cluster_centers + + @property + def suggested_actions(self) -> torch.Tensor: + return self.params_dict["bin_centers"] + + @classmethod + def _kmeans(cls, x: torch.Tensor, ncluster: int = 512, niter: int = 50): + """ + Simple k-means clustering algorithm adapted from Karpathy's minGPT library + https://github.com/karpathy/minGPT/blob/master/play_image.ipynb + """ + N, D = x.size() + c = x[torch.randperm(N)[:ncluster]] # init clusters at random + + pbar = tqdm.trange(niter) + pbar.set_description("K-means clustering") + for i in pbar: + # assign all pixels to the closest codebook element + a = ((x[:, None, :] - c[None, :, :])**2).sum(-1).argmin(1) + # move each codebook element to be the mean of the pixels that assigned to it + c = torch.stack([x[a == k].mean(0) for k in range(ncluster)]) + # re-assign any poorly positioned codebook elements + nanix = torch.any(torch.isnan(c), dim=1) + ndead = nanix.sum().item() + if ndead: + tqdm.tqdm.write("done step %d/%d, re-initialized %d dead clusters" % (i + 1, niter, ndead)) + c[nanix] = x[torch.randperm(N)[:ndead]] # re-init dead clusters + return c + + def encode_into_latent(self, input_action: torch.Tensor, input_rep: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Given the input action, discretize it using the k-Means clustering algorithm. + + Inputs: + input_action (shape: ... x action_dim): The input action to discretize. This can be in a batch, + and is generally assumed that the last dimnesion is the action dimension. + + Outputs: + discretized_action (shape: ... 
x num_tokens): The discretized action.
+ If self.predict_offsets is True, then the offsets are also returned.
+ """
+ assert (input_action.shape[-1] == self.action_dim), "Input action dimension does not match fitted model"
+
+ # flatten the input action
+ flattened_actions = input_action.view(-1, self.action_dim)
+
+ # get the closest cluster center
+ closest_cluster_center = torch.argmin(
+ torch.sum(
+ (flattened_actions[:, None, :] - self.params_dict["bin_centers"][None, :, :])**2,
+ dim=2,
+ ),
+ dim=1,
+ )
+ # Reshape to the original shape
+ discretized_action = closest_cluster_center.view(input_action.shape[:-1] + (1, ))
+
+ if self.predict_offsets:
+ # decode from latent and get the difference
+ reconstructed_action = self.decode_actions(discretized_action)
+ offsets = input_action - reconstructed_action
+ return (discretized_action, offsets)
+ else:
+ # return the one-hot vector
+ return discretized_action
+
+ def decode_actions(
+ self,
+ latent_action_batch: torch.Tensor,
+ input_rep_batch: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ """
+ Given the latent action, reconstruct the original action.
+
+ Inputs:
+ latent_action (shape: ... x 1): The latent action to reconstruct. This can be in a batch,
+ and it is generally assumed that the last dimension is the action dimension. If the latent_action_batch
+ is a tuple, then it is assumed to be (discretized_action, offsets).
+
+ Outputs:
+ reconstructed_action (shape: ... x action_dim): The reconstructed action.
+ """
+ offsets = None
+ if type(latent_action_batch) == tuple:
+ latent_action_batch, offsets = latent_action_batch
+ # get the closest cluster center
+ closest_cluster_center = self.params_dict["bin_centers"][latent_action_batch]
+ # Reshape to the original shape
+ reconstructed_action = closest_cluster_center.view(latent_action_batch.shape[:-1] + (self.action_dim, ))
+ if offsets is not None:
+ reconstructed_action += offsets
+ return reconstructed_action
+
+ @property
+ def discretized_space(self) -> int:
+ return self.n_bins
+
+ @property
+ def latent_dim(self) -> int:
+ return 1
+
+ @property
+ def num_latents(self) -> int:
+ return self.n_bins
diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/latent_generator.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/latent_generator.py
new file mode 100644
index 0000000..f0e38b2
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/latent_generator.py
@@ -0,0 +1,67 @@
+import abc
+import torch
+from typing import Tuple, Optional
+
+import diffusion_policy.model.bet.utils as utils
+
+
+class AbstractLatentGenerator(abc.ABC, utils.SaveModule):
+ """
+ Abstract class for a generative model that can generate latents given observation representations.
+
+ In the probabilistic sense, this model fits and samples from P(latent|observation) given some observation.
+ """
+
+ @abc.abstractmethod
+ def get_latent_and_loss(
+ self,
+ obs_rep: torch.Tensor,
+ target_latents: torch.Tensor,
+ seq_masks: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Given a set of observation representations and generated latents, get the encoded latent and the loss.
+
+ Inputs:
+ input_action: Batch of the actions taken in the multimodal demonstrations.
+ target_latents: Batch of the latents that the generator should learn to generate the actions from.
+ seq_masks: Batch of masks that indicate which timesteps are valid.
+
+ Outputs:
+ latent: The sampled latent from the observation.
+ loss: The loss of the latent generator. + """ + pass + + @abc.abstractmethod + def generate_latents(self, seq_obses: torch.Tensor, seq_masks: torch.Tensor) -> torch.Tensor: + """ + Given a batch of sequences of observations, generate a batch of sequences of latents. + + Inputs: + seq_obses: Batch of sequences of observations, of shape seq x batch x dim, following the transformer convention. + seq_masks: Batch of sequences of masks, of shape seq x batch, following the transformer convention. + + Outputs: + seq_latents: Batch of sequences of latents of shape seq x batch x latent_dim. + """ + pass + + def get_optimizer(self, weight_decay: float, learning_rate: float, betas: Tuple[float, + float]) -> torch.optim.Optimizer: + """ + Default optimizer class. Override this if you want to use a different optimizer. + """ + return torch.optim.Adam(self.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=betas) + + +class LatentGeneratorDataParallel(torch.nn.DataParallel): + + def get_latent_and_loss(self, *args, **kwargs): + return self.module.get_latent_and_loss(*args, **kwargs) # type: ignore + + def generate_latents(self, *args, **kwargs): + return self.module.generate_latents(*args, **kwargs) # type: ignore + + def get_optimizer(self, *args, **kwargs): + return self.module.get_optimizer(*args, **kwargs) # type: ignore diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/mingpt.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/mingpt.py new file mode 100644 index 0000000..c67b0ac --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/mingpt.py @@ -0,0 +1,177 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import einops +import diffusion_policy.model.bet.latent_generators.latent_generator as latent_generator + +import diffusion_policy.model.bet.libraries.mingpt.model as mingpt_model +import diffusion_policy.model.bet.libraries.mingpt.trainer as mingpt_trainer +from diffusion_policy.model.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy + +from typing import Optional, Tuple + + +class MinGPT(latent_generator.AbstractLatentGenerator): + + def __init__( + self, + input_dim: int, + n_layer: int = 12, + n_head: int = 12, + n_embd: int = 768, + embd_pdrop: float = 0.1, + resid_pdrop: float = 0.1, + attn_pdrop: float = 0.1, + block_size: int = 128, + vocab_size: int = 50257, + latent_dim: int = 768, # Ignore, used for compatibility with other models. 
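+ # note: with predict_offsets=True the output head widens to
+ # vocab_size * (1 + action_dim), i.e. one logit per bin plus per-bin
+ # offsets for every action dimension (see gpt_config below)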
+ action_dim: int = 0, + discrete_input: bool = False, + predict_offsets: bool = False, + offset_loss_scale: float = 1.0, + focal_loss_gamma: float = 0.0, + **kwargs): + super().__init__() + self.input_size = input_dim + self.n_layer = n_layer + self.n_head = n_head + self.n_embd = n_embd + self.embd_pdrop = embd_pdrop + self.resid_pdrop = resid_pdrop + self.attn_pdrop = attn_pdrop + self.block_size = block_size + self.vocab_size = vocab_size + self.action_dim = action_dim + self.predict_offsets = predict_offsets + self.offset_loss_scale = offset_loss_scale + self.focal_loss_gamma = focal_loss_gamma + for k, v in kwargs.items(): + setattr(self, k, v) + + gpt_config = mingpt_model.GPTConfig( + input_size=self.input_size, + vocab_size=(self.vocab_size * (1 + self.action_dim) if self.predict_offsets else self.vocab_size), + block_size=self.block_size, + n_layer=n_layer, + n_head=n_head, + n_embd=n_embd, + discrete_input=discrete_input, + embd_pdrop=embd_pdrop, + resid_pdrop=resid_pdrop, + attn_pdrop=attn_pdrop, + ) + + self.model = mingpt_model.GPT(gpt_config) + + def get_latent_and_loss( + self, + obs_rep: torch.Tensor, + target_latents: torch.Tensor, + seq_masks: Optional[torch.Tensor] = None, + return_loss_components: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Unlike torch.transformers, GPT takes in batch x seq_len x embd_dim + # obs_rep = einops.rearrange(obs_rep, "seq batch embed -> batch seq embed") + # target_latents = einops.rearrange( + # target_latents, "seq batch embed -> batch seq embed" + # ) + # While this has been trained autoregressively, + # there is no reason why it needs to be so. + # We can just use the observation as the input and the next latent as the target. + if self.predict_offsets: + target_latents, target_offsets = target_latents + is_soft_target = (target_latents.shape[-1] == self.vocab_size) and (self.vocab_size != 1) + if is_soft_target: + target_latents = target_latents.view(-1, target_latents.size(-1)) + criterion = soft_cross_entropy + else: + target_latents = target_latents.view(-1) + if self.vocab_size == 1: + # unify k-means (target_class == 0) and GMM (target_prob == 1) + target_latents = torch.zeros_like(target_latents) + criterion = FocalLoss(gamma=self.focal_loss_gamma) + if self.predict_offsets: + output, _ = self.model(obs_rep) + logits = output[:, :, :self.vocab_size] + offsets = output[:, :, self.vocab_size:] + batch = logits.shape[0] + seq = logits.shape[1] + offsets = einops.rearrange( + offsets, + "N T (V A) -> (N T) V A", # N = batch, T = seq + V=self.vocab_size, + A=self.action_dim, + ) + # calculate (optionally soft) cross entropy and offset losses + class_loss = criterion(logits.view(-1, logits.size(-1)), target_latents) + # offset loss is only calculated on the target class + # if soft targets, argmax is considered the target class + selected_offsets = offsets[ + torch.arange(offsets.size(0)), + (target_latents.argmax(dim=-1).view(-1) if is_soft_target else target_latents.view(-1)), + ] + offset_loss = self.offset_loss_scale * F.mse_loss(selected_offsets, target_offsets.view( + -1, self.action_dim)) + loss = offset_loss + class_loss + logits = einops.rearrange(logits, "batch seq classes -> seq batch classes") + offsets = einops.rearrange( + offsets, + "(N T) V A -> T N V A", # ? N, T order? 
Anyway does not affect loss and training (might affect visualization) + N=batch, + T=seq, + ) + if return_loss_components: + return ( + (logits, offsets), + loss, + { + "offset": offset_loss, + "class": class_loss, + "total": loss + }, + ) + else: + return (logits, offsets), loss + else: + logits, _ = self.model(obs_rep) + loss = criterion(logits.view(-1, logits.size(-1)), target_latents) + logits = einops.rearrange( + logits, "batch seq classes -> seq batch classes" + ) # ? N, T order? Anyway does not affect loss and training (might affect visualization) + if return_loss_components: + return logits, loss, {"class": loss, "total": loss} + else: + return logits, loss + + def generate_latents(self, obs_rep: torch.Tensor) -> torch.Tensor: + batch, seq, embed = obs_rep.shape + + output, _ = self.model(obs_rep, None) + if self.predict_offsets: + logits = output[:, :, :self.vocab_size] + offsets = output[:, :, self.vocab_size:] + offsets = einops.rearrange( + offsets, + "N T (V A) -> (N T) V A", # N = batch, T = seq + V=self.vocab_size, + A=self.action_dim, + ) + else: + logits = output + probs = F.softmax(logits, dim=-1) + batch, seq, choices = probs.shape + # Sample from the multinomial distribution, one per row. + sampled_data = torch.multinomial(probs.view(-1, choices), num_samples=1) + sampled_data = einops.rearrange(sampled_data, "(batch seq) 1 -> batch seq 1", batch=batch, seq=seq) + if self.predict_offsets: + sampled_offsets = offsets[torch.arange(offsets.shape[0]), + sampled_data.flatten()].view(batch, seq, self.action_dim) + + return (sampled_data, sampled_offsets) + else: + return sampled_data + + def get_optimizer(self, weight_decay: float, learning_rate: float, betas: Tuple[float, + float]) -> torch.optim.Optimizer: + trainer_cfg = mingpt_trainer.TrainerConfig(weight_decay=weight_decay, learning_rate=learning_rate, betas=betas) + return self.model.configure_optimizers(trainer_cfg) diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/transformer.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/transformer.py new file mode 100644 index 0000000..d46f409 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/latent_generators/transformer.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import einops +import diffusion_policy.model.bet.latent_generators.latent_generator as latent_generator + +from diffusion_policy.model.diffusion.transformer_for_diffusion import ( + TransformerForDiffusion, ) +from diffusion_policy.model.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy + +from typing import Optional, Tuple + + +class Transformer(latent_generator.AbstractLatentGenerator): + + def __init__(self, input_dim: int, num_bins: int, action_dim: int, horizon: int, focal_loss_gamma: float, + offset_loss_scale: float, **kwargs): + super().__init__() + self.model = TransformerForDiffusion(input_dim=input_dim, + output_dim=num_bins * (1 + action_dim), + horizon=horizon, + **kwargs) + self.vocab_size = num_bins + self.focal_loss_gamma = focal_loss_gamma + self.offset_loss_scale = offset_loss_scale + self.action_dim = action_dim + + def get_optimizer(self, **kwargs) -> torch.optim.Optimizer: + return self.model.configure_optimizers(**kwargs) + + def get_latent_and_loss( + self, + obs_rep: torch.Tensor, + target_latents: torch.Tensor, + return_loss_components=True, + ) -> Tuple[torch.Tensor, torch.Tensor]: + target_latents, target_offsets = target_latents + target_latents = target_latents.view(-1) + criterion 
= FocalLoss(gamma=self.focal_loss_gamma)
+
+ t = torch.tensor(0, device=self.model.device)
+ output = self.model(obs_rep, t)
+ logits = output[:, :, :self.vocab_size]
+ offsets = output[:, :, self.vocab_size:]
+ batch = logits.shape[0]
+ seq = logits.shape[1]
+ offsets = einops.rearrange(
+ offsets,
+ "N T (V A) -> (N T) V A", # N = batch, T = seq
+ V=self.vocab_size,
+ A=self.action_dim,
+ )
+ # calculate (optionally soft) cross entropy and offset losses
+ class_loss = criterion(logits.view(-1, logits.size(-1)), target_latents)
+ # offset loss is only calculated on the target class
+ # if soft targets, argmax is considered the target class
+ selected_offsets = offsets[
+ torch.arange(offsets.size(0)),
+ target_latents.view(-1),
+ ]
+ offset_loss = self.offset_loss_scale * F.mse_loss(selected_offsets, target_offsets.view(-1, self.action_dim))
+ loss = offset_loss + class_loss
+ logits = einops.rearrange(logits, "batch seq classes -> seq batch classes")
+ offsets = einops.rearrange(
+ offsets,
+ "(N T) V A -> T N V A", # ? N, T order? Anyway does not affect loss and training (might affect visualization)
+ N=batch,
+ T=seq,
+ )
+ return (
+ (logits, offsets),
+ loss,
+ {
+ "offset": offset_loss,
+ "class": class_loss,
+ "total": loss
+ },
+ )
+
+ def generate_latents(self, obs_rep: torch.Tensor) -> torch.Tensor:
+ t = torch.tensor(0, device=self.model.device)
+ output = self.model(obs_rep, t)
+ logits = output[:, :, :self.vocab_size]
+ offsets = output[:, :, self.vocab_size:]
+ offsets = einops.rearrange(
+ offsets,
+ "N T (V A) -> (N T) V A", # N = batch, T = seq
+ V=self.vocab_size,
+ A=self.action_dim,
+ )
+
+ probs = F.softmax(logits, dim=-1)
+ batch, seq, choices = probs.shape
+ # Sample from the multinomial distribution, one per row.
+ sampled_data = torch.multinomial(probs.view(-1, choices), num_samples=1)
+ sampled_data = einops.rearrange(sampled_data, "(batch seq) 1 -> batch seq 1", batch=batch, seq=seq)
+ sampled_offsets = offsets[torch.arange(offsets.shape[0]),
+ sampled_data.flatten()].view(batch, seq, self.action_dim)
+ return (sampled_data, sampled_offsets)
diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/loss_fn.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/loss_fn.py
new file mode 100644
index 0000000..47817dd
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/loss_fn.py
@@ -0,0 +1,165 @@
+from typing import Optional, Sequence
+
+import torch
+from torch import Tensor
+from torch import nn
+from torch.nn import functional as F
+
+
+# Reference: https://github.com/pytorch/pytorch/issues/11959
+def soft_cross_entropy(
+ input: torch.Tensor,
+ target: torch.Tensor,
+) -> torch.Tensor:
+ """
+ Args:
+ input: (batch_size, num_classes): tensor of raw logits
+ target: (batch_size, num_classes): tensor of class probability; sum(target) == 1
+
+ Returns:
+ loss: scalar (KL divergence with reduction="batchmean")
+ """
+ log_probs = torch.log_softmax(input, dim=-1)
+ # target is a distribution
+ loss = F.kl_div(log_probs, target, reduction="batchmean")
+ return loss
+
+
+# Focal loss implementation
+# Source: https://github.com/AdeelH/pytorch-multi-class-focal-loss/blob/master/focal_loss.py
+# MIT License
+#
+# Copyright (c) 2020 Adeel Hassan
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of
the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +class FocalLoss(nn.Module): + """Focal Loss, as described in https://arxiv.org/abs/1708.02002. + It is essentially an enhancement to cross entropy loss and is + useful for classification tasks when there is a large class imbalance. + x is expected to contain raw, unnormalized scores for each class. + y is expected to contain class labels. + Shape: + - x: (batch_size, C) or (batch_size, C, d1, d2, ..., dK), K > 0. + - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. + """ + + def __init__( + self, + alpha: Optional[Tensor] = None, + gamma: float = 0.0, + reduction: str = "mean", + ignore_index: int = -100, + ): + """Constructor. + Args: + alpha (Tensor, optional): Weights for each class. Defaults to None. + gamma (float, optional): A constant, as described in the paper. + Defaults to 0. + reduction (str, optional): 'mean', 'sum' or 'none'. + Defaults to 'mean'. + ignore_index (int, optional): class label to ignore. + Defaults to -100. + """ + if reduction not in ("mean", "sum", "none"): + raise ValueError('Reduction must be one of: "mean", "sum", "none".') + + super().__init__() + self.alpha = alpha + self.gamma = gamma + self.ignore_index = ignore_index + self.reduction = reduction + + self.nll_loss = nn.NLLLoss(weight=alpha, reduction="none", ignore_index=ignore_index) + + def __repr__(self): + arg_keys = ["alpha", "gamma", "ignore_index", "reduction"] + arg_vals = [self.__dict__[k] for k in arg_keys] + arg_strs = [f"{k}={v}" for k, v in zip(arg_keys, arg_vals)] + arg_str = ", ".join(arg_strs) + return f"{type(self).__name__}({arg_str})" + + def forward(self, x: Tensor, y: Tensor) -> Tensor: + if x.ndim > 2: + # (N, C, d1, d2, ..., dK) --> (N * d1 * ... * dK, C) + c = x.shape[1] + x = x.permute(0, *range(2, x.ndim), 1).reshape(-1, c) + # (N, d1, d2, ..., dK) --> (N * d1 * ... * dK,) + y = y.view(-1) + + unignored_mask = y != self.ignore_index + y = y[unignored_mask] + if len(y) == 0: + return 0.0 + x = x[unignored_mask] + + # compute weighted cross entropy term: -alpha * log(pt) + # (alpha is already part of self.nll_loss) + log_p = F.log_softmax(x, dim=-1) + ce = self.nll_loss(log_p, y) + + # get true class column from each row + all_rows = torch.arange(len(x)) + log_pt = log_p[all_rows, y] + + # compute focal term: (1 - pt)^gamma + pt = log_pt.exp() + focal_term = (1 - pt)**self.gamma + + # the full loss: -alpha * ((1 - pt)^gamma) * log(pt) + loss = focal_term * ce + + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + + return loss + + +def focal_loss( + alpha: Optional[Sequence] = None, + gamma: float = 0.0, + reduction: str = "mean", + ignore_index: int = -100, + device="cpu", + dtype=torch.float32, +) -> FocalLoss: + """Factory function for FocalLoss. 
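+
+ Example (illustrative): focal_loss(alpha=[1.0, 2.0], gamma=2.0) builds a
+ two-class loss that weights class 1 twice as heavily as class 0.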
+ Args: + alpha (Sequence, optional): Weights for each class. Will be converted + to a Tensor if not None. Defaults to None. + gamma (float, optional): A constant, as described in the paper. + Defaults to 0. + reduction (str, optional): 'mean', 'sum' or 'none'. + Defaults to 'mean'. + ignore_index (int, optional): class label to ignore. + Defaults to -100. + device (str, optional): Device to move alpha to. Defaults to 'cpu'. + dtype (torch.dtype, optional): dtype to cast alpha to. + Defaults to torch.float32. + Returns: + A FocalLoss object + """ + if alpha is not None: + if not isinstance(alpha, Tensor): + alpha = torch.tensor(alpha) + alpha = alpha.to(device=device, dtype=dtype) + + fl = FocalLoss(alpha=alpha, gamma=gamma, reduction=reduction, ignore_index=ignore_index) + return fl diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/LICENSE b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/LICENSE new file mode 100644 index 0000000..87bfb15 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/LICENSE @@ -0,0 +1,8 @@ +The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
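A minimal sanity-check sketch (not part of the diff) for the two losses defined in loss_fn.py above, assuming the r2sVLA diffusion_policy package is importable; the shapes and gamma values are illustrative assumptions only:

import torch
from diffusion_policy.model.bet.libraries.loss_fn import FocalLoss, soft_cross_entropy

logits = torch.randn(4, 10)                     # raw scores: (batch, classes)
hard_y = torch.randint(0, 10, (4,))             # integer class labels
soft_y = torch.softmax(torch.randn(4, 10), -1)  # rows of class probabilities summing to 1

ce_like = FocalLoss(gamma=0.0)(logits, hard_y)  # gamma=0 reduces focal loss to plain cross entropy
focal = FocalLoss(gamma=2.0)(logits, hard_y)    # gamma>0 down-weights well-classified examples
kl = soft_cross_entropy(logits, soft_y)         # scalar batch-mean KL against a target distribution
print(ce_like.item(), focal.item(), kl.item())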
+ diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/__init__.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/model.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/model.py new file mode 100644 index 0000000..ab9e1c4 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/model.py @@ -0,0 +1,231 @@ +""" +GPT model: +- the initial stem consists of a combination of token encoding and a positional encoding +- the meat of it is a uniform sequence of Transformer blocks + - each Transformer is a sequential combination of a 1-hidden-layer MLP block and a self-attention block + - all blocks feed into a central residual pathway similar to resnets +- the final decoder is a linear projection into a vanilla Softmax classifier +""" + +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) + + +class GPTConfig: + """base GPT config, params common to all GPT versions""" + + embd_pdrop = 0.1 + resid_pdrop = 0.1 + attn_pdrop = 0.1 + discrete_input = False + input_size = 10 + n_embd = 768 + n_layer = 12 + + def __init__(self, vocab_size, block_size, **kwargs): + self.vocab_size = vocab_size + self.block_size = block_size + for k, v in kwargs.items(): + setattr(self, k, v) + + +class GPT1Config(GPTConfig): + """GPT-1 like network roughly 125M params""" + + n_layer = 12 + n_head = 12 + n_embd = 768 + + +class CausalSelfAttention(nn.Module): + """ + A vanilla multi-head masked self-attention layer with a projection at the end. + It is possible to use torch.nn.MultiheadAttention here but I am including an + explicit implementation here to show that there is nothing too scary here. 
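+
+ Expects input of shape (B, T, n_embd); the registered causal mask ensures that
+ position t only attends to positions <= t.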
+ """ + + def __init__(self, config): + super().__init__() + assert config.n_embd % config.n_head == 0 + # key, query, value projections for all heads + self.key = nn.Linear(config.n_embd, config.n_embd) + self.query = nn.Linear(config.n_embd, config.n_embd) + self.value = nn.Linear(config.n_embd, config.n_embd) + # regularization + self.attn_drop = nn.Dropout(config.attn_pdrop) + self.resid_drop = nn.Dropout(config.resid_pdrop) + # output projection + self.proj = nn.Linear(config.n_embd, config.n_embd) + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer( + "mask", + torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, + config.block_size), + ) + self.n_head = config.n_head + + def forward(self, x): + ( + B, + T, + C, + ) = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + k = (self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)) # (B, nh, T, hs) + q = (self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)) # (B, nh, T, hs) + v = (self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)) # (B, nh, T, hs) + + # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf")) + att = F.softmax(att, dim=-1) + att = self.attn_drop(att) + y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) + y = (y.transpose(1, 2).contiguous().view(B, T, C)) # re-assemble all head outputs side by side + + # output projection + y = self.resid_drop(self.proj(y)) + return y + + +class Block(nn.Module): + """an unassuming Transformer block""" + + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config.n_embd) + self.ln2 = nn.LayerNorm(config.n_embd) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config.n_embd, 4 * config.n_embd), + nn.GELU(), + nn.Linear(4 * config.n_embd, config.n_embd), + nn.Dropout(config.resid_pdrop), + ) + + def forward(self, x): + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + +class GPT(nn.Module): + """the full GPT language model, with a context size of block_size""" + + def __init__(self, config: GPTConfig): + super().__init__() + + # input embedding stem + if config.discrete_input: + self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd) + else: + self.tok_emb = nn.Linear(config.input_size, config.n_embd) + self.discrete_input = config.discrete_input + self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd)) + self.drop = nn.Dropout(config.embd_pdrop) + # transformer + self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)]) + # decoder head + self.ln_f = nn.LayerNorm(config.n_embd) + self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.block_size = config.block_size + self.apply(self._init_weights) + + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif 
isinstance(module, nn.LayerNorm): + torch.nn.init.zeros_(module.bias) + torch.nn.init.ones_(module.weight) + elif isinstance(module, GPT): + torch.nn.init.normal_(module.pos_emb, mean=0.0, std=0.02) + + def configure_optimizers(self, train_config): + """ + This long function is unfortunately doing something very simple and is being very defensive: + We are separating out all parameters of the model into two buckets: those that will experience + weight decay for regularization and those that won't (biases, and layernorm/embedding weights). + We are then returning the PyTorch optimizer object. + """ + + # separate out all parameters to those that will and won't experience regularizing weight decay + decay = set() + no_decay = set() + whitelist_weight_modules = (torch.nn.Linear, ) + blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding) + for mn, m in self.named_modules(): + for pn, p in m.named_parameters(): + fpn = "%s.%s" % (mn, pn) if mn else pn # full param name + + if pn.endswith("bias"): + # all biases will not be decayed + no_decay.add(fpn) + elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules): + # weights of whitelist modules will be weight decayed + decay.add(fpn) + elif pn.endswith("weight") and isinstance(m, blacklist_weight_modules): + # weights of blacklist modules will NOT be weight decayed + no_decay.add(fpn) + + # special case the position embedding parameter in the root GPT module as not decayed + no_decay.add("pos_emb") + + # validate that we considered every parameter + param_dict = {pn: p for pn, p in self.named_parameters()} + inter_params = decay & no_decay + union_params = decay | no_decay + assert (len(inter_params) == 0), "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) + assert (len(param_dict.keys() - + union_params) == 0), "parameters %s were not separated into either decay/no_decay set!" % ( + str(param_dict.keys() - union_params), ) + + # create the pytorch optimizer object + optim_groups = [ + { + "params": [param_dict[pn] for pn in sorted(list(decay))], + "weight_decay": train_config.weight_decay, + }, + { + "params": [param_dict[pn] for pn in sorted(list(no_decay))], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas) + return optimizer + + def forward(self, idx, targets=None): + if self.discrete_input: + b, t = idx.size() + else: + b, t, dim = idx.size() + assert t <= self.block_size, "Cannot forward, model block size is exhausted." + + # forward the GPT model + token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector + position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector + x = self.drop(token_embeddings + position_embeddings) + x = self.blocks(x) + x = self.ln_f(x) + logits = self.head(x) + + # if we are given some desired targets also calculate the loss + loss = None + if targets is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) + + return logits, loss diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/trainer.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/trainer.py new file mode 100644 index 0000000..10fd3c4 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/trainer.py @@ -0,0 +1,145 @@ +""" +Simple training loop; Boilerplate that could apply to any arbitrary neural network, +so nothing in this file really has anything to do with GPT specifically. 
+""" + +import math +import logging + +from tqdm import tqdm +import numpy as np + +import torch +import torch.optim as optim +from torch.optim.lr_scheduler import LambdaLR +from torch.utils.data.dataloader import DataLoader + +logger = logging.getLogger(__name__) + + +class TrainerConfig: + # optimization parameters + max_epochs = 10 + batch_size = 64 + learning_rate = 3e-4 + betas = (0.9, 0.95) + grad_norm_clip = 1.0 + weight_decay = 0.1 # only applied on matmul weights + # learning rate decay params: linear warmup followed by cosine decay to 10% of original + lr_decay = False + warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere + final_tokens = 260e9 # (at what point we reach 10% of original LR) + # checkpoint settings + ckpt_path = None + num_workers = 0 # for DataLoader + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + +class Trainer: + + def __init__(self, model, train_dataset, test_dataset, config): + self.model = model + self.train_dataset = train_dataset + self.test_dataset = test_dataset + self.config = config + + # take over whatever gpus are on the system + self.device = "cpu" + if torch.cuda.is_available(): + self.device = torch.cuda.current_device() + self.model = torch.nn.DataParallel(self.model).to(self.device) + + def save_checkpoint(self): + # DataParallel wrappers keep raw model object in .module attribute + raw_model = self.model.module if hasattr(self.model, "module") else self.model + logger.info("saving %s", self.config.ckpt_path) + torch.save(raw_model.state_dict(), self.config.ckpt_path) + + def train(self): + model, config = self.model, self.config + raw_model = model.module if hasattr(self.model, "module") else model + optimizer = raw_model.configure_optimizers(config) + + def run_epoch(loader, is_train): + model.train(is_train) + + losses = [] + pbar = (tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)) + for it, (x, y) in pbar: + + # place data on the correct device + x = x.to(self.device) + y = y.to(self.device) + + # forward the model + with torch.set_grad_enabled(is_train): + logits, loss = model(x, y) + loss = (loss.mean()) # collapse all losses if they are scattered on multiple gpus + losses.append(loss.item()) + + if is_train: + + # backprop and update the parameters + model.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) + optimizer.step() + + # decay the learning rate based on our progress + if config.lr_decay: + self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100) + if self.tokens < config.warmup_tokens: + # linear warmup + lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens)) + else: + # cosine learning rate decay + progress = float(self.tokens - config.warmup_tokens) / float( + max(1, config.final_tokens - config.warmup_tokens)) + lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) + lr = config.learning_rate * lr_mult + for param_group in optimizer.param_groups: + param_group["lr"] = lr + else: + lr = config.learning_rate + + # report progress + pbar.set_description( # type: ignore + f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. 
lr {lr:e}") + + if not is_train: + test_loss = float(np.mean(losses)) + logger.info("test loss: %f", test_loss) + return test_loss + + best_loss = float("inf") + self.tokens = 0 # counter used for learning rate decay + + train_loader = DataLoader( + self.train_dataset, + shuffle=True, + pin_memory=True, + batch_size=config.batch_size, + num_workers=config.num_workers, + ) + if self.test_dataset is not None: + test_loader = DataLoader( + self.test_dataset, + shuffle=True, + pin_memory=True, + batch_size=config.batch_size, + num_workers=config.num_workers, + ) + + for epoch in range(config.max_epochs): + run_epoch(train_loader, is_train=True) + if self.test_dataset is not None: + test_loss = run_epoch(test_loader, is_train=False) + + # supports early stopping based on the test loss, or just save always if no test set is provided + good_model = self.test_dataset is None or test_loss < best_loss + if self.config.ckpt_path is not None and good_model: + best_loss = test_loss + self.save_checkpoint() diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/utils.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/utils.py new file mode 100644 index 0000000..1065ce7 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/libraries/mingpt/utils.py @@ -0,0 +1,49 @@ +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float("Inf") + return out + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + """ + take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in + the sequence, feeding the predictions back into the model each time. Clearly the sampling + has quadratic complexity unlike an RNN that is only linear, and has a finite context window + of block_size, unlike an RNN that has an infinite context window. 
+ """ + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = (x if x.size(1) <= block_size else x[:, -block_size:]) # crop context if needed + logits, _ = model(x_cond) + # pluck the logits at the final step and scale by temperature + logits = logits[:, -1, :] / temperature + # optionally crop probabilities to only the top k options + if top_k is not None: + logits = top_k_logits(logits, top_k) + # apply softmax to convert to probabilities + probs = F.softmax(logits, dim=-1) + # sample from the distribution or take the most likely + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + # append to the sequence and continue + x = torch.cat((x, ix), dim=1) + + return x diff --git a/r2sVLA/algos/DP/diffusion_policy/model/bet/utils.py b/r2sVLA/algos/DP/diffusion_policy/model/bet/utils.py new file mode 100644 index 0000000..3b75b73 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/bet/utils.py @@ -0,0 +1,130 @@ +import os + +from collections import OrderedDict +from typing import List, Optional + +import einops +import numpy as np +import torch +import torch.nn as nn + +from torch.utils.data import random_split +import wandb + + +def mlp(input_dim, hidden_dim, output_dim, hidden_depth, output_mod=None): + if hidden_depth == 0: + mods = [nn.Linear(input_dim, output_dim)] + else: + mods = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)] + for i in range(hidden_depth - 1): + mods += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)] + mods.append(nn.Linear(hidden_dim, output_dim)) + if output_mod is not None: + mods.append(output_mod) + trunk = nn.Sequential(*mods) + return trunk + + +class eval_mode: + + def __init__(self, *models, no_grad=False): + self.models = models + self.no_grad = no_grad + self.no_grad_context = torch.no_grad() + + def __enter__(self): + self.prev_states = [] + for model in self.models: + self.prev_states.append(model.training) + model.train(False) + if self.no_grad: + self.no_grad_context.__enter__() + + def __exit__(self, *args): + if self.no_grad: + self.no_grad_context.__exit__(*args) + for model, state in zip(self.models, self.prev_states): + model.train(state) + return False + + +def freeze_module(module: nn.Module) -> nn.Module: + for param in module.parameters(): + param.requires_grad = False + module.eval() + return module + + +def set_seed_everywhere(seed): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + +def shuffle_along_axis(a, axis): + idx = np.random.rand(*a.shape).argsort(axis=axis) + return np.take_along_axis(a, idx, axis=axis) + + +def transpose_batch_timestep(*args): + return (einops.rearrange(arg, "b t ... 
-> t b ...") for arg in args) + + +class TrainWithLogger: + + def reset_log(self): + self.log_components = OrderedDict() + + def log_append(self, log_key, length, loss_components): + for key, value in loss_components.items(): + key_name = f"{log_key}/{key}" + count, sum = self.log_components.get(key_name, (0, 0.0)) + self.log_components[key_name] = ( + count + length, + sum + (length * value.detach().cpu().item()), + ) + + def flush_log(self, epoch, iterator=None): + log_components = OrderedDict() + iterator_log_component = OrderedDict() + for key, value in self.log_components.items(): + count, sum = value + to_log = sum / count + log_components[key] = to_log + # Set the iterator status + log_key, name_key = key.split("/") + iterator_log_name = f"{log_key[0]}{name_key[0]}".upper() + iterator_log_component[iterator_log_name] = to_log + postfix = ",".join("{}:{:.2e}".format(key, iterator_log_component[key]) + for key in iterator_log_component.keys()) + if iterator is not None: + iterator.set_postfix_str(postfix) + wandb.log(log_components, step=epoch) + self.log_components = OrderedDict() + + +class SaveModule(nn.Module): + + def set_snapshot_path(self, path): + self.snapshot_path = path + print(f"Setting snapshot path to {self.snapshot_path}") + + def save_snapshot(self): + os.makedirs(self.snapshot_path, exist_ok=True) + torch.save(self.state_dict(), self.snapshot_path / "snapshot.pth") + + def load_snapshot(self): + self.load_state_dict(torch.load(self.snapshot_path / "snapshot.pth")) + + +def split_datasets(dataset, train_fraction=0.95, random_seed=42): + dataset_length = len(dataset) + lengths = [ + int(train_fraction * dataset_length), + dataset_length - int(train_fraction * dataset_length), + ] + train_set, val_set = random_split(dataset, lengths, generator=torch.Generator().manual_seed(random_seed)) + return train_set, val_set diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/dict_of_tensor_mixin.py b/r2sVLA/algos/DP/diffusion_policy/model/common/dict_of_tensor_mixin.py new file mode 100644 index 0000000..358da9f --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/common/dict_of_tensor_mixin.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn + + +class DictOfTensorMixin(nn.Module): + + def __init__(self, params_dict=None): + super().__init__() + if params_dict is None: + params_dict = nn.ParameterDict() + self.params_dict = params_dict + + @property + def device(self): + return next(iter(self.parameters())).device + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + + def dfs_add(dest, keys, value: torch.Tensor): + if len(keys) == 1: + dest[keys[0]] = value + return + + if keys[0] not in dest: + dest[keys[0]] = nn.ParameterDict() + dfs_add(dest[keys[0]], keys[1:], value) + + def load_dict(state_dict, prefix): + out_dict = nn.ParameterDict() + for key, value in state_dict.items(): + value: torch.Tensor + if key.startswith(prefix): + param_keys = key[len(prefix):].split(".")[1:] + # if len(param_keys) == 0: + # import pdb; pdb.set_trace() + dfs_add(out_dict, param_keys, value.clone()) + return out_dict + + self.params_dict = load_dict(state_dict, prefix + "params_dict") + self.params_dict.requires_grad_(False) + return diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/lr_scheduler.py b/r2sVLA/algos/DP/diffusion_policy/model/common/lr_scheduler.py new file mode 100644 index 0000000..c97f305 --- /dev/null +++ 
b/r2sVLA/algos/DP/diffusion_policy/model/common/lr_scheduler.py @@ -0,0 +1,55 @@ +from diffusers.optimization import ( + Union, + SchedulerType, + Optional, + Optimizer, + TYPE_TO_SCHEDULER_FUNCTION, +) + + +def get_scheduler( + name: Union[str, SchedulerType], + optimizer: Optimizer, + num_warmup_steps: Optional[int] = None, + num_training_steps: Optional[int] = None, + **kwargs, +): + """ + Added kwargs vs diffusers' original implementation + + Unified API to get any scheduler from its name. + + Args: + name (`str` or `SchedulerType`): + The name of the scheduler to use. + optimizer (`torch.optim.Optimizer`): + The optimizer that will be used during training. + num_warmup_steps (`int`, *optional*): + The number of warmup steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + num_training_steps (`int`, *optional*): + The number of training steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + """ + name = SchedulerType(name) + schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] + if name == SchedulerType.CONSTANT: + return schedule_func(optimizer, **kwargs) + + # All other schedulers require `num_warmup_steps` + if num_warmup_steps is None: + raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") + + if name == SchedulerType.CONSTANT_WITH_WARMUP: + return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **kwargs) + + # All other schedulers require `num_training_steps` + if num_training_steps is None: + raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") + + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + **kwargs, + ) diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/module_attr_mixin.py b/r2sVLA/algos/DP/diffusion_policy/model/common/module_attr_mixin.py new file mode 100644 index 0000000..e33efe2 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/common/module_attr_mixin.py @@ -0,0 +1,16 @@ +import torch.nn as nn + + +class ModuleAttrMixin(nn.Module): + + def __init__(self): + super().__init__() + self._dummy_variable = nn.Parameter() + + @property + def device(self): + return next(iter(self.parameters())).device + + @property + def dtype(self): + return next(iter(self.parameters())).dtype diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/normalizer.py b/r2sVLA/algos/DP/diffusion_policy/model/common/normalizer.py new file mode 100644 index 0000000..c8d362c --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/common/normalizer.py @@ -0,0 +1,369 @@ +from typing import Union, Dict + +import unittest +import zarr +import numpy as np +import torch +import torch.nn as nn +from diffusion_policy.common.pytorch_util import dict_apply +from diffusion_policy.model.common.dict_of_tensor_mixin import DictOfTensorMixin + + +class LinearNormalizer(DictOfTensorMixin): + avaliable_modes = ["limits", "gaussian"] + + @torch.no_grad() + def fit( + self, + data: Union[Dict, torch.Tensor, np.ndarray, zarr.Array], + last_n_dims=1, + dtype=torch.float32, + mode="limits", + output_max=1.0, + output_min=-1.0, + range_eps=1e-4, + fit_offset=True, + ): + if isinstance(data, dict): + for key, value in data.items(): + self.params_dict[key] = _fit( + value, + last_n_dims=last_n_dims, + dtype=dtype, +
mode=mode, + output_max=output_max, + output_min=output_min, + range_eps=range_eps, + fit_offset=fit_offset, + ) + else: + self.params_dict["_default"] = _fit( + data, + last_n_dims=last_n_dims, + dtype=dtype, + mode=mode, + output_max=output_max, + output_min=output_min, + range_eps=range_eps, + fit_offset=fit_offset, + ) + + def __call__(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor: + return self.normalize(x) + + def __getitem__(self, key: str): + return SingleFieldLinearNormalizer(self.params_dict[key]) + + def __setitem__(self, key: str, value: "SingleFieldLinearNormalizer"): + self.params_dict[key] = value.params_dict + + def _normalize_impl(self, x, forward=True): + if isinstance(x, dict): + result = dict() + for key, value in x.items(): + params = self.params_dict[key] + result[key] = _normalize(value, params, forward=forward) + return result + else: + if "_default" not in self.params_dict: + raise RuntimeError("Not initialized") + params = self.params_dict["_default"] + return _normalize(x, params, forward=forward) + + def normalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor: + return self._normalize_impl(x, forward=True) + + def unnormalize(self, x: Union[Dict, torch.Tensor, np.ndarray]) -> torch.Tensor: + return self._normalize_impl(x, forward=False) + + def get_input_stats(self) -> Dict: + if len(self.params_dict) == 0: + raise RuntimeError("Not initialized") + if len(self.params_dict) == 1 and "_default" in self.params_dict: + return self.params_dict["_default"]["input_stats"] + + result = dict() + for key, value in self.params_dict.items(): + if key != "_default": + result[key] = value["input_stats"] + return result + + def get_output_stats(self, key="_default"): + input_stats = self.get_input_stats() + if "min" in input_stats: + # no dict + return dict_apply(input_stats, self.normalize) + + result = dict() + for key, group in input_stats.items(): + this_dict = dict() + for name, value in group.items(): + this_dict[name] = self.normalize({key: value})[key] + result[key] = this_dict + return result + + +class SingleFieldLinearNormalizer(DictOfTensorMixin): + avaliable_modes = ["limits", "gaussian"] + + @torch.no_grad() + def fit( + self, + data: Union[torch.Tensor, np.ndarray, zarr.Array], + last_n_dims=1, + dtype=torch.float32, + mode="limits", + output_max=1.0, + output_min=-1.0, + range_eps=1e-4, + fit_offset=True, + ): + self.params_dict = _fit( + data, + last_n_dims=last_n_dims, + dtype=dtype, + mode=mode, + output_max=output_max, + output_min=output_min, + range_eps=range_eps, + fit_offset=fit_offset, + ) + + @classmethod + def create_fit(cls, data: Union[torch.Tensor, np.ndarray, zarr.Array], **kwargs): + obj = cls() + obj.fit(data, **kwargs) + return obj + + @classmethod + def create_manual( + cls, + scale: Union[torch.Tensor, np.ndarray], + offset: Union[torch.Tensor, np.ndarray], + input_stats_dict: Dict[str, Union[torch.Tensor, np.ndarray]], + ): + + def to_tensor(x): + if not isinstance(x, torch.Tensor): + x = torch.from_numpy(x) + x = x.flatten() + return x + + # check + for x in [offset] + list(input_stats_dict.values()): + assert x.shape == scale.shape + assert x.dtype == scale.dtype + + params_dict = nn.ParameterDict({ + "scale": to_tensor(scale), + "offset": to_tensor(offset), + "input_stats": nn.ParameterDict(dict_apply(input_stats_dict, to_tensor)), + }) + return cls(params_dict) + + @classmethod + def create_identity(cls, dtype=torch.float32): + scale = torch.tensor([1], dtype=dtype) + offset = torch.tensor([0], 
dtype=dtype) + input_stats_dict = { + "min": torch.tensor([-1], dtype=dtype), + "max": torch.tensor([1], dtype=dtype), + "mean": torch.tensor([0], dtype=dtype), + "std": torch.tensor([1], dtype=dtype), + } + return cls.create_manual(scale, offset, input_stats_dict) + + def normalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + return _normalize(x, self.params_dict, forward=True) + + def unnormalize(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + return _normalize(x, self.params_dict, forward=False) + + def get_input_stats(self): + return self.params_dict["input_stats"] + + def get_output_stats(self): + return dict_apply(self.params_dict["input_stats"], self.normalize) + + def __call__(self, x: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + return self.normalize(x) + + +def _fit( + data: Union[torch.Tensor, np.ndarray, zarr.Array], + last_n_dims=1, + dtype=torch.float32, + mode="limits", + output_max=1.0, + output_min=-1.0, + range_eps=1e-4, + fit_offset=True, +): + assert mode in ["limits", "gaussian"] + assert last_n_dims >= 0 + assert output_max > output_min + + # convert data to torch and type + if isinstance(data, zarr.Array): + data = data[:] + if isinstance(data, np.ndarray): + data = torch.from_numpy(data) + if dtype is not None: + data = data.type(dtype) + + # convert shape + dim = 1 + if last_n_dims > 0: + dim = np.prod(data.shape[-last_n_dims:]) + data = data.reshape(-1, dim) + + # compute input stats min max mean std + input_min, _ = data.min(axis=0) + input_max, _ = data.max(axis=0) + input_mean = data.mean(axis=0) + input_std = data.std(axis=0) + + # compute scale and offset + if mode == "limits": + if fit_offset: + # unit scale + input_range = input_max - input_min + ignore_dim = input_range < range_eps + input_range[ignore_dim] = output_max - output_min + scale = (output_max - output_min) / input_range + offset = output_min - scale * input_min + offset[ignore_dim] = (output_max + output_min) / 2 - input_min[ignore_dim] + # ignore dims scaled to mean of output max and min + else: + # use this when data is pre-zero-centered. 
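+            # symmetric scaling: keep zero fixed and map the largest absolute value + # of each channel onto the smaller of |output_min| and |output_max|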
+ assert output_max > 0 + assert output_min < 0 + # unit abs + output_abs = min(abs(output_min), abs(output_max)) + input_abs = torch.maximum(torch.abs(input_min), torch.abs(input_max)) + ignore_dim = input_abs < range_eps + input_abs[ignore_dim] = output_abs + # don't scale constant channels + scale = output_abs / input_abs + offset = torch.zeros_like(input_mean) + elif mode == "gaussian": + ignore_dim = input_std < range_eps + scale = input_std.clone() + scale[ignore_dim] = 1 + scale = 1 / scale + + if fit_offset: + offset = -input_mean * scale + else: + offset = torch.zeros_like(input_mean) + + # save + this_params = nn.ParameterDict({ + "scale": + scale, + "offset": + offset, + "input_stats": + nn.ParameterDict({ + "min": input_min, + "max": input_max, + "mean": input_mean, + "std": input_std, + }), + }) + for p in this_params.parameters(): + p.requires_grad_(False) + return this_params + + +def _normalize(x, params, forward=True): + assert "scale" in params + if isinstance(x, np.ndarray): + x = torch.from_numpy(x) + scale = params["scale"] + offset = params["offset"] + x = x.to(device=scale.device, dtype=scale.dtype) + src_shape = x.shape + x = x.reshape(-1, scale.shape[0]) + if forward: + x = x * scale + offset + else: + x = (x - offset) / scale + x = x.reshape(src_shape) + return x + + +def test(): + data = torch.zeros((100, 10, 9, 2)).uniform_() + data[..., 0, 0] = 0 + + normalizer = SingleFieldLinearNormalizer() + normalizer.fit(data, mode="limits", last_n_dims=2) + datan = normalizer.normalize(data) + assert datan.shape == data.shape + assert np.allclose(datan.max(), 1.0) + assert np.allclose(datan.min(), -1.0) + dataun = normalizer.unnormalize(datan) + assert torch.allclose(data, dataun, atol=1e-7) + + input_stats = normalizer.get_input_stats() + output_stats = normalizer.get_output_stats() + + normalizer = SingleFieldLinearNormalizer() + normalizer.fit(data, mode="limits", last_n_dims=1, fit_offset=False) + datan = normalizer.normalize(data) + assert datan.shape == data.shape + assert np.allclose(datan.max(), 1.0, atol=1e-3) + assert np.allclose(datan.min(), 0.0, atol=1e-3) + dataun = normalizer.unnormalize(datan) + assert torch.allclose(data, dataun, atol=1e-7) + + data = torch.zeros((100, 10, 9, 2)).uniform_() + normalizer = SingleFieldLinearNormalizer() + normalizer.fit(data, mode="gaussian", last_n_dims=0) + datan = normalizer.normalize(data) + assert datan.shape == data.shape + assert np.allclose(datan.mean(), 0.0, atol=1e-3) + assert np.allclose(datan.std(), 1.0, atol=1e-3) + dataun = normalizer.unnormalize(datan) + assert torch.allclose(data, dataun, atol=1e-7) + + # dict + data = torch.zeros((100, 10, 9, 2)).uniform_() + data[..., 0, 0] = 0 + + normalizer = LinearNormalizer() + normalizer.fit(data, mode="limits", last_n_dims=2) + datan = normalizer.normalize(data) + assert datan.shape == data.shape + assert np.allclose(datan.max(), 1.0) + assert np.allclose(datan.min(), -1.0) + dataun = normalizer.unnormalize(datan) + assert torch.allclose(data, dataun, atol=1e-7) + + input_stats = normalizer.get_input_stats() + output_stats = normalizer.get_output_stats() + + data = { + "obs": torch.zeros((1000, 128, 9, 2)).uniform_() * 512, + "action": torch.zeros((1000, 128, 2)).uniform_() * 512, + } + normalizer = LinearNormalizer() + normalizer.fit(data) + datan = normalizer.normalize(data) + dataun = normalizer.unnormalize(datan) + for key in data: + assert torch.allclose(data[key], dataun[key], atol=1e-4) + + input_stats =
normalizer.get_input_stats() + output_stats = normalizer.get_output_stats() + + state_dict = normalizer.state_dict() + n = LinearNormalizer() + n.load_state_dict(state_dict) + datan = n.normalize(data) + dataun = n.unnormalize(datan) + for key in data: + assert torch.allclose(data[key], dataun[key], atol=1e-4) diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/rotation_transformer.py b/r2sVLA/algos/DP/diffusion_policy/model/common/rotation_transformer.py new file mode 100644 index 0000000..faf182a --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/common/rotation_transformer.py @@ -0,0 +1,97 @@ +from typing import Union +import pytorch3d.transforms as pt +import torch +import numpy as np +import functools + + +class RotationTransformer: + valid_reps = ["axis_angle", "euler_angles", "quaternion", "rotation_6d", "matrix"] + + def __init__( + self, + from_rep="axis_angle", + to_rep="rotation_6d", + from_convention=None, + to_convention=None, + ): + """ + Valid representations + + Always use matrix as intermediate representation. + """ + assert from_rep != to_rep + assert from_rep in self.valid_reps + assert to_rep in self.valid_reps + if from_rep == "euler_angles": + assert from_convention is not None + if to_rep == "euler_angles": + assert to_convention is not None + + forward_funcs = list() + inverse_funcs = list() + + if from_rep != "matrix": + funcs = [ + getattr(pt, f"{from_rep}_to_matrix"), + getattr(pt, f"matrix_to_{from_rep}"), + ] + if from_convention is not None: + funcs = [functools.partial(func, convention=from_convention) for func in funcs] + forward_funcs.append(funcs[0]) + inverse_funcs.append(funcs[1]) + + if to_rep != "matrix": + funcs = [ + getattr(pt, f"matrix_to_{to_rep}"), + getattr(pt, f"{to_rep}_to_matrix"), + ] + if to_convention is not None: + funcs = [functools.partial(func, convention=to_convention) for func in funcs] + forward_funcs.append(funcs[0]) + inverse_funcs.append(funcs[1]) + + inverse_funcs = inverse_funcs[::-1] + + self.forward_funcs = forward_funcs + self.inverse_funcs = inverse_funcs + + @staticmethod + def _apply_funcs(x: Union[np.ndarray, torch.Tensor], funcs: list) -> Union[np.ndarray, torch.Tensor]: + x_ = x + if isinstance(x, np.ndarray): + x_ = torch.from_numpy(x) + x_: torch.Tensor + for func in funcs: + x_ = func(x_) + y = x_ + if isinstance(x, np.ndarray): + y = x_.numpy() + return y + + def forward(self, x: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + return self._apply_funcs(x, self.forward_funcs) + + def inverse(self, x: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + return self._apply_funcs(x, self.inverse_funcs) + + +def test(): + tf = RotationTransformer() + + rotvec = np.random.uniform(-2 * np.pi, 2 * np.pi, size=(1000, 3)) + rot6d = tf.forward(rotvec) + new_rotvec = tf.inverse(rot6d) + + from scipy.spatial.transform import Rotation + + diff = Rotation.from_rotvec(rotvec) * Rotation.from_rotvec(new_rotvec).inv() + dist = diff.magnitude() + assert dist.max() < 1e-7 + + tf = RotationTransformer("rotation_6d", "matrix") + rot6d_wrong = rot6d + np.random.normal(scale=0.1, size=rot6d.shape) + mat = tf.forward(rot6d_wrong) + mat_det = np.linalg.det(mat) + assert np.allclose(mat_det, 1) + # rotation_6d will be normalized to a valid rotation matrix diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/shape_util.py b/r2sVLA/algos/DP/diffusion_policy/model/common/shape_util.py new file mode 100644 index 0000000..2445d8a --- /dev/null +++
b/r2sVLA/algos/DP/diffusion_policy/model/common/shape_util.py @@ -0,0 +1,22 @@ +from typing import Dict, List, Tuple, Callable +import torch +import torch.nn as nn + + +def get_module_device(m: nn.Module): + device = torch.device("cpu") + try: + param = next(iter(m.parameters())) + device = param.device + except StopIteration: + pass + return device + + +@torch.no_grad() +def get_output_shape(input_shape: Tuple[int], net: Callable[[torch.Tensor], torch.Tensor]): + device = get_module_device(net) + test_input = torch.zeros((1, ) + tuple(input_shape), device=device) + test_output = net(test_input) + output_shape = tuple(test_output.shape[1:]) + return output_shape diff --git a/r2sVLA/algos/DP/diffusion_policy/model/common/tensor_util.py b/r2sVLA/algos/DP/diffusion_policy/model/common/tensor_util.py new file mode 100644 index 0000000..f0fc7dd --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/common/tensor_util.py @@ -0,0 +1,972 @@ +""" +A collection of utilities for working with nested tensor structures consisting +of numpy arrays and torch tensors. +""" + +import collections +import numpy as np +import torch + + +def recursive_dict_list_tuple_apply(x, type_func_dict): + """ + Recursively apply functions to a nested dictionary or list or tuple, given a dictionary of + {data_type: function_to_apply}. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + type_func_dict (dict): a mapping from data types to the functions to be + applied for each data type. + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + assert list not in type_func_dict + assert tuple not in type_func_dict + assert dict not in type_func_dict + + if isinstance(x, (dict, collections.OrderedDict)): + new_x = (collections.OrderedDict() if isinstance(x, collections.OrderedDict) else dict()) + for k, v in x.items(): + new_x[k] = recursive_dict_list_tuple_apply(v, type_func_dict) + return new_x + elif isinstance(x, (list, tuple)): + ret = [recursive_dict_list_tuple_apply(v, type_func_dict) for v in x] + if isinstance(x, tuple): + ret = tuple(ret) + return ret + else: + for t, f in type_func_dict.items(): + if isinstance(x, t): + return f(x) + else: + raise NotImplementedError("Cannot handle data type %s" % str(type(x))) + + +def map_tensor(x, func): + """ + Apply function @func to torch.Tensor objects in a nested dictionary or + list or tuple. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + func (function): function to apply to each tensor + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: func, + type(None): lambda x: x, + }, + ) + + +def map_ndarray(x, func): + """ + Apply function @func to np.ndarray objects in a nested dictionary or + list or tuple. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + func (function): function to apply to each array + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + np.ndarray: func, + type(None): lambda x: x, + }, + ) + + +def map_tensor_ndarray(x, tensor_func, ndarray_func): + """ + Apply function @tensor_func to torch.Tensor objects and @ndarray_func to + np.ndarray objects in a nested dictionary or list or tuple. 
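+ + Example (hypothetical batch dict): cast tensors and arrays in one pass with out = map_tensor_ndarray(batch, lambda t: t.float(), lambda a: a.astype(np.float32)).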
+ + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + tensor_func (function): function to apply to each tensor + ndarray_func (function): function to apply to each array + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: tensor_func, + np.ndarray: ndarray_func, + type(None): lambda x: x, + }, + ) + + +def clone(x): + """ + Clones all torch tensors and numpy arrays in nested dictionary or list + or tuple and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.clone(), + np.ndarray: lambda x: x.copy(), + type(None): lambda x: x, + }, + ) + + +def detach(x): + """ + Detaches all torch tensors in nested dictionary or list + or tuple and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.detach(), + }, + ) + + +def to_batch(x): + """ + Introduces a leading batch dimension of 1 for all torch tensors and numpy + arrays in nested dictionary or list or tuple and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x[None, ...], + np.ndarray: lambda x: x[None, ...], + type(None): lambda x: x, + }, + ) + + +def to_sequence(x): + """ + Introduces a time dimension of 1 at dimension 1 for all torch tensors and numpy + arrays in nested dictionary or list or tuple and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x[:, None, ...], + np.ndarray: lambda x: x[:, None, ...], + type(None): lambda x: x, + }, + ) + + +def index_at_time(x, ind): + """ + Indexes all torch tensors and numpy arrays in dimension 1 with index @ind in + nested dictionary or list or tuple and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + ind (int): index + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x[:, ind, ...], + np.ndarray: lambda x: x[:, ind, ...], + type(None): lambda x: x, + }, + ) + + +def unsqueeze(x, dim): + """ + Adds dimension of size 1 at dimension @dim in all torch tensors and numpy arrays + in nested dictionary or list or tuple and returns a new nested structure.
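+ + Example: with x = {"a": torch.zeros(4, 3)}, unsqueeze(x, dim=1)["a"] has shape (4, 1, 3).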
+ + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + dim (int): dimension + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.unsqueeze(dim=dim), + np.ndarray: lambda x: np.expand_dims(x, axis=dim), + type(None): lambda x: x, + }, + ) + + +def contiguous(x): + """ + Makes all torch tensors and numpy arrays contiguous in nested dictionary or + list or tuple and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.contiguous(), + np.ndarray: lambda x: np.ascontiguousarray(x), + type(None): lambda x: x, + }, + ) + + +def to_device(x, device): + """ + Sends all torch tensors in nested dictionary or list or tuple to device + @device, and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + device (torch.Device): device to send tensors to + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x, d=device: x.to(d), + type(None): lambda x: x, + }, + ) + + +def to_tensor(x): + """ + Converts all numpy arrays in nested dictionary or list or tuple to + torch tensors (and leaves existing torch Tensors as-is), and returns + a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x, + np.ndarray: lambda x: torch.from_numpy(x), + type(None): lambda x: x, + }, + ) + + +def to_numpy(x): + """ + Converts all torch tensors in nested dictionary or list or tuple to + numpy (and leaves existing numpy arrays as-is), and returns + a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + + def f(tensor): + if tensor.is_cuda: + return tensor.detach().cpu().numpy() + else: + return tensor.detach().numpy() + + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: f, + np.ndarray: lambda x: x, + type(None): lambda x: x, + }, + ) + + +def to_list(x): + """ + Converts all torch tensors and numpy arrays in nested dictionary or list + or tuple to a list, and returns a new nested structure. Useful for + json encoding. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + + def f(tensor): + if tensor.is_cuda: + return tensor.detach().cpu().numpy().tolist() + else: + return tensor.detach().numpy().tolist() + + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: f, + np.ndarray: lambda x: x.tolist(), + type(None): lambda x: x, + }, + ) + + +def to_float(x): + """ + Converts all torch tensors and numpy arrays in nested dictionary or list + or tuple to float type entries, and returns a new nested structure. 
+ + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.float(), + np.ndarray: lambda x: x.astype(np.float32), + type(None): lambda x: x, + }, + ) + + +def to_uint8(x): + """ + Converts all torch tensors and numpy arrays in nested dictionary or list + or tuple to uint8 type entries, and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.byte(), + np.ndarray: lambda x: x.astype(np.uint8), + type(None): lambda x: x, + }, + ) + + +def to_torch(x, device): + """ + Converts all numpy arrays and torch tensors in nested dictionary or list or tuple to + torch tensors on device @device and returns a new nested structure. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + device (torch.Device): device to send tensors to + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return to_device(to_float(to_tensor(x)), device) + + +def to_one_hot_single(tensor, num_class): + """ + Convert tensor to one-hot representation, assuming a certain number of total class labels. + + Args: + tensor (torch.Tensor): tensor containing integer labels + num_class (int): number of classes + + Returns: + x (torch.Tensor): tensor containing one-hot representation of labels + """ + x = torch.zeros(tensor.size() + (num_class, )).to(tensor.device) + x.scatter_(-1, tensor.unsqueeze(-1), 1) + return x + + +def to_one_hot(tensor, num_class): + """ + Convert all tensors in nested dictionary or list or tuple to one-hot representation, + assuming a certain number of total class labels. + + Args: + tensor (dict or list or tuple): a possibly nested dictionary or list or tuple + num_class (int): number of classes + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return map_tensor(tensor, func=lambda x, nc=num_class: to_one_hot_single(x, nc)) + + +def flatten_single(x, begin_axis=1): + """ + Flatten a tensor in all dimensions from @begin_axis onwards. + + Args: + x (torch.Tensor): tensor to flatten + begin_axis (int): which axis to flatten from + + Returns: + y (torch.Tensor): flattened tensor + """ + fixed_size = x.size()[:begin_axis] + _s = list(fixed_size) + [-1] + return x.reshape(*_s) + + +def flatten(x, begin_axis=1): + """ + Flatten all tensors in nested dictionary or list or tuple, from @begin_axis onwards. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + begin_axis (int): which axis to flatten from + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x, b=begin_axis: flatten_single(x, begin_axis=b), + }, + ) + + +def reshape_dimensions_single(x, begin_axis, end_axis, target_dims): + """ + Reshape selected dimensions in a tensor to a target dimension. 
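+ + Example: reshape_dimensions_single(torch.zeros(4, 10, 5), begin_axis=1, end_axis=2, target_dims=(50,)) returns a tensor of shape (4, 50).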
+ + Args: + x (torch.Tensor): tensor to reshape + begin_axis (int): begin dimension + end_axis (int): end dimension + target_dims (tuple or list): target shape for the range of dimensions + (@begin_axis, @end_axis) + + Returns: + y (torch.Tensor): reshaped tensor + """ + assert begin_axis <= end_axis + assert begin_axis >= 0 + assert end_axis < len(x.shape) + assert isinstance(target_dims, (tuple, list)) + s = x.shape + final_s = [] + for i in range(len(s)): + if i == begin_axis: + final_s.extend(target_dims) + elif i < begin_axis or i > end_axis: + final_s.append(s[i]) + return x.reshape(*final_s) + + +def reshape_dimensions(x, begin_axis, end_axis, target_dims): + """ + Reshape selected dimensions for all tensors in nested dictionary or list or tuple + to a target dimension. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + begin_axis (int): begin dimension + end_axis (int): end dimension + target_dims (tuple or list): target shape for the range of dimensions + (@begin_axis, @end_axis) + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: + lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single( + x, begin_axis=b, end_axis=e, target_dims=t), + np.ndarray: + lambda x, b=begin_axis, e=end_axis, t=target_dims: reshape_dimensions_single( + x, begin_axis=b, end_axis=e, target_dims=t), + type(None): + lambda x: x, + }, + ) + + +def join_dimensions(x, begin_axis, end_axis): + """ + Joins all dimensions between dimensions (@begin_axis, @end_axis) into a flat dimension, for + all tensors in nested dictionary or list or tuple. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + begin_axis (int): begin dimension + end_axis (int): end dimension + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: + lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(x, begin_axis=b, end_axis=e, target_dims=[-1] + ), + np.ndarray: + lambda x, b=begin_axis, e=end_axis: reshape_dimensions_single(x, begin_axis=b, end_axis=e, target_dims=[-1] + ), + type(None): + lambda x: x, + }, + ) + + +def expand_at_single(x, size, dim): + """ + Expand a tensor at a single dimension @dim by @size + + Args: + x (torch.Tensor): input tensor + size (int): size to expand + dim (int): dimension to expand + + Returns: + y (torch.Tensor): expanded tensor + """ + assert dim < x.ndimension() + assert x.shape[dim] == 1 + expand_dims = [-1] * x.ndimension() + expand_dims[dim] = size + return x.expand(*expand_dims) + + +def expand_at(x, size, dim): + """ + Expand all tensors in nested dictionary or list or tuple at a single + dimension @dim by @size. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + size (int): size to expand + dim (int): dimension to expand + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return map_tensor(x, lambda t, s=size, d=dim: expand_at_single(t, s, d)) + + +def unsqueeze_expand_at(x, size, dim): + """ + Unsqueeze and expand a tensor at a dimension @dim by @size. 
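+ + Example: for x of shape (B, D), unsqueeze_expand_at(x, size=T, dim=1) yields shape (B, T, D); expand avoids copying the underlying data.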
+ + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + size (int): size to expand + dim (int): dimension to unsqueeze and expand + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + x = unsqueeze(x, dim) + return expand_at(x, size, dim) + + +def repeat_by_expand_at(x, repeats, dim): + """ + Repeat a dimension by combining expand and reshape operations. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + repeats (int): number of times to repeat the target dimension + dim (int): dimension to repeat on + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + x = unsqueeze_expand_at(x, repeats, dim + 1) + return join_dimensions(x, dim, dim + 1) + + +def named_reduce_single(x, reduction, dim): + """ + Reduce tensor at a dimension by named reduction functions. + + Args: + x (torch.Tensor): tensor to be reduced + reduction (str): one of ["sum", "max", "mean", "flatten"] + dim (int): dimension to be reduced (or begin axis for flatten) + + Returns: + y (torch.Tensor): reduced tensor + """ + assert x.ndimension() > dim + assert reduction in ["sum", "max", "mean", "flatten"] + if reduction == "flatten": + x = flatten(x, begin_axis=dim) + elif reduction == "max": + x = torch.max(x, dim=dim)[0] # [B, D] + elif reduction == "sum": + x = torch.sum(x, dim=dim) + else: + x = torch.mean(x, dim=dim) + return x + + +def named_reduce(x, reduction, dim): + """ + Reduces all tensors in nested dictionary or list or tuple at a dimension + using a named reduction function. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + reduction (str): one of ["sum", "max", "mean", "flatten"] + dim (int): dimension to be reduced (or begin axis for flatten) + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return map_tensor(x, func=lambda t, r=reduction, d=dim: named_reduce_single(t, r, d)) + + +def gather_along_dim_with_dim_single(x, target_dim, source_dim, indices): + """ + This function indexes out a target dimension of a tensor in a structured way, + by allowing a different value to be selected for each member of a flat index + tensor (@indices) corresponding to a source dimension. This can be interpreted + as moving along the source dimension, using the corresponding index value + in @indices to select values for all other dimensions outside of the + source and target dimensions. A common use case is to gather values + in target dimension 1 for each batch member (target dimension 0). 
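+ + Example: for x of shape (B, T, D) with target_dim=1, source_dim=0 and indices of shape (B,), the output has shape (B, D) and output[b] equals x[b, indices[b]].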
+ + Args: + x (torch.Tensor): tensor to gather values for + target_dim (int): dimension to gather values along + source_dim (int): dimension to hold constant and use for gathering values + from the other dimensions + indices (torch.Tensor): flat index tensor with same shape as tensor @x along + @source_dim + + Returns: + y (torch.Tensor): gathered tensor, with dimension @target_dim indexed out + """ + assert len(indices.shape) == 1 + assert x.shape[source_dim] == indices.shape[0] + + # unsqueeze in all dimensions except the source dimension + new_shape = [1] * x.ndimension() + new_shape[source_dim] = -1 + indices = indices.reshape(*new_shape) + + # repeat in all dimensions - but preserve shape of source dimension, + # and make sure target_dimension has singleton dimension + expand_shape = list(x.shape) + expand_shape[source_dim] = -1 + expand_shape[target_dim] = 1 + indices = indices.expand(*expand_shape) + + out = x.gather(dim=target_dim, index=indices) + return out.squeeze(target_dim) + + +def gather_along_dim_with_dim(x, target_dim, source_dim, indices): + """ + Apply @gather_along_dim_with_dim_single to all tensors in a nested + dictionary or list or tuple. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + target_dim (int): dimension to gather values along + source_dim (int): dimension to hold constant and use for gathering values + from the other dimensions + indices (torch.Tensor): flat index tensor with same shape as tensor @x along + @source_dim + + Returns: + y (dict or list or tuple): new nested dict-list-tuple + """ + return map_tensor( + x, + lambda y, t=target_dim, s=source_dim, i=indices: gather_along_dim_with_dim_single(y, t, s, i), + ) + + +def gather_sequence_single(seq, indices): + """ + Given a tensor with leading dimensions [B, T, ...], gather an element from each sequence in + the batch given an index for each sequence. + + Args: + seq (torch.Tensor): tensor with leading dimensions [B, T, ...] + indices (torch.Tensor): tensor indices of shape [B] + + Return: + y (torch.Tensor): indexed tensor of shape [B, ....] + """ + return gather_along_dim_with_dim_single(seq, target_dim=1, source_dim=0, indices=indices) + + +def gather_sequence(seq, indices): + """ + Given a nested dictionary or list or tuple, gathers an element from each sequence of the batch + for tensors with leading dimensions [B, T, ...]. + + Args: + seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors + of leading dimensions [B, T, ...] + indices (torch.Tensor): tensor indices of shape [B] + + Returns: + y (dict or list or tuple): new nested dict-list-tuple with tensors of shape [B, ...] + """ + return gather_along_dim_with_dim(seq, target_dim=1, source_dim=0, indices=indices) + + +def pad_sequence_single(seq, padding, batched=False, pad_same=True, pad_values=None): + """ + Pad input tensor or array @seq in the time dimension (dimension 1). + + Args: + seq (np.ndarray or torch.Tensor): sequence to be padded + padding (tuple): begin and end padding, e.g. 
[1, 1] pads both begin and end of the sequence by 1 + batched (bool): if sequence has the batch dimension + pad_same (bool): if pad by duplicating + pad_values (float): value to pad with if not pad_same + + Returns: + padded sequence (np.ndarray or torch.Tensor) + """ + assert isinstance(seq, (np.ndarray, torch.Tensor)) + assert pad_same or pad_values is not None + if pad_values is not None: + assert isinstance(pad_values, float) + repeat_func = np.repeat if isinstance(seq, np.ndarray) else torch.repeat_interleave + concat_func = np.concatenate if isinstance(seq, np.ndarray) else torch.cat + ones_like_func = np.ones_like if isinstance(seq, np.ndarray) else torch.ones_like + seq_dim = 1 if batched else 0 + + begin_pad = [] + end_pad = [] + + if padding[0] > 0: + pad = seq[[0]] if pad_same else ones_like_func(seq[[0]]) * pad_values + begin_pad.append(repeat_func(pad, padding[0], seq_dim)) + if padding[1] > 0: + pad = seq[[-1]] if pad_same else ones_like_func(seq[[-1]]) * pad_values + end_pad.append(repeat_func(pad, padding[1], seq_dim)) + + return concat_func(begin_pad + [seq] + end_pad, seq_dim) + + +def pad_sequence(seq, padding, batched=False, pad_same=True, pad_values=None): + """ + Pad a nested dictionary or list or tuple of sequence tensors in the time dimension (dimension 1). + + Args: + seq (dict or list or tuple): a possibly nested dictionary or list or tuple with tensors + of leading dimensions [B, T, ...] + padding (tuple): begin and end padding, e.g. [1, 1] pads both begin and end of the sequence by 1 + batched (bool): if sequence has the batch dimension + pad_same (bool): if pad by duplicating + pad_values (float): value to pad with if not pad_same + + Returns: + padded sequence (dict or list or tuple) + """ + return recursive_dict_list_tuple_apply( + seq, + { + torch.Tensor: + lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(x, p, b, ps, pv), + np.ndarray: + lambda x, p=padding, b=batched, ps=pad_same, pv=pad_values: pad_sequence_single(x, p, b, ps, pv), + type(None): lambda x: x, + }, + ) + + +def assert_size_at_dim_single(x, size, dim, msg): + """ + Ensure that array or tensor @x has size @size in dim @dim. + + Args: + x (np.ndarray or torch.Tensor): input array or tensor + size (int): size that tensors should have at @dim + dim (int): dimension to check + msg (str): text to display if assertion fails + """ + assert x.shape[dim] == size, msg + + +def assert_size_at_dim(x, size, dim, msg): + """ + Ensure that arrays and tensors in nested dictionary or list or tuple have + size @size in dim @dim. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + size (int): size that tensors should have at @dim + dim (int): dimension to check + msg (str): text to display if assertion fails + """ + map_tensor(x, lambda t, s=size, d=dim, m=msg: assert_size_at_dim_single(t, s, d, m)) + + +def get_shape(x): + """ + Get all shapes of arrays and tensors in nested dictionary or list or tuple. + + Args: + x (dict or list or tuple): a possibly nested dictionary or list or tuple + + Returns: + y (dict or list or tuple): new nested dict-list-tuple that contains each array or + tensor's shape + """ + return recursive_dict_list_tuple_apply( + x, + { + torch.Tensor: lambda x: x.shape, + np.ndarray: lambda x: x.shape, + type(None): lambda x: x, + }, + ) + + +def list_of_flat_dict_to_dict_of_list(list_of_dict): + """ + Helper function to go from a list of flat dictionaries to a dictionary of lists.
+ By "flat" we mean that none of the values are dictionaries, but are numpy arrays, + floats, etc. + + Args: + list_of_dict (list): list of flat dictionaries + + Returns: + dict_of_list (dict): dictionary of lists + """ + assert isinstance(list_of_dict, list) + dic = collections.OrderedDict() + for i in range(len(list_of_dict)): + for k in list_of_dict[i]: + if k not in dic: + dic[k] = [] + dic[k].append(list_of_dict[i][k]) + return dic + + +def flatten_nested_dict_list(d, parent_key="", sep="_", item_key=""): + """ + Flatten a nested dict or list to a list. + + For example, given a dict + { + a: 1 + b: { + c: 2 + } + c: 3 + } + + the function would return [(a, 1), (b_c, 2), (c, 3)] + + Args: + d (dict, list): a nested dict or list to be flattened + parent_key (str): recursion helper + sep (str): separator for nesting keys + item_key (str): recursion helper + Returns: + list: a list of (key, value) tuples + """ + items = [] + if isinstance(d, (tuple, list)): + new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key + for i, v in enumerate(d): + items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=str(i))) + return items + elif isinstance(d, dict): + new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key + for k, v in d.items(): + assert isinstance(k, str) + items.extend(flatten_nested_dict_list(v, new_key, sep=sep, item_key=k)) + return items + else: + new_key = parent_key + sep + item_key if len(parent_key) > 0 else item_key + return [(new_key, d)] + + +def time_distributed(inputs, op, activation=None, inputs_as_kwargs=False, inputs_as_args=False, **kwargs): + """ + Apply function @op to all tensors in nested dictionary or list or tuple @inputs in both the + batch (B) and time (T) dimension, where the tensors are expected to have shape [B, T, ...]. + Will do this by reshaping tensors to [B * T, ...], passing through the op, and then reshaping + outputs to [B, T, ...]. + + Args: + inputs (list or tuple or dict): a possibly nested dictionary or list or tuple with tensors + of leading dimensions [B, T, ...] + op: a layer op that accepts inputs + activation: activation to apply at the output + inputs_as_kwargs (bool): whether to feed input as a kwargs dict to the op + inputs_as_args (bool) whether to feed input as a args list to the op + kwargs (dict): other kwargs to supply to the op + + Returns: + outputs (dict or list or tuple): new nested dict-list-tuple with tensors of leading dimension [B, T]. 
+ """ + batch_size, seq_len = flatten_nested_dict_list(inputs)[0][1].shape[:2] + inputs = join_dimensions(inputs, 0, 1) + if inputs_as_kwargs: + outputs = op(**inputs, **kwargs) + elif inputs_as_args: + outputs = op(*inputs, **kwargs) + else: + outputs = op(inputs, **kwargs) + + if activation is not None: + outputs = map_tensor(outputs, activation) + outputs = reshape_dimensions(outputs, begin_axis=0, end_axis=0, target_dims=(batch_size, seq_len)) + return outputs diff --git a/r2sVLA/algos/DP/diffusion_policy/model/diffusion/conditional_unet1d.py b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/conditional_unet1d.py new file mode 100644 index 0000000..ea6616e --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/conditional_unet1d.py @@ -0,0 +1,278 @@ +from typing import Union +import logging +import torch +import torch.nn as nn +import einops +from einops.layers.torch import Rearrange + +from diffusion_policy.model.diffusion.conv1d_components import ( + Downsample1d, + Upsample1d, + Conv1dBlock, +) +from diffusion_policy.model.diffusion.positional_embedding import SinusoidalPosEmb + +logger = logging.getLogger(__name__) + + +class ConditionalResidualBlock1D(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + cond_dim, + kernel_size=3, + n_groups=8, + cond_predict_scale=False, + ): + super().__init__() + + self.blocks = nn.ModuleList([ + Conv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups), + Conv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups), + ]) + + # FiLM modulation https://arxiv.org/abs/1709.07871 + # predicts per-channel scale and bias + cond_channels = out_channels + if cond_predict_scale: + cond_channels = out_channels * 2 + self.cond_predict_scale = cond_predict_scale + self.out_channels = out_channels + self.cond_encoder = nn.Sequential( + nn.Mish(), + nn.Linear(cond_dim, cond_channels), + Rearrange("batch t -> batch t 1"), + ) + + # make sure dimensions compatible + self.residual_conv = (nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()) + + def forward(self, x, cond): + """ + x : [ batch_size x in_channels x horizon ] + cond : [ batch_size x cond_dim] + + returns: + out : [ batch_size x out_channels x horizon ] + """ + out = self.blocks[0](x) + embed = self.cond_encoder(cond) + if self.cond_predict_scale: + embed = embed.reshape(embed.shape[0], 2, self.out_channels, 1) + scale = embed[:, 0, ...] + bias = embed[:, 1, ...] 
+ out = scale * out + bias + else: + out = out + embed + out = self.blocks[1](out) + out = out + self.residual_conv(x) + return out + + +class ConditionalUnet1D(nn.Module): + + def __init__( + self, + input_dim, + local_cond_dim=None, + global_cond_dim=None, + diffusion_step_embed_dim=256, + down_dims=[256, 512, 1024], + kernel_size=3, + n_groups=8, + cond_predict_scale=False, + ): + super().__init__() + all_dims = [input_dim] + list(down_dims) + start_dim = down_dims[0] + + dsed = diffusion_step_embed_dim + diffusion_step_encoder = nn.Sequential( + SinusoidalPosEmb(dsed), + nn.Linear(dsed, dsed * 4), + nn.Mish(), + nn.Linear(dsed * 4, dsed), + ) + cond_dim = dsed + if global_cond_dim is not None: + cond_dim += global_cond_dim + + in_out = list(zip(all_dims[:-1], all_dims[1:])) + + local_cond_encoder = None + if local_cond_dim is not None: + _, dim_out = in_out[0] + dim_in = local_cond_dim + local_cond_encoder = nn.ModuleList([ + # down encoder + ConditionalResidualBlock1D( + dim_in, + dim_out, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + # up encoder + ConditionalResidualBlock1D( + dim_in, + dim_out, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + ]) + + mid_dim = all_dims[-1] + self.mid_modules = nn.ModuleList([ + ConditionalResidualBlock1D( + mid_dim, + mid_dim, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + ConditionalResidualBlock1D( + mid_dim, + mid_dim, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + ]) + + down_modules = nn.ModuleList([]) + for ind, (dim_in, dim_out) in enumerate(in_out): + is_last = ind >= (len(in_out) - 1) + down_modules.append( + nn.ModuleList([ + ConditionalResidualBlock1D( + dim_in, + dim_out, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + ConditionalResidualBlock1D( + dim_out, + dim_out, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + Downsample1d(dim_out) if not is_last else nn.Identity(), + ])) + + up_modules = nn.ModuleList([]) + for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])): + is_last = ind >= (len(in_out) - 1) + up_modules.append( + nn.ModuleList([ + ConditionalResidualBlock1D( + dim_out * 2, + dim_in, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + ConditionalResidualBlock1D( + dim_in, + dim_in, + cond_dim=cond_dim, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ), + Upsample1d(dim_in) if not is_last else nn.Identity(), + ])) + + final_conv = nn.Sequential( + Conv1dBlock(start_dim, start_dim, kernel_size=kernel_size), + nn.Conv1d(start_dim, input_dim, 1), + ) + + self.diffusion_step_encoder = diffusion_step_encoder + self.local_cond_encoder = local_cond_encoder + self.up_modules = up_modules + self.down_modules = down_modules + self.final_conv = final_conv + + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def forward(self, + sample: torch.Tensor, + timestep: Union[torch.Tensor, float, int], + local_cond=None, + global_cond=None, + **kwargs): + """ + x: (B,T,input_dim) + timestep: (B,) or int, diffusion step + local_cond: (B,T,local_cond_dim) + global_cond: (B,global_cond_dim) 
+ output: (B,T,input_dim) + """ + sample = einops.rearrange(sample, "b h t -> b t h") + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + global_feature = self.diffusion_step_encoder(timesteps) + + if global_cond is not None: + global_feature = torch.cat([global_feature, global_cond], axis=-1) + + # encode local features + h_local = list() + if local_cond is not None: + local_cond = einops.rearrange(local_cond, "b h t -> b t h") + resnet, resnet2 = self.local_cond_encoder + x = resnet(local_cond, global_feature) + h_local.append(x) + x = resnet2(local_cond, global_feature) + h_local.append(x) + + x = sample + h = [] + for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules): + x = resnet(x, global_feature) + if idx == 0 and len(h_local) > 0: + x = x + h_local[0] + x = resnet2(x, global_feature) + h.append(x) + x = downsample(x) + + for mid_module in self.mid_modules: + x = mid_module(x, global_feature) + + for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules): + x = torch.cat((x, h.pop()), dim=1) + x = resnet(x, global_feature) + # The correct condition should be: + # if idx == (len(self.up_modules)-1) and len(h_local) > 0: + # However this change will break compatibility with published checkpoints. + # Therefore it is left as a comment. + if idx == len(self.up_modules) and len(h_local) > 0: + x = x + h_local[1] + x = resnet2(x, global_feature) + x = upsample(x) + + x = self.final_conv(x) + + x = einops.rearrange(x, "b t h -> b h t") + return x diff --git a/r2sVLA/algos/DP/diffusion_policy/model/diffusion/conv1d_components.py b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/conv1d_components.py new file mode 100644 index 0000000..163ed05 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/conv1d_components.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +# from einops.layers.torch import Rearrange + + +class Downsample1d(nn.Module): + + def __init__(self, dim): + super().__init__() + self.conv = nn.Conv1d(dim, dim, 3, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class Upsample1d(nn.Module): + + def __init__(self, dim): + super().__init__() + self.conv = nn.ConvTranspose1d(dim, dim, 4, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class Conv1dBlock(nn.Module): + """ + Conv1d --> GroupNorm --> Mish + """ + + def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8): + super().__init__() + + self.block = nn.Sequential( + nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2), + # Rearrange('batch channels horizon -> batch channels 1 horizon'), + nn.GroupNorm(n_groups, out_channels), + # Rearrange('batch channels 1 horizon -> batch channels horizon'), + nn.Mish(), + ) + + def forward(self, x): + return self.block(x) + + +def test(): + cb = Conv1dBlock(256, 128, kernel_size=3) + x = torch.zeros((1, 256, 16)) + o = cb(x) diff --git a/r2sVLA/algos/DP/diffusion_policy/model/diffusion/ema_model.py b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/ema_model.py new file mode 100644 index 0000000..c6835f7 --- 
/dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/ema_model.py
@@ -0,0 +1,89 @@
+import copy
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+
+class EMAModel:
+    """
+    Exponential Moving Average of model weights
+    """
+
+    def __init__(
+        self,
+        model,
+        update_after_step=0,
+        inv_gamma=1.0,
+        power=2 / 3,
+        min_value=0.0,
+        max_value=0.9999,
+    ):
+        """
+        @crowsonkb's notes on EMA Warmup:
+            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
+            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
+            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
+            at 215.4k steps).
+        Args:
+            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
+            power (float): Exponential factor of EMA warmup. Default: 2/3.
+            min_value (float): The minimum EMA decay rate. Default: 0.
+            max_value (float): The maximum EMA decay rate. Default: 0.9999.
+        """
+
+        self.averaged_model = model
+        self.averaged_model.eval()
+        self.averaged_model.requires_grad_(False)
+
+        self.update_after_step = update_after_step
+        self.inv_gamma = inv_gamma
+        self.power = power
+        self.min_value = min_value
+        self.max_value = max_value
+
+        self.decay = 0.0
+        self.optimization_step = 0
+
+    def get_decay(self, optimization_step):
+        """
+        Compute the decay factor for the exponential moving average.
+        """
+        step = max(0, optimization_step - self.update_after_step - 1)
+        value = 1 - (1 + step / self.inv_gamma)**-self.power
+
+        if step <= 0:
+            return 0.0
+
+        return max(self.min_value, min(value, self.max_value))
+
+    @torch.no_grad()
+    def step(self, new_model):
+        self.decay = self.get_decay(self.optimization_step)
+
+        # old_all_dataptrs = set()
+        # for param in new_model.parameters():
+        #     data_ptr = param.data_ptr()
+        #     if data_ptr != 0:
+        #         old_all_dataptrs.add(data_ptr)
+
+        all_dataptrs = set()
+        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
+            for param, ema_param in zip(module.parameters(recurse=False), ema_module.parameters(recurse=False)):
+                # iterate over immediate parameters only.
+                if isinstance(param, dict):
+                    raise RuntimeError("Dict parameter not supported")
+
+                # data_ptr = param.data_ptr()
+                # if data_ptr != 0:
+                #     all_dataptrs.add(data_ptr)
+
+                if isinstance(module, _BatchNorm):
+                    # skip batchnorms
+                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
+                elif not param.requires_grad:
+                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
+                else:
+                    ema_param.mul_(self.decay)
+                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
+
+        # verify that iterating over module and then parameters is identical to parameters recursively.
+        # assert old_all_dataptrs == all_dataptrs
+        self.optimization_step += 1
diff --git a/r2sVLA/algos/DP/diffusion_policy/model/diffusion/mask_generator.py b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/mask_generator.py
new file mode 100644
index 0000000..8c14931
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/mask_generator.py
@@ -0,0 +1,225 @@
+from typing import Sequence, Optional
+import torch
+from torch import nn
+from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
+
+
+def get_intersection_slice_mask(shape: tuple, dim_slices: Sequence[slice], device: Optional[torch.device] = None):
+    assert len(shape) == len(dim_slices)
+    mask = torch.zeros(size=shape, dtype=torch.bool, device=device)
+    # index with a tuple of slices (indexing with a list of slices is deprecated in recent PyTorch)
+    mask[tuple(dim_slices)] = True
+    return mask
+
+
+def get_union_slice_mask(shape: tuple, dim_slices: Sequence[slice], device: Optional[torch.device] = None):
+    assert len(shape) == len(dim_slices)
+    mask = torch.zeros(size=shape, dtype=torch.bool, device=device)
+    for i in range(len(dim_slices)):
+        this_slices = [slice(None)] * len(shape)
+        this_slices[i] = dim_slices[i]
+        mask[tuple(this_slices)] = True
+    return mask
+
+
+class DummyMaskGenerator(ModuleAttrMixin):
+
+    def __init__(self):
+        super().__init__()
+
+    @torch.no_grad()
+    def forward(self, shape):
+        device = self.device
+        mask = torch.ones(size=shape, dtype=torch.bool, device=device)
+        return mask
+
+
+class LowdimMaskGenerator(ModuleAttrMixin):
+
+    def __init__(
+        self,
+        action_dim,
+        obs_dim,
+        # obs mask setup
+        max_n_obs_steps=2,
+        fix_obs_steps=True,
+        # action mask
+        action_visible=False,
+    ):
+        super().__init__()
+        self.action_dim = action_dim
+        self.obs_dim = obs_dim
+        self.max_n_obs_steps = max_n_obs_steps
+        self.fix_obs_steps = fix_obs_steps
+        self.action_visible = action_visible
+
+    @torch.no_grad()
+    def forward(self, shape, seed=None):
+        device = self.device
+        B, T, D = shape
+        assert D == (self.action_dim + self.obs_dim)
+
+        # create all tensors on this device
+        rng = torch.Generator(device=device)
+        if seed is not None:
+            rng = rng.manual_seed(seed)
+
+        # generate dim mask
+        dim_mask = torch.zeros(size=shape, dtype=torch.bool, device=device)
+        is_action_dim = dim_mask.clone()
+        is_action_dim[..., :self.action_dim] = True
+        is_obs_dim = ~is_action_dim
+
+        # generate obs mask
+        if self.fix_obs_steps:
+            obs_steps = torch.full((B, ), fill_value=self.max_n_obs_steps, device=device)
+        else:
+            obs_steps = torch.randint(
+                low=1,
+                high=self.max_n_obs_steps + 1,
+                size=(B, ),
+                generator=rng,
+                device=device,
+            )
+
+        steps = torch.arange(0, T, device=device).reshape(1, T).expand(B, T)
+        obs_mask = (steps.T < obs_steps).T.reshape(B, T, 1).expand(B, T, D)
+        obs_mask = obs_mask & is_obs_dim
+
+        # generate action mask
+        if self.action_visible:
+            action_steps = torch.maximum(
+                obs_steps - 1,
+                torch.tensor(0, dtype=obs_steps.dtype, device=obs_steps.device),
+            )
+            action_mask = (steps.T < action_steps).T.reshape(B, T, 1).expand(B, T, D)
+            action_mask = action_mask & is_action_dim
+
+        mask = obs_mask
+        if self.action_visible:
+            mask = mask | action_mask
+
+        return mask
+
+
+class KeypointMaskGenerator(ModuleAttrMixin):
+
+    def __init__(
+        self,
+        # dimensions
+        action_dim,
+        keypoint_dim,
+        # obs mask setup
+        max_n_obs_steps=2,
+        fix_obs_steps=True,
+        # keypoint mask setup
+        keypoint_visible_rate=0.7,
+        time_independent=False,
+        # action mask
+        action_visible=False,
+        context_dim=0,  # dim for context
+        n_context_steps=1,
+    ):
+        super().__init__()
+        self.action_dim = action_dim
+        self.keypoint_dim = 
keypoint_dim + self.context_dim = context_dim + self.max_n_obs_steps = max_n_obs_steps + self.fix_obs_steps = fix_obs_steps + self.keypoint_visible_rate = keypoint_visible_rate + self.time_independent = time_independent + self.action_visible = action_visible + self.n_context_steps = n_context_steps + + @torch.no_grad() + def forward(self, shape, seed=None): + device = self.device + B, T, D = shape + all_keypoint_dims = D - self.action_dim - self.context_dim + n_keypoints = all_keypoint_dims // self.keypoint_dim + + # create all tensors on this device + rng = torch.Generator(device=device) + if seed is not None: + rng = rng.manual_seed(seed) + + # generate dim mask + dim_mask = torch.zeros(size=shape, dtype=torch.bool, device=device) + is_action_dim = dim_mask.clone() + is_action_dim[..., :self.action_dim] = True + is_context_dim = dim_mask.clone() + if self.context_dim > 0: + is_context_dim[..., -self.context_dim:] = True + is_obs_dim = ~(is_action_dim | is_context_dim) + # assumption trajectory=cat([action, keypoints, context], dim=-1) + + # generate obs mask + if self.fix_obs_steps: + obs_steps = torch.full((B, ), fill_value=self.max_n_obs_steps, device=device) + else: + obs_steps = torch.randint( + low=1, + high=self.max_n_obs_steps + 1, + size=(B, ), + generator=rng, + device=device, + ) + + steps = torch.arange(0, T, device=device).reshape(1, T).expand(B, T) + obs_mask = (steps.T < obs_steps).T.reshape(B, T, 1).expand(B, T, D) + obs_mask = obs_mask & is_obs_dim + + # generate action mask + if self.action_visible: + action_steps = torch.maximum( + obs_steps - 1, + torch.tensor(0, dtype=obs_steps.dtype, device=obs_steps.device), + ) + action_mask = (steps.T < action_steps).T.reshape(B, T, 1).expand(B, T, D) + action_mask = action_mask & is_action_dim + + # generate keypoint mask + if self.time_independent: + visible_kps = (torch.rand(size=(B, T, n_keypoints), generator=rng, device=device) + < self.keypoint_visible_rate) + visible_dims = torch.repeat_interleave(visible_kps, repeats=self.keypoint_dim, dim=-1) + visible_dims_mask = torch.cat( + [ + torch.ones((B, T, self.action_dim), dtype=torch.bool, device=device), + visible_dims, + torch.ones((B, T, self.context_dim), dtype=torch.bool, device=device), + ], + axis=-1, + ) + keypoint_mask = visible_dims_mask + else: + visible_kps = (torch.rand(size=(B, n_keypoints), generator=rng, device=device) < self.keypoint_visible_rate) + visible_dims = torch.repeat_interleave(visible_kps, repeats=self.keypoint_dim, dim=-1) + visible_dims_mask = torch.cat( + [ + torch.ones((B, self.action_dim), dtype=torch.bool, device=device), + visible_dims, + torch.ones((B, self.context_dim), dtype=torch.bool, device=device), + ], + axis=-1, + ) + keypoint_mask = visible_dims_mask.reshape(B, 1, D).expand(B, T, D) + keypoint_mask = keypoint_mask & is_obs_dim + + # generate context mask + context_mask = is_context_dim.clone() + context_mask[:, self.n_context_steps:, :] = False + + mask = obs_mask & keypoint_mask + if self.action_visible: + mask = mask | action_mask + if self.context_dim > 0: + mask = mask | context_mask + + return mask + + +def test(): + # kmg = KeypointMaskGenerator(2,2, random_obs_steps=True) + # self = KeypointMaskGenerator(2,2,context_dim=2, action_visible=True) + # self = KeypointMaskGenerator(2,2,context_dim=0, action_visible=True) + self = LowdimMaskGenerator(2, 20, max_n_obs_steps=3, action_visible=True) diff --git a/r2sVLA/algos/DP/diffusion_policy/model/diffusion/positional_embedding.py 
b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/positional_embedding.py new file mode 100644 index 0000000..1b1d646 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/positional_embedding.py @@ -0,0 +1,19 @@ +import math +import torch +import torch.nn as nn + + +class SinusoidalPosEmb(nn.Module): + + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=device) * -emb) + emb = x[:, None] * emb[None, :] + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb diff --git a/r2sVLA/algos/DP/diffusion_policy/model/diffusion/transformer_for_diffusion.py b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/transformer_for_diffusion.py new file mode 100644 index 0000000..7b41071 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/model/diffusion/transformer_for_diffusion.py @@ -0,0 +1,391 @@ +from typing import Union, Optional, Tuple +import logging +import torch +import torch.nn as nn +from diffusion_policy.model.diffusion.positional_embedding import SinusoidalPosEmb +from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin + +logger = logging.getLogger(__name__) + + +class TransformerForDiffusion(ModuleAttrMixin): + + def __init__( + self, + input_dim: int, + output_dim: int, + horizon: int, + n_obs_steps: int = None, + cond_dim: int = 0, + n_layer: int = 12, + n_head: int = 12, + n_emb: int = 768, + p_drop_emb: float = 0.1, + p_drop_attn: float = 0.1, + causal_attn: bool = False, + time_as_cond: bool = True, + obs_as_cond: bool = False, + n_cond_layers: int = 0, + ) -> None: + super().__init__() + + # compute number of tokens for main trunk and condition encoder + if n_obs_steps is None: + n_obs_steps = horizon + + T = horizon + T_cond = 1 + if not time_as_cond: + T += 1 + T_cond -= 1 + obs_as_cond = cond_dim > 0 + if obs_as_cond: + assert time_as_cond + T_cond += n_obs_steps + + # input embedding stem + self.input_emb = nn.Linear(input_dim, n_emb) + self.pos_emb = nn.Parameter(torch.zeros(1, T, n_emb)) + self.drop = nn.Dropout(p_drop_emb) + + # cond encoder + self.time_emb = SinusoidalPosEmb(n_emb) + self.cond_obs_emb = None + + if obs_as_cond: + self.cond_obs_emb = nn.Linear(cond_dim, n_emb) + + self.cond_pos_emb = None + self.encoder = None + self.decoder = None + encoder_only = False + if T_cond > 0: + self.cond_pos_emb = nn.Parameter(torch.zeros(1, T_cond, n_emb)) + if n_cond_layers > 0: + encoder_layer = nn.TransformerEncoderLayer( + d_model=n_emb, + nhead=n_head, + dim_feedforward=4 * n_emb, + dropout=p_drop_attn, + activation="gelu", + batch_first=True, + norm_first=True, + ) + self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=n_cond_layers) + else: + self.encoder = nn.Sequential(nn.Linear(n_emb, 4 * n_emb), nn.Mish(), nn.Linear(4 * n_emb, n_emb)) + # decoder + decoder_layer = nn.TransformerDecoderLayer( + d_model=n_emb, + nhead=n_head, + dim_feedforward=4 * n_emb, + dropout=p_drop_attn, + activation="gelu", + batch_first=True, + norm_first=True, # important for stability + ) + self.decoder = nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=n_layer) + else: + # encoder only BERT + encoder_only = True + + encoder_layer = nn.TransformerEncoderLayer( + d_model=n_emb, + nhead=n_head, + dim_feedforward=4 * n_emb, + dropout=p_drop_attn, + activation="gelu", + batch_first=True, + norm_first=True, + ) + self.encoder = 
nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=n_layer) + + # attention mask + if causal_attn: + # causal mask to ensure that attention is only applied to the left in the input sequence + # torch.nn.Transformer uses additive mask as opposed to multiplicative mask in minGPT + # therefore, the upper triangle should be -inf and others (including diag) should be 0. + sz = T + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = (mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))) + self.register_buffer("mask", mask) + + if time_as_cond and obs_as_cond: + S = T_cond + t, s = torch.meshgrid(torch.arange(T), torch.arange(S), indexing="ij") + mask = t >= (s - 1) # add one dimension since time is the first token in cond + mask = (mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))) + self.register_buffer("memory_mask", mask) + else: + self.memory_mask = None + else: + self.mask = None + self.memory_mask = None + + # decoder head + self.ln_f = nn.LayerNorm(n_emb) + self.head = nn.Linear(n_emb, output_dim) + + # constants + self.T = T + self.T_cond = T_cond + self.horizon = horizon + self.time_as_cond = time_as_cond + self.obs_as_cond = obs_as_cond + self.encoder_only = encoder_only + + # init + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + ignore_types = ( + nn.Dropout, + SinusoidalPosEmb, + nn.TransformerEncoderLayer, + nn.TransformerDecoderLayer, + nn.TransformerEncoder, + nn.TransformerDecoder, + nn.ModuleList, + nn.Mish, + nn.Sequential, + ) + if isinstance(module, (nn.Linear, nn.Embedding)): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.MultiheadAttention): + weight_names = [ + "in_proj_weight", + "q_proj_weight", + "k_proj_weight", + "v_proj_weight", + ] + for name in weight_names: + weight = getattr(module, name) + if weight is not None: + torch.nn.init.normal_(weight, mean=0.0, std=0.02) + + bias_names = ["in_proj_bias", "bias_k", "bias_v"] + for name in bias_names: + bias = getattr(module, name) + if bias is not None: + torch.nn.init.zeros_(bias) + elif isinstance(module, nn.LayerNorm): + torch.nn.init.zeros_(module.bias) + torch.nn.init.ones_(module.weight) + elif isinstance(module, TransformerForDiffusion): + torch.nn.init.normal_(module.pos_emb, mean=0.0, std=0.02) + if module.cond_obs_emb is not None: + torch.nn.init.normal_(module.cond_pos_emb, mean=0.0, std=0.02) + elif isinstance(module, ignore_types): + # no param + pass + else: + raise RuntimeError("Unaccounted module {}".format(module)) + + def get_optim_groups(self, weight_decay: float = 1e-3): + """ + This long function is unfortunately doing something very simple and is being very defensive: + We are separating out all parameters of the model into two buckets: those that will experience + weight decay for regularization and those that won't (biases, and layernorm/embedding weights). + We are then returning the PyTorch optimizer object. 
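+        (strictly speaking, this method returns the parameter groups; configure_optimizers
+        below wraps them in torch.optim.AdamW)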
+ """ + + # separate out all parameters to those that will and won't experience regularizing weight decay + decay = set() + no_decay = set() + whitelist_weight_modules = (torch.nn.Linear, torch.nn.MultiheadAttention) + blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding) + for mn, m in self.named_modules(): + for pn, p in m.named_parameters(): + fpn = "%s.%s" % (mn, pn) if mn else pn # full param name + + if pn.endswith("bias"): + # all biases will not be decayed + no_decay.add(fpn) + elif pn.startswith("bias"): + # MultiheadAttention bias starts with "bias" + no_decay.add(fpn) + elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules): + # weights of whitelist modules will be weight decayed + decay.add(fpn) + elif pn.endswith("weight") and isinstance(m, blacklist_weight_modules): + # weights of blacklist modules will NOT be weight decayed + no_decay.add(fpn) + + # special case the position embedding parameter in the root GPT module as not decayed + no_decay.add("pos_emb") + no_decay.add("_dummy_variable") + if self.cond_pos_emb is not None: + no_decay.add("cond_pos_emb") + + # validate that we considered every parameter + param_dict = {pn: p for pn, p in self.named_parameters()} + inter_params = decay & no_decay + union_params = decay | no_decay + assert (len(inter_params) == 0), "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) + assert (len(param_dict.keys() - + union_params) == 0), "parameters %s were not separated into either decay/no_decay set!" % ( + str(param_dict.keys() - union_params), ) + + # create the pytorch optimizer object + optim_groups = [ + { + "params": [param_dict[pn] for pn in sorted(list(decay))], + "weight_decay": weight_decay, + }, + { + "params": [param_dict[pn] for pn in sorted(list(no_decay))], + "weight_decay": 0.0, + }, + ] + return optim_groups + + def configure_optimizers( + self, + learning_rate: float = 1e-4, + weight_decay: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.95), + ): + optim_groups = self.get_optim_groups(weight_decay=weight_decay) + optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas) + return optimizer + + def forward(self, + sample: torch.Tensor, + timestep: Union[torch.Tensor, float, int], + cond: Optional[torch.Tensor] = None, + **kwargs): + """ + x: (B,T,input_dim) + timestep: (B,) or int, diffusion step + cond: (B,T',cond_dim) + output: (B,T,input_dim) + """ + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can + timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + time_emb = self.time_emb(timesteps).unsqueeze(1) + # (B,1,n_emb) + + # process input + input_emb = self.input_emb(sample) + + if self.encoder_only: + # BERT + token_embeddings = torch.cat([time_emb, input_emb], dim=1) + t = token_embeddings.shape[1] + position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector + x = self.drop(token_embeddings + position_embeddings) + # (B,T+1,n_emb) + x = self.encoder(src=x, mask=self.mask) + # (B,T+1,n_emb) + x = x[:, 1:, :] + # (B,T,n_emb) + else: + # encoder + cond_embeddings = time_emb + if self.obs_as_cond: + cond_obs_emb = self.cond_obs_emb(cond) + # (B,To,n_emb) + cond_embeddings = torch.cat([cond_embeddings, cond_obs_emb], dim=1) + tc = cond_embeddings.shape[1] + position_embeddings = self.cond_pos_emb[:, :tc, :] # each position maps to a (learnable) vector + x = self.drop(cond_embeddings + position_embeddings) + x = self.encoder(x) + memory = x + # (B,T_cond,n_emb) + + # decoder + token_embeddings = input_emb + t = token_embeddings.shape[1] + position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector + x = self.drop(token_embeddings + position_embeddings) + # (B,T,n_emb) + x = self.decoder(tgt=x, memory=memory, tgt_mask=self.mask, memory_mask=self.memory_mask) + # (B,T,n_emb) + + # head + x = self.ln_f(x) + x = self.head(x) + # (B,T,n_out) + return x + + +def test(): + # GPT with time embedding + transformer = TransformerForDiffusion( + input_dim=16, + output_dim=16, + horizon=8, + n_obs_steps=4, + # cond_dim=10, + causal_attn=True, + # time_as_cond=False, + # n_cond_layers=4 + ) + opt = transformer.configure_optimizers() + + timestep = torch.tensor(0) + sample = torch.zeros((4, 8, 16)) + out = transformer(sample, timestep) + + # GPT with time embedding and obs cond + transformer = TransformerForDiffusion( + input_dim=16, + output_dim=16, + horizon=8, + n_obs_steps=4, + cond_dim=10, + causal_attn=True, + # time_as_cond=False, + # n_cond_layers=4 + ) + opt = transformer.configure_optimizers() + + timestep = torch.tensor(0) + sample = torch.zeros((4, 8, 16)) + cond = torch.zeros((4, 4, 10)) + out = transformer(sample, timestep, cond) + + # GPT with time embedding and obs cond and encoder + transformer = TransformerForDiffusion( + input_dim=16, + output_dim=16, + horizon=8, + n_obs_steps=4, + cond_dim=10, + causal_attn=True, + # time_as_cond=False, + n_cond_layers=4, + ) + opt = transformer.configure_optimizers() + + timestep = torch.tensor(0) + sample = torch.zeros((4, 8, 16)) + cond = torch.zeros((4, 4, 10)) + out = transformer(sample, timestep, cond) + + # BERT with time embedding token + transformer = TransformerForDiffusion( + input_dim=16, + output_dim=16, + horizon=8, + n_obs_steps=4, + # cond_dim=10, + # causal_attn=True, + time_as_cond=False, + # n_cond_layers=4 + ) + opt = transformer.configure_optimizers() + + timestep = torch.tensor(0) + sample = torch.zeros((4, 8, 16)) + out = transformer(sample, timestep) diff --git a/r2sVLA/algos/DP/diffusion_policy/model/vision/crop_randomizer.py b/r2sVLA/algos/DP/diffusion_policy/model/vision/crop_randomizer.py new file mode 100644 index 0000000..7124fce --- /dev/null +++ 
b/r2sVLA/algos/DP/diffusion_policy/model/vision/crop_randomizer.py @@ -0,0 +1,298 @@ +import torch +import torch.nn as nn +import torchvision.transforms.functional as ttf +import diffusion_policy.model.common.tensor_util as tu + + +class CropRandomizer(nn.Module): + """ + Randomly sample crops at input, and then average across crop features at output. + """ + + def __init__( + self, + input_shape, + crop_height, + crop_width, + num_crops=1, + pos_enc=False, + ): + """ + Args: + input_shape (tuple, list): shape of input (not including batch dimension) + crop_height (int): crop height + crop_width (int): crop width + num_crops (int): number of random crops to take + pos_enc (bool): if True, add 2 channels to the output to encode the spatial + location of the cropped pixels in the source image + """ + super().__init__() + + assert len(input_shape) == 3 # (C, H, W) + assert crop_height < input_shape[1] + assert crop_width < input_shape[2] + + self.input_shape = input_shape + self.crop_height = crop_height + self.crop_width = crop_width + self.num_crops = num_crops + self.pos_enc = pos_enc + + def output_shape_in(self, input_shape=None): + """ + Function to compute output shape from inputs to this module. Corresponds to + the @forward_in operation, where raw inputs (usually observation modalities) + are passed in. + + Args: + input_shape (iterable of int): shape of input. Does not include batch dimension. + Some modules may not need this argument, if their output does not depend + on the size of the input, or if they assume fixed size input. + + Returns: + out_shape ([int]): list of integers corresponding to output shape + """ + + # outputs are shape (C, CH, CW), or maybe C + 2 if using position encoding, because + # the number of crops are reshaped into the batch dimension, increasing the batch + # size from B to B * N + out_c = self.input_shape[0] + 2 if self.pos_enc else self.input_shape[0] + return [out_c, self.crop_height, self.crop_width] + + def output_shape_out(self, input_shape=None): + """ + Function to compute output shape from inputs to this module. Corresponds to + the @forward_out operation, where processed inputs (usually encoded observation + modalities) are passed in. + + Args: + input_shape (iterable of int): shape of input. Does not include batch dimension. + Some modules may not need this argument, if their output does not depend + on the size of the input, or if they assume fixed size input. + + Returns: + out_shape ([int]): list of integers corresponding to output shape + """ + + # since the forward_out operation splits [B * N, ...] -> [B, N, ...] + # and then pools to result in [B, ...], only the batch dimension changes, + # and so the other dimensions retain their shape. + return list(input_shape) + + def forward_in(self, inputs): + """ + Samples N random crops for each input in the batch, and then reshapes + inputs to [B * N, ...]. + """ + assert len(inputs.shape) >= 3 # must have at least (C, H, W) dimensions + if self.training: + # generate random crops + out, _ = sample_random_image_crops( + images=inputs, + crop_height=self.crop_height, + crop_width=self.crop_width, + num_crops=self.num_crops, + pos_enc=self.pos_enc, + ) + # [B, N, ...] -> [B * N, ...] + return tu.join_dimensions(out, 0, 1) + else: + # take center crop during eval + out = ttf.center_crop(img=inputs, output_size=(self.crop_height, self.crop_width)) + if self.num_crops > 1: + B, C, H, W = out.shape + out = (out.unsqueeze(1).expand(B, self.num_crops, C, H, W).reshape(-1, C, H, W)) + # [B * N, ...] 
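+            # (the single center crop is tiled num_crops times so the eval batch layout
+            #  matches the training-time random crops)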
+        return out
+
+    def forward_out(self, inputs):
+        """
+        Splits the outputs from shape [B * N, ...] -> [B, N, ...] and then average across N
+        to result in shape [B, ...] to make sure the network output is consistent with
+        what would have happened if there were no randomization.
+        """
+        if self.num_crops <= 1:
+            return inputs
+        else:
+            batch_size = inputs.shape[0] // self.num_crops
+            out = tu.reshape_dimensions(
+                inputs,
+                begin_axis=0,
+                end_axis=0,
+                target_dims=(batch_size, self.num_crops),
+            )
+            return out.mean(dim=1)
+
+    def forward(self, inputs):
+        return self.forward_in(inputs)
+
+    def __repr__(self):
+        """Pretty print network."""
+        header = "{}".format(str(self.__class__.__name__))
+        msg = header + "(input_shape={}, crop_size=[{}, {}], num_crops={})".format(self.input_shape, self.crop_height,
+                                                                                   self.crop_width, self.num_crops)
+        return msg
+
+
+def crop_image_from_indices(images, crop_indices, crop_height, crop_width):
+    """
+    Crops images at the locations specified by @crop_indices. Crops will be
+    taken across all channels.
+
+    Args:
+        images (torch.Tensor): batch of images of shape [..., C, H, W]
+
+        crop_indices (torch.Tensor): batch of indices of shape [..., N, 2] where
+            N is the number of crops to take per image and each entry corresponds
+            to the pixel height and width of where to take the crop. Note that
+            the indices can also be of shape [..., 2] if only 1 crop should
+            be taken per image. Leading dimensions must be consistent with
+            @images argument. Each index specifies the top left of the crop.
+            Values must be in range [0, H - CH - 1] x [0, W - CW - 1] where
+            H and W are the height and width of @images and CH and CW are
+            @crop_height and @crop_width.
+
+        crop_height (int): height of crop to take
+
+        crop_width (int): width of crop to take
+
+    Returns:
+        crops (torch.Tensor): cropped images of shape [..., C, @crop_height, @crop_width]
+    """
+
+    # make sure length of input shapes is consistent
+    assert crop_indices.shape[-1] == 2
+    ndim_im_shape = len(images.shape)
+    ndim_indices_shape = len(crop_indices.shape)
+    assert (ndim_im_shape == ndim_indices_shape + 1) or (ndim_im_shape == ndim_indices_shape + 2)
+
+    # maybe pad so that @crop_indices is shape [..., N, 2]
+    is_padded = False
+    if ndim_im_shape == ndim_indices_shape + 2:
+        crop_indices = crop_indices.unsqueeze(-2)
+        is_padded = True
+
+    # make sure leading dimensions between images and indices are consistent
+    assert images.shape[:-3] == crop_indices.shape[:-2]
+
+    device = images.device
+    image_c, image_h, image_w = images.shape[-3:]
+    num_crops = crop_indices.shape[-2]
+
+    # make sure @crop_indices are in valid range
+    assert (crop_indices[..., 0] >= 0).all().item()
+    assert (crop_indices[..., 0] < (image_h - crop_height)).all().item()
+    assert (crop_indices[..., 1] >= 0).all().item()
+    assert (crop_indices[..., 1] < (image_w - crop_width)).all().item()
+
+    # convert each crop index (ch, cw) into a list of pixel indices that correspond to the entire window.
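+    # (illustrative example: with crop_height=2 and crop_width=2, a crop index of (3, 5)
+    #  expands to the 2x2 pixel window covering rows {3, 4} and columns {5, 6})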
+ + # 2D index array with columns [0, 1, ..., CH - 1] and shape [CH, CW] + crop_ind_grid_h = torch.arange(crop_height).to(device) + crop_ind_grid_h = tu.unsqueeze_expand_at(crop_ind_grid_h, size=crop_width, dim=-1) + # 2D index array with rows [0, 1, ..., CW - 1] and shape [CH, CW] + crop_ind_grid_w = torch.arange(crop_width).to(device) + crop_ind_grid_w = tu.unsqueeze_expand_at(crop_ind_grid_w, size=crop_height, dim=0) + # combine into shape [CH, CW, 2] + crop_in_grid = torch.cat((crop_ind_grid_h.unsqueeze(-1), crop_ind_grid_w.unsqueeze(-1)), dim=-1) + + # Add above grid with the offset index of each sampled crop to get 2d indices for each crop. + # After broadcasting, this will be shape [..., N, CH, CW, 2] and each crop has a [CH, CW, 2] + # shape array that tells us which pixels from the corresponding source image to grab. + grid_reshape = [1] * len(crop_indices.shape[:-1]) + [crop_height, crop_width, 2] + all_crop_inds = crop_indices.unsqueeze(-2).unsqueeze(-2) + crop_in_grid.reshape(grid_reshape) + + # For using @torch.gather, convert to flat indices from 2D indices, and also + # repeat across the channel dimension. To get flat index of each pixel to grab for + # each sampled crop, we just use the mapping: ind = h_ind * @image_w + w_ind + all_crop_inds = (all_crop_inds[..., 0] * image_w + all_crop_inds[..., 1]) # shape [..., N, CH, CW] + all_crop_inds = tu.unsqueeze_expand_at(all_crop_inds, size=image_c, dim=-3) # shape [..., N, C, CH, CW] + all_crop_inds = tu.flatten(all_crop_inds, begin_axis=-2) # shape [..., N, C, CH * CW] + + # Repeat and flatten the source images -> [..., N, C, H * W] and then use gather to index with crop pixel inds + images_to_crop = tu.unsqueeze_expand_at(images, size=num_crops, dim=-4) + images_to_crop = tu.flatten(images_to_crop, begin_axis=-2) + crops = torch.gather(images_to_crop, dim=-1, index=all_crop_inds) + # [..., N, C, CH * CW] -> [..., N, C, CH, CW] + reshape_axis = len(crops.shape) - 1 + crops = tu.reshape_dimensions( + crops, + begin_axis=reshape_axis, + end_axis=reshape_axis, + target_dims=(crop_height, crop_width), + ) + + if is_padded: + # undo padding -> [..., C, CH, CW] + crops = crops.squeeze(-4) + return crops + + +def sample_random_image_crops(images, crop_height, crop_width, num_crops, pos_enc=False): + """ + For each image, randomly sample @num_crops crops of size (@crop_height, @crop_width), from + @images. + + Args: + images (torch.Tensor): batch of images of shape [..., C, H, W] + + crop_height (int): height of crop to take + + crop_width (int): width of crop to take + + num_crops (n): number of crops to sample + + pos_enc (bool): if True, also add 2 channels to the outputs that gives a spatial + encoding of the original source pixel locations. This means that the + output crops will contain information about where in the source image + it was sampled from. 
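+            (concretely, the two extra channels hold each source pixel's row and column
+            coordinates normalized to [0, 1]; see the position encoding construction below)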
+
+    Returns:
+        crops (torch.Tensor): crops of shape (..., @num_crops, C, @crop_height, @crop_width)
+            if @pos_enc is False, otherwise (..., @num_crops, C + 2, @crop_height, @crop_width)
+
+        crop_inds (torch.Tensor): sampled crop indices of shape (..., N, 2)
+    """
+    device = images.device
+
+    # maybe add 2 channels of spatial encoding to the source image
+    source_im = images
+    if pos_enc:
+        # spatial encoding [y, x] in [0, 1]
+        h, w = source_im.shape[-2:]
+        pos_y, pos_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+        pos_y = pos_y.float().to(device) / float(h)
+        pos_x = pos_x.float().to(device) / float(w)
+        position_enc = torch.stack((pos_y, pos_x))  # shape [C, H, W]
+
+        # unsqueeze and expand to match leading dimensions -> shape [..., C, H, W]
+        leading_shape = source_im.shape[:-3]
+        position_enc = position_enc[(None, ) * len(leading_shape)]
+        position_enc = position_enc.expand(*leading_shape, -1, -1, -1)
+
+        # concat across channel dimension with input
+        source_im = torch.cat((source_im, position_enc), dim=-3)
+
+    # make sure sample boundaries ensure crops are fully within the images
+    image_c, image_h, image_w = source_im.shape[-3:]
+    max_sample_h = image_h - crop_height
+    max_sample_w = image_w - crop_width
+
+    # Sample crop locations for all tensor dimensions up to the last 3, which are [C, H, W].
+    # Each gets @num_crops samples - typically this will just be the batch dimension (B), so
+    # we will sample [B, N] indices, but this supports having more than one leading dimension,
+    # or possibly no leading dimension.
+    #
+    # Trick: sample in [0, 1) with rand, then re-scale to [0, M) and convert to long to get sampled ints
+    crop_inds_h = (max_sample_h * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
+    crop_inds_w = (max_sample_w * torch.rand(*source_im.shape[:-3], num_crops).to(device)).long()
+    crop_inds = torch.cat((crop_inds_h.unsqueeze(-1), crop_inds_w.unsqueeze(-1)), dim=-1)  # shape [..., N, 2]
+
+    crops = crop_image_from_indices(
+        images=source_im,
+        crop_indices=crop_inds,
+        crop_height=crop_height,
+        crop_width=crop_width,
+    )
+
+    return crops, crop_inds
diff --git a/r2sVLA/algos/DP/diffusion_policy/model/vision/model_getter.py b/r2sVLA/algos/DP/diffusion_policy/model/vision/model_getter.py
new file mode 100644
index 0000000..6997242
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/vision/model_getter.py
@@ -0,0 +1,36 @@
+import torch
+import torchvision
+
+
+def get_resnet(name, weights=None, **kwargs):
+    """
+    name: resnet18, resnet34, resnet50
+    weights: "IMAGENET1K_V1", "r3m"
+    """
+    # load r3m weights
+    if (weights == "r3m") or (weights == "R3M"):
+        return get_r3m(name=name, **kwargs)
+
+    func = getattr(torchvision.models, name)
+    resnet = func(weights=weights, **kwargs)
+    resnet.fc = torch.nn.Identity()
+    # resnet_new = torch.nn.Sequential(
+    #     resnet,
+    #     torch.nn.Linear(512, 128)
+    # )
+    # return resnet_new
+    return resnet
+
+
+def get_r3m(name, **kwargs):
+    """
+    name: resnet18, resnet34, resnet50
+    """
+    import r3m
+
+    r3m.device = "cpu"
+    model = r3m.load_r3m(name)
+    r3m_model = model.module
+    resnet_model = r3m_model.convnet
+    resnet_model = resnet_model.to("cpu")
+    return resnet_model
diff --git a/r2sVLA/algos/DP/diffusion_policy/model/vision/multi_image_obs_encoder.py b/r2sVLA/algos/DP/diffusion_policy/model/vision/multi_image_obs_encoder.py
new file mode 100644
index 0000000..c7e77ac
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/model/vision/multi_image_obs_encoder.py
@@ -0,0 +1,191 @@
+from typing import Dict, Tuple, Union
+import copy
+import torch
+import torch.nn as nn
+import torchvision
+from diffusion_policy.model.vision.crop_randomizer import CropRandomizer
+from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin
+from diffusion_policy.common.pytorch_util import dict_apply, replace_submodules
+
+
+class MultiImageObsEncoder(ModuleAttrMixin):
+
+    def __init__(
+        self,
+        shape_meta: dict,
+        rgb_model: Union[nn.Module, Dict[str, nn.Module]],
+        resize_shape: Union[Tuple[int, int], Dict[str, tuple], None] = None,
+        crop_shape: Union[Tuple[int, int], Dict[str, tuple], None] = None,
+        random_crop: bool = True,
+        # replace BatchNorm with GroupNorm
+        use_group_norm: bool = False,
+        # use single rgb model for all rgb inputs
+        share_rgb_model: bool = False,
+        # renormalize rgb input with imagenet normalization
+        # assuming input in [0,1]
+        imagenet_norm: bool = False,
+    ):
+        """
+        Assumes rgb input: B,C,H,W
+        Assumes low_dim input: B,D
+        """
+        super().__init__()
+
+        rgb_keys = list()
+        low_dim_keys = list()
+        key_model_map = nn.ModuleDict()
+        key_transform_map = nn.ModuleDict()
+        key_shape_map = dict()
+
+        # handle sharing vision backbone
+        if share_rgb_model:
+            assert isinstance(rgb_model, nn.Module)
+            key_model_map["rgb"] = rgb_model
+
+        obs_shape_meta = shape_meta["obs"]
+        for key, attr in obs_shape_meta.items():
+            shape = tuple(attr["shape"])
+            type = attr.get("type", "low_dim")
+            key_shape_map[key] = shape
+            if type == "rgb":
+                rgb_keys.append(key)
+                # configure model for this key
+                this_model = None
+                if not share_rgb_model:
+                    if isinstance(rgb_model, dict):
+                        # have provided model for each key
+                        this_model = rgb_model[key]
+                    else:
+                        assert isinstance(rgb_model, nn.Module)
+                        # have a copy of the rgb model
+                        this_model = copy.deepcopy(rgb_model)
+
+                if this_model is not None:
+                    if use_group_norm:
+                        this_model = replace_submodules(
+                            root_module=this_model,
+                            predicate=lambda x: isinstance(x, nn.BatchNorm2d),
+                            func=lambda x: nn.GroupNorm(
+                                num_groups=x.num_features // 16,
+                                num_channels=x.num_features,
+                            ),
+                        )
+                    key_model_map[key] = this_model
+
+                # configure resize
+                input_shape = shape
+                this_resizer = nn.Identity()
+                if resize_shape is not None:
+                    if isinstance(resize_shape, dict):
+                        h, w = resize_shape[key]
+                    else:
+                        h, w = resize_shape
+                    this_resizer = torchvision.transforms.Resize(size=(h, w))
+                    input_shape = (shape[0], h, w)
+
+                # configure randomizer
+                this_randomizer = nn.Identity()
+                if crop_shape is not None:
+                    if isinstance(crop_shape, dict):
+                        h, w = crop_shape[key]
+                    else:
+                        h, w = crop_shape
+                    if random_crop:
+                        this_randomizer = CropRandomizer(
+                            input_shape=input_shape,
+                            crop_height=h,
+                            crop_width=w,
+                            num_crops=1,
+                            pos_enc=False,
+                        )
+                    else:
+                        this_randomizer = torchvision.transforms.CenterCrop(size=(h, w))
+                # configure normalizer
+                this_normalizer = nn.Identity()
+                if imagenet_norm:
+                    this_normalizer = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                                                       std=[0.229, 0.224, 0.225])
+
+                this_transform = nn.Sequential(this_resizer, this_randomizer, this_normalizer)
+                key_transform_map[key] = this_transform
+            elif type == "low_dim":
+                low_dim_keys.append(key)
+            else:
+                raise RuntimeError(f"Unsupported obs type: {type}")
+        rgb_keys = sorted(rgb_keys)
+        low_dim_keys = sorted(low_dim_keys)
+
+        self.shape_meta = shape_meta
+        self.key_model_map = key_model_map
+        self.key_transform_map = key_transform_map
+        self.share_rgb_model = share_rgb_model
+        self.rgb_keys = rgb_keys
+        self.low_dim_keys = low_dim_keys
+        self.key_shape_map = key_shape_map
+
+    def forward(self, 
obs_dict): + batch_size = None + features = list() + # process rgb input + if self.share_rgb_model: + # pass all rgb obs to rgb model + imgs = list() + for key in self.rgb_keys: + img = obs_dict[key] + if batch_size is None: + batch_size = img.shape[0] + else: + assert batch_size == img.shape[0] + assert img.shape[1:] == self.key_shape_map[key] + img = self.key_transform_map[key](img) + imgs.append(img) + # (N*B,C,H,W) + imgs = torch.cat(imgs, dim=0) + # (N*B,D) + feature = self.key_model_map["rgb"](imgs) + # (N,B,D) + feature = feature.reshape(-1, batch_size, *feature.shape[1:]) + # (B,N,D) + feature = torch.moveaxis(feature, 0, 1) + # (B,N*D) + feature = feature.reshape(batch_size, -1) + features.append(feature) + else: + # run each rgb obs to independent models + for key in self.rgb_keys: + img = obs_dict[key] + if batch_size is None: + batch_size = img.shape[0] + else: + assert batch_size == img.shape[0] + assert img.shape[1:] == self.key_shape_map[key] + img = self.key_transform_map[key](img) + feature = self.key_model_map[key](img) + features.append(feature) + + # process lowdim input + for key in self.low_dim_keys: + data = obs_dict[key] + if batch_size is None: + batch_size = data.shape[0] + else: + assert batch_size == data.shape[0] + assert data.shape[1:] == self.key_shape_map[key] + features.append(data) + + # concatenate all features + result = torch.cat(features, dim=-1) + return result + + @torch.no_grad() + def output_shape(self): + example_obs_dict = dict() + obs_shape_meta = self.shape_meta["obs"] + batch_size = 1 + for key, attr in obs_shape_meta.items(): + shape = tuple(attr["shape"]) + this_obs = torch.zeros((batch_size, ) + shape, dtype=self.dtype, device=self.device) + example_obs_dict[key] = this_obs + example_output = self.forward(example_obs_dict) + output_shape = example_output.shape[1:] + return output_shape diff --git a/r2sVLA/algos/DP/diffusion_policy/policy/base_image_policy.py b/r2sVLA/algos/DP/diffusion_policy/policy/base_image_policy.py new file mode 100644 index 0000000..3a926a3 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/policy/base_image_policy.py @@ -0,0 +1,26 @@ +from typing import Dict +import torch +import torch.nn as nn +from diffusion_policy.model.common.module_attr_mixin import ModuleAttrMixin +from diffusion_policy.model.common.normalizer import LinearNormalizer + + +class BaseImagePolicy(ModuleAttrMixin): + # init accepts keyword argument shape_meta, see config/task/*_image.yaml + + def predict_action(self, obs_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + obs_dict: + str: B,To,* + return: B,Ta,Da + """ + raise NotImplementedError() + + # reset state for stateful policies + def reset(self): + pass + + # ========== training =========== + # no standard training interface except setting normalizer + def set_normalizer(self, normalizer: LinearNormalizer): + raise NotImplementedError() diff --git a/r2sVLA/algos/DP/diffusion_policy/policy/diffusion_unet_image_policy.py b/r2sVLA/algos/DP/diffusion_policy/policy/diffusion_unet_image_policy.py new file mode 100644 index 0000000..c93c302 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/policy/diffusion_unet_image_policy.py @@ -0,0 +1,258 @@ +from typing import Dict +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, reduce +from diffusers.schedulers.scheduling_ddpm import DDPMScheduler + +from diffusion_policy.model.common.normalizer import LinearNormalizer +from diffusion_policy.policy.base_image_policy import 
BaseImagePolicy +from diffusion_policy.model.diffusion.conditional_unet1d import ConditionalUnet1D +from diffusion_policy.model.diffusion.mask_generator import LowdimMaskGenerator +from diffusion_policy.model.vision.multi_image_obs_encoder import MultiImageObsEncoder +from diffusion_policy.common.pytorch_util import dict_apply + + +class DiffusionUnetImagePolicy(BaseImagePolicy): + + def __init__( + self, + shape_meta: dict, + noise_scheduler: DDPMScheduler, + obs_encoder: MultiImageObsEncoder, + horizon, + n_action_steps, + n_obs_steps, + num_inference_steps=None, + obs_as_global_cond=True, + diffusion_step_embed_dim=256, + down_dims=(256, 512, 1024), + kernel_size=5, + n_groups=8, + cond_predict_scale=True, + # parameters passed to step + **kwargs, + ): + super().__init__() + + # parse shapes + action_shape = shape_meta["action"]["shape"] + assert len(action_shape) == 1 + action_dim = action_shape[0] + # get feature dim + obs_feature_dim = obs_encoder.output_shape()[0] + + # create diffusion model + input_dim = action_dim + obs_feature_dim + global_cond_dim = None + if obs_as_global_cond: + input_dim = action_dim + global_cond_dim = obs_feature_dim * n_obs_steps + + model = ConditionalUnet1D( + input_dim=input_dim, + local_cond_dim=None, + global_cond_dim=global_cond_dim, + diffusion_step_embed_dim=diffusion_step_embed_dim, + down_dims=down_dims, + kernel_size=kernel_size, + n_groups=n_groups, + cond_predict_scale=cond_predict_scale, + ) + + self.obs_encoder = obs_encoder + self.model = model + self.noise_scheduler = noise_scheduler + self.mask_generator = LowdimMaskGenerator( + action_dim=action_dim, + obs_dim=0 if obs_as_global_cond else obs_feature_dim, + max_n_obs_steps=n_obs_steps, + fix_obs_steps=True, + action_visible=False, + ) + self.normalizer = LinearNormalizer() + self.horizon = horizon + self.obs_feature_dim = obs_feature_dim + self.action_dim = action_dim + self.n_action_steps = n_action_steps + self.n_obs_steps = n_obs_steps + self.obs_as_global_cond = obs_as_global_cond + self.kwargs = kwargs + + if num_inference_steps is None: + num_inference_steps = noise_scheduler.config.num_train_timesteps + self.num_inference_steps = num_inference_steps + + # ========= inference ============ + def conditional_sample( + self, + condition_data, + condition_mask, + local_cond=None, + global_cond=None, + generator=None, + # keyword arguments to scheduler.step + **kwargs, + ): + model = self.model + scheduler = self.noise_scheduler + + trajectory = torch.randn( + size=condition_data.shape, + dtype=condition_data.dtype, + device=condition_data.device, + generator=generator, + ) + + # set step values + scheduler.set_timesteps(self.num_inference_steps) + + for t in scheduler.timesteps: + # 1. apply conditioning + trajectory[condition_mask] = condition_data[condition_mask] + + # 2. predict model output + model_output = model(trajectory, t, local_cond=local_cond, global_cond=global_cond) + + # 3. 
compute previous image: x_t -> x_t-1
+            trajectory = scheduler.step(model_output, t, trajectory, generator=generator, **kwargs).prev_sample
+
+        # finally make sure conditioning is enforced
+        trajectory[condition_mask] = condition_data[condition_mask]
+
+        return trajectory
+
+    def predict_action(self, obs_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        obs_dict: observation dict, each value of shape (B, To, ...)
+        result: must include "action" key
+        """
+        assert "past_action" not in obs_dict  # not implemented yet
+        # normalize input
+        nobs = self.normalizer.normalize(obs_dict)
+        value = next(iter(nobs.values()))
+        B, To = value.shape[:2]
+        T = self.horizon
+        Da = self.action_dim
+        Do = self.obs_feature_dim
+        To = self.n_obs_steps
+
+        # build input
+        device = self.device
+        dtype = self.dtype
+
+        # handle different ways of passing observation
+        local_cond = None
+        global_cond = None
+        if self.obs_as_global_cond:
+            # condition through global feature
+            this_nobs = dict_apply(nobs, lambda x: x[:, :To, ...].reshape(-1, *x.shape[2:]))
+            nobs_features = self.obs_encoder(this_nobs)
+            # reshape back to B, Do
+            global_cond = nobs_features.reshape(B, -1)
+            # empty data for action
+            cond_data = torch.zeros(size=(B, T, Da), device=device, dtype=dtype)
+            cond_mask = torch.zeros_like(cond_data, dtype=torch.bool)
+        else:
+            # condition through inpainting
+            this_nobs = dict_apply(nobs, lambda x: x[:, :To, ...].reshape(-1, *x.shape[2:]))
+            nobs_features = self.obs_encoder(this_nobs)
+            # reshape back to B, T, Do
+            nobs_features = nobs_features.reshape(B, To, -1)
+            cond_data = torch.zeros(size=(B, T, Da + Do), device=device, dtype=dtype)
+            cond_mask = torch.zeros_like(cond_data, dtype=torch.bool)
+            cond_data[:, :To, Da:] = nobs_features
+            cond_mask[:, :To, Da:] = True
+
+        # run sampling
+        nsample = self.conditional_sample(
+            cond_data,
+            cond_mask,
+            local_cond=local_cond,
+            global_cond=global_cond,
+            **self.kwargs,
+        )
+
+        # unnormalize prediction
+        naction_pred = nsample[..., :Da]
+        action_pred = self.normalizer["action"].unnormalize(naction_pred)
+
+        # get action
+        start = To - 1
+        end = start + self.n_action_steps
+        action = action_pred[:, start:end]
+
+        result = {"action": action, "action_pred": action_pred}
+        return result
+
+    # ========= training ============
+    def set_normalizer(self, normalizer: LinearNormalizer):
+        self.normalizer.load_state_dict(normalizer.state_dict())
+
+    def compute_loss(self, batch):
+        # normalize input
+        assert "valid_mask" not in batch
+        nobs = self.normalizer.normalize(batch["obs"])
+        nactions = self.normalizer["action"].normalize(batch["action"])
+        batch_size = nactions.shape[0]
+        horizon = nactions.shape[1]
+
+        # handle different ways of passing observation
+        local_cond = None
+        global_cond = None
+        trajectory = nactions
+        cond_data = trajectory
+        if self.obs_as_global_cond:
+            # reshape B, T, ... to B*T
+            this_nobs = dict_apply(nobs, lambda x: x[:, :self.n_obs_steps, ...].reshape(-1, *x.shape[2:]))
+            nobs_features = self.obs_encoder(this_nobs)
+            # reshape back to B, Do
+            global_cond = nobs_features.reshape(batch_size, -1)
+        else:
+            # reshape B, T, ... to B*T
+            this_nobs = dict_apply(nobs, lambda x: x.reshape(-1, *x.shape[2:]))
+            nobs_features = self.obs_encoder(this_nobs)
+            # reshape back to B, T, Do
+            nobs_features = nobs_features.reshape(batch_size, horizon, -1)
+            cond_data = torch.cat([nactions, nobs_features], dim=-1)
+            trajectory = cond_data.detach()
+
+        # generate inpainting mask
+        condition_mask = self.mask_generator(trajectory.shape)
+
+        # Sample noise that we'll add to the trajectories
+        noise = torch.randn(trajectory.shape, device=trajectory.device)
+        bsz = trajectory.shape[0]
+        # Sample a random timestep for each trajectory
+        timesteps = torch.randint(
+            0,
+            self.noise_scheduler.config.num_train_timesteps,
+            (bsz, ),
+            device=trajectory.device,
+        ).long()
+        # Add noise to the clean trajectories according to the noise magnitude at each timestep
+        # (this is the forward diffusion process)
+        noisy_trajectory = self.noise_scheduler.add_noise(trajectory, noise, timesteps)
+
+        # compute loss mask
+        loss_mask = ~condition_mask
+
+        # apply conditioning
+        noisy_trajectory[condition_mask] = cond_data[condition_mask]
+
+        # Predict the noise residual
+        pred = self.model(noisy_trajectory, timesteps, local_cond=local_cond, global_cond=global_cond)
+
+        pred_type = self.noise_scheduler.config.prediction_type
+        if pred_type == "epsilon":
+            target = noise
+        elif pred_type == "sample":
+            target = trajectory
+        else:
+            raise ValueError(f"Unsupported prediction type {pred_type}")
+
+        loss = F.mse_loss(pred, target, reduction="none")
+        loss = loss * loss_mask.type(loss.dtype)
+        loss = reduce(loss, "b ... -> b (...)", "mean")
+        loss = loss.mean()
+        return loss
diff --git a/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_queue.py b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_queue.py
new file mode 100644
index 0000000..39a099d
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_queue.py
@@ -0,0 +1,184 @@
+from typing import Dict, List, Union
+import numbers
+from queue import Empty, Full
+from multiprocessing.managers import SharedMemoryManager
+import numpy as np
+from diffusion_policy.shared_memory.shared_memory_util import (
+    ArraySpec,
+    SharedAtomicCounter,
+)
+from diffusion_policy.shared_memory.shared_ndarray import SharedNDArray
+
+
+class SharedMemoryQueue:
+    """
+    A Lock-Free FIFO Shared Memory Data Structure.
+    Stores a sequence of dict of numpy arrays.
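+
+    Minimal usage sketch (names and shapes here are illustrative only):
+        with SharedMemoryManager() as shm_manager:
+            queue = SharedMemoryQueue.create_from_examples(
+                shm_manager=shm_manager,
+                examples={"action": np.zeros(7, dtype=np.float64)},
+                buffer_size=256,
+            )
+            queue.put({"action": np.ones(7)})  # producer side
+            data = queue.get()                 # consumer side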
+ """ + + def __init__( + self, + shm_manager: SharedMemoryManager, + array_specs: List[ArraySpec], + buffer_size: int, + ): + + # create atomic counter + write_counter = SharedAtomicCounter(shm_manager) + read_counter = SharedAtomicCounter(shm_manager) + + # allocate shared memory + shared_arrays = dict() + for spec in array_specs: + key = spec.name + assert key not in shared_arrays + array = SharedNDArray.create_from_shape( + mem_mgr=shm_manager, + shape=(buffer_size, ) + tuple(spec.shape), + dtype=spec.dtype, + ) + shared_arrays[key] = array + + self.buffer_size = buffer_size + self.array_specs = array_specs + self.write_counter = write_counter + self.read_counter = read_counter + self.shared_arrays = shared_arrays + + @classmethod + def create_from_examples( + cls, + shm_manager: SharedMemoryManager, + examples: Dict[str, Union[np.ndarray, numbers.Number]], + buffer_size: int, + ): + specs = list() + for key, value in examples.items(): + shape = None + dtype = None + if isinstance(value, np.ndarray): + shape = value.shape + dtype = value.dtype + assert dtype != np.dtype("O") + elif isinstance(value, numbers.Number): + shape = tuple() + dtype = np.dtype(type(value)) + else: + raise TypeError(f"Unsupported type {type(value)}") + + spec = ArraySpec(name=key, shape=shape, dtype=dtype) + specs.append(spec) + + obj = cls(shm_manager=shm_manager, array_specs=specs, buffer_size=buffer_size) + return obj + + def qsize(self): + read_count = self.read_counter.load() + write_count = self.write_counter.load() + n_data = write_count - read_count + return n_data + + def empty(self): + n_data = self.qsize() + return n_data <= 0 + + def clear(self): + self.read_counter.store(self.write_counter.load()) + + def put(self, data: Dict[str, Union[np.ndarray, numbers.Number]]): + read_count = self.read_counter.load() + write_count = self.write_counter.load() + n_data = write_count - read_count + if n_data >= self.buffer_size: + raise Full() + + next_idx = write_count % self.buffer_size + + # write to shared memory + for key, value in data.items(): + arr: np.ndarray + arr = self.shared_arrays[key].get() + if isinstance(value, np.ndarray): + arr[next_idx] = value + else: + arr[next_idx] = np.array(value, dtype=arr.dtype) + + # update idx + self.write_counter.add(1) + + def get(self, out=None) -> Dict[str, np.ndarray]: + write_count = self.write_counter.load() + read_count = self.read_counter.load() + n_data = write_count - read_count + if n_data <= 0: + raise Empty() + + if out is None: + out = self._allocate_empty() + + next_idx = read_count % self.buffer_size + for key, value in self.shared_arrays.items(): + arr = value.get() + np.copyto(out[key], arr[next_idx]) + + # update idx + self.read_counter.add(1) + return out + + def get_k(self, k, out=None) -> Dict[str, np.ndarray]: + write_count = self.write_counter.load() + read_count = self.read_counter.load() + n_data = write_count - read_count + if n_data <= 0: + raise Empty() + assert k <= n_data + + out = self._get_k_impl(k, read_count, out=out) + self.read_counter.add(k) + return out + + def get_all(self, out=None) -> Dict[str, np.ndarray]: + write_count = self.write_counter.load() + read_count = self.read_counter.load() + n_data = write_count - read_count + if n_data <= 0: + raise Empty() + + out = self._get_k_impl(n_data, read_count, out=out) + self.read_counter.add(n_data) + return out + + def _get_k_impl(self, k, read_count, out=None) -> Dict[str, np.ndarray]: + if out is None: + out = self._allocate_empty(k) + + curr_idx = read_count % self.buffer_size 
+        for key, value in self.shared_arrays.items():
+            arr = value.get()
+            target = out[key]
+
+            start = curr_idx
+            end = min(start + k, self.buffer_size)
+            target_start = 0
+            target_end = end - start
+            target[target_start:target_end] = arr[start:end]
+
+            remainder = k - (end - start)
+            if remainder > 0:
+                # wrap around
+                start = 0
+                end = start + remainder
+                target_start = target_end
+                target_end = k
+                target[target_start:target_end] = arr[start:end]
+
+        return out
+
+    def _allocate_empty(self, k=None):
+        result = dict()
+        for spec in self.array_specs:
+            shape = spec.shape
+            if k is not None:
+                shape = (k, ) + shape
+            result[spec.name] = np.empty(shape=shape, dtype=spec.dtype)
+        return result
diff --git a/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_ring_buffer.py b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_ring_buffer.py
new file mode 100644
index 0000000..d918eb7
--- /dev/null
+++ b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_ring_buffer.py
@@ -0,0 +1,213 @@
+from typing import Dict, List, Union
+
+from queue import Empty
+import numbers
+import time
+from multiprocessing.managers import SharedMemoryManager
+import numpy as np
+
+from diffusion_policy.shared_memory.shared_ndarray import SharedNDArray
+from diffusion_policy.shared_memory.shared_memory_util import (
+    ArraySpec,
+    SharedAtomicCounter,
+)
+
+
+class SharedMemoryRingBuffer:
+    """
+    A Lock-Free FILO Shared Memory Data Structure.
+    Stores a sequence of dicts of numpy arrays.
+    """
+
+    def __init__(
+        self,
+        shm_manager: SharedMemoryManager,
+        array_specs: List[ArraySpec],
+        get_max_k: int,
+        get_time_budget: float,
+        put_desired_frequency: float,
+        safety_margin: float = 1.5,
+    ):
+        """
+        shm_manager: Manages the life cycle of shared memories
+            across processes. Remember to run .start() before passing.
+        array_specs: Name, shape and type of arrays for a single time step.
+        get_max_k: The maximum number of items that can be queried at once.
+        get_time_budget: The maximum amount of time spent copying data from
+            shared memory to local memory. Increase this number for larger arrays.
+        put_desired_frequency: The maximum frequency that .put() can be called.
+            This influences the buffer size.
+        """
+
+        # create atomic counter
+        counter = SharedAtomicCounter(shm_manager)
+
+        # compute buffer size
+        # At any given moment, the past get_max_k items should never
+        # be touched (so they can be read freely). Assuming a reader is copying
+        # these k items, which takes at most get_time_budget seconds,
+        # we need enough empty slots to make sure put_desired_frequency Hz
+        # of put can be sustained.
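A worked example of the sizing formula on the next line, using the defaults from create_from_examples below (60 Hz puts, a 10 ms read budget, a 1.5x safety margin, get_max_k = 32):

import numpy as np

# ceil(60 * 0.01 * 1.5) = 1 slot may be overwritten while a reader copies,
# plus get_max_k = 32 slots that must stay readable.
put_desired_frequency, get_time_budget, safety_margin, get_max_k = 60, 0.01, 1.5, 32
buffer_size = int(np.ceil(put_desired_frequency * get_time_budget * safety_margin)) + get_max_k
assert buffer_size == 33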
+ buffer_size = (int(np.ceil(put_desired_frequency * get_time_budget * safety_margin)) + get_max_k) + + # allocate shared memory + shared_arrays = dict() + for spec in array_specs: + key = spec.name + assert key not in shared_arrays + array = SharedNDArray.create_from_shape( + mem_mgr=shm_manager, + shape=(buffer_size, ) + tuple(spec.shape), + dtype=spec.dtype, + ) + shared_arrays[key] = array + + # allocate timestamp array + timestamp_array = SharedNDArray.create_from_shape(mem_mgr=shm_manager, shape=(buffer_size, ), dtype=np.float64) + timestamp_array.get()[:] = -np.inf + + self.buffer_size = buffer_size + self.array_specs = array_specs + self.counter = counter + self.shared_arrays = shared_arrays + self.timestamp_array = timestamp_array + self.get_time_budget = get_time_budget + self.get_max_k = get_max_k + self.put_desired_frequency = put_desired_frequency + + @property + def count(self): + return self.counter.load() + + @classmethod + def create_from_examples( + cls, + shm_manager: SharedMemoryManager, + examples: Dict[str, Union[np.ndarray, numbers.Number]], + get_max_k: int = 32, + get_time_budget: float = 0.01, + put_desired_frequency: float = 60, + ): + specs = list() + for key, value in examples.items(): + shape = None + dtype = None + if isinstance(value, np.ndarray): + shape = value.shape + dtype = value.dtype + assert dtype != np.dtype("O") + elif isinstance(value, numbers.Number): + shape = tuple() + dtype = np.dtype(type(value)) + else: + raise TypeError(f"Unsupported type {type(value)}") + + spec = ArraySpec(name=key, shape=shape, dtype=dtype) + specs.append(spec) + + obj = cls( + shm_manager=shm_manager, + array_specs=specs, + get_max_k=get_max_k, + get_time_budget=get_time_budget, + put_desired_frequency=put_desired_frequency, + ) + return obj + + def clear(self): + self.counter.store(0) + + def put(self, data: Dict[str, Union[np.ndarray, numbers.Number]], wait: bool = True): + count = self.counter.load() + next_idx = count % self.buffer_size + # Make sure the next self.get_max_k elements in the ring buffer have at least + # self.get_time_budget seconds untouched after written, so that + # get_last_k can safely read k elements from any count location. + # Sanity check: when get_max_k == 1, the element pointed by next_idx + # should be rewritten at minimum self.get_time_budget seconds later. 
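A toy check of the look-ahead guard computed just below: before overwriting slot next_idx, put() inspects the timestamp of the slot that a reader fetching the latest get_max_k items might still be copying:

# With the buffer_size = 33 / get_max_k = 32 example from above, writing
# slot 5 guards slot (5 + 32 - 1) % 33 = 3.
buffer_size, get_max_k = 33, 32
next_idx = 5
timestamp_lookahead_idx = (next_idx + get_max_k - 1) % buffer_size
assert timestamp_lookahead_idx == 3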
+ timestamp_lookahead_idx = (next_idx + self.get_max_k - 1) % self.buffer_size + old_timestamp = self.timestamp_array.get()[timestamp_lookahead_idx] + t = time.monotonic() + if (t - old_timestamp) < self.get_time_budget: + deltat = t - old_timestamp + if wait: + # sleep the remaining time to be safe + time.sleep(self.get_time_budget - deltat) + else: + # throw an error + past_iters = self.buffer_size - self.get_max_k + hz = past_iters / deltat + raise TimeoutError("Put executed too fast {}items/{:.4f}s ~= {}Hz".format(past_iters, deltat, hz)) + + # write to shared memory + for key, value in data.items(): + arr: np.ndarray + arr = self.shared_arrays[key].get() + if isinstance(value, np.ndarray): + arr[next_idx] = value + else: + arr[next_idx] = np.array(value, dtype=arr.dtype) + + # update timestamp + self.timestamp_array.get()[next_idx] = time.monotonic() + self.counter.add(1) + + def _allocate_empty(self, k=None): + result = dict() + for spec in self.array_specs: + shape = spec.shape + if k is not None: + shape = (k, ) + shape + result[spec.name] = np.empty(shape=shape, dtype=spec.dtype) + return result + + def get(self, out=None) -> Dict[str, np.ndarray]: + if out is None: + out = self._allocate_empty() + start_time = time.monotonic() + count = self.counter.load() + curr_idx = (count - 1) % self.buffer_size + for key, value in self.shared_arrays.items(): + arr = value.get() + np.copyto(out[key], arr[curr_idx]) + end_time = time.monotonic() + dt = end_time - start_time + if dt > self.get_time_budget: + raise TimeoutError(f"Get time out {dt} vs {self.get_time_budget}") + return out + + def get_last_k(self, k: int, out=None) -> Dict[str, np.ndarray]: + assert k <= self.get_max_k + if out is None: + out = self._allocate_empty(k) + start_time = time.monotonic() + count = self.counter.load() + assert k <= count + curr_idx = (count - 1) % self.buffer_size + for key, value in self.shared_arrays.items(): + arr = value.get() + target = out[key] + + end = curr_idx + 1 + start = max(0, end - k) + target_end = k + target_start = target_end - (end - start) + target[target_start:target_end] = arr[start:end] + + remainder = k - (end - start) + if remainder > 0: + # wrap around + end = self.buffer_size + start = end - remainder + target_start = 0 + target_end = end - start + target[target_start:target_end] = arr[start:end] + end_time = time.monotonic() + dt = end_time - start_time + if dt > self.get_time_budget: + raise TimeoutError(f"Get time out {dt} vs {self.get_time_budget}") + return out + + def get_all(self) -> Dict[str, np.ndarray]: + k = min(self.count, self.get_max_k) + return self.get_last_k(k=k) diff --git a/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_util.py b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_util.py new file mode 100644 index 0000000..2396208 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_memory_util.py @@ -0,0 +1,38 @@ +from typing import Tuple +from dataclasses import dataclass +import numpy as np +from multiprocessing.managers import SharedMemoryManager +from atomics import atomicview, MemoryOrder, UINT + + +@dataclass +class ArraySpec: + name: str + shape: Tuple[int] + dtype: np.dtype + + +class SharedAtomicCounter: + + def __init__(self, shm_manager: SharedMemoryManager, size: int = 8): # 64bit int + shm = shm_manager.SharedMemory(size=size) + self.shm = shm + self.size = size + self.store(0) # initialize + + @property + def buf(self): + return self.shm.buf[:self.size] + + def load(self) -> int: + with 
atomicview(buffer=self.buf, atype=UINT) as a: + value = a.load(order=MemoryOrder.ACQUIRE) + return value + + def store(self, value: int): + with atomicview(buffer=self.buf, atype=UINT) as a: + a.store(value, order=MemoryOrder.RELEASE) + + def add(self, value: int): + with atomicview(buffer=self.buf, atype=UINT) as a: + a.add(value, order=MemoryOrder.ACQ_REL) diff --git a/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_ndarray.py b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_ndarray.py new file mode 100644 index 0000000..d027bb8 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/shared_memory/shared_ndarray.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import multiprocessing +import multiprocessing.synchronize +from multiprocessing.managers import SharedMemoryManager +from multiprocessing.shared_memory import SharedMemory +from typing import Any, TYPE_CHECKING, Generic, Optional, Tuple, TypeVar, Union + +import numpy as np +import numpy.typing as npt +from diffusion_policy.common.nested_dict_util import nested_dict_check, nested_dict_map + +SharedMemoryLike = Union[str, SharedMemory] # shared memory or name of shared memory +SharedT = TypeVar("SharedT", bound=np.generic) + + +class SharedNDArray(Generic[SharedT]): + """Class to keep track of and retrieve the data in a shared array + Attributes + ---------- + shm + SharedMemory object containing the data of the array + shape + Shape of the NumPy array + dtype + Type of the NumPy array. Anything that may be passed to the `dtype=` argument in `np.ndarray`. + lock + (Optional) multiprocessing.Lock to manage access to the SharedNDArray. This is only created if + lock=True is passed to the constructor, otherwise it is set to `None`. + A SharedNDArray object may be created either directly with a preallocated shared memory object plus the + dtype and shape of the numpy array it represents: + >>> from multiprocessing.shared_memory import SharedMemory + >>> import numpy as np + >>> from shared_ndarray2 import SharedNDArray + >>> x = np.array([1, 2, 3]) + >>> shm = SharedMemory(name="x", create=True, size=x.nbytes) + >>> arr = SharedNDArray(shm, x.shape, x.dtype) + >>> arr[:] = x[:] # copy x into the array + >>> print(arr[:]) + [1 2 3] + >>> shm.close() + >>> shm.unlink() + Or using a SharedMemoryManager either from an existing array or from arbitrary shape and nbytes: + >>> from multiprocessing.managers import SharedMemoryManager + >>> mem_mgr = SharedMemoryManager() + >>> mem_mgr.start() # Better yet, use SharedMemoryManager context manager + >>> arr = SharedNDArray.from_shape(mem_mgr, x.shape, x.dtype) + >>> arr[:] = x[:] # copy x into the array + >>> print(arr[:]) + [1 2 3] + >>> # -or in one step- + >>> arr = SharedNDArray.from_array(mem_mgr, x) + >>> print(arr[:]) + [1 2 3] + `SharedNDArray` does not subclass numpy.ndarray but rather generates an ndarray on-the-fly in get(), + which is used in __getitem__ and __setitem__. Thus to access the data and/or use any ndarray methods + get() or __getitem__ or __setitem__ must be used + >>> arr.max() # ERROR: SharedNDArray has no `max` method. + Traceback (most recent call last): + .... + AttributeError: SharedNDArray object has no attribute 'max'. To access NumPy ndarray object use .get() method. + >>> arr.get().max() # (or arr[:].max()) OK: This gets an ndarray on which we can operate + 3 + >>> y = np.zeros(3) + >>> y[:] = arr # ERROR: Cannot broadcast-assign a SharedNDArray to ndarray `y` + Traceback (most recent call last): + ... 
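SharedAtomicCounter above is just 8 bytes of shared memory driven through the atomics package, so every process attached to the same SharedMemoryManager sees one monotonic count. A minimal single-process sanity check:

from multiprocessing.managers import SharedMemoryManager

from diffusion_policy.shared_memory.shared_memory_util import SharedAtomicCounter

with SharedMemoryManager() as shm_manager:
    counter = SharedAtomicCounter(shm_manager)  # store(0) happens in __init__
    counter.add(5)
    counter.add(3)
    assert counter.load() == 8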
+ ValueError: setting an array element with a sequence. + >>> y[:] = arr[:] # OK: This gets an ndarray that can be copied element-wise to `y` + >>> mem_mgr.shutdown() + """ + + shm: SharedMemory + # shape: Tuple[int, ...] # is a property + dtype: np.dtype + lock: Optional[multiprocessing.synchronize.Lock] + + def __init__(self, shm: SharedMemoryLike, shape: Tuple[int, ...], dtype: npt.DTypeLike): + """Initialize a SharedNDArray object from existing shared memory, object shape, and dtype. + To initialize a SharedNDArray object from a memory manager and data or shape, use the `from_array() + or `from_shape()` classmethods. + Parameters + ---------- + shm + `multiprocessing.shared_memory.SharedMemory` object or name for connecting to an existing block + of shared memory (using SharedMemory constructor) + shape + Shape of the NumPy array to be represented in the shared memory + dtype + Data type for the NumPy array to be represented in shared memory. Any valid argument for + `np.dtype` may be used as it will be converted to an actual `dtype` object. + lock : bool, optional + If True, create a multiprocessing.Lock object accessible with the `.lock` attribute, by default + False. If passing the `SharedNDArray` as an argument to a `multiprocessing.Pool` function this + should not be used -- see this comment to a Stack Overflow question about `multiprocessing.Lock`: + https://stackoverflow.com/questions/25557686/python-sharing-a-lock-between-processes#comment72803059_25558333 + Raises + ------ + ValueError + The SharedMemory size (number of bytes) does not match the product of the shape and dtype + itemsize. + """ + if isinstance(shm, str): + shm = SharedMemory(name=shm, create=False) + dtype = np.dtype(dtype) # Try to convert to dtype + assert shm.size >= (dtype.itemsize * np.prod(shape)) + self.shm = shm + self.dtype = dtype + self._shape: Tuple[int, ...] = shape + + def __repr__(self): + # Like numpy's ndarray repr + cls_name = self.__class__.__name__ + nspaces = len(cls_name) + 1 + array_repr = str(self.get()) + array_repr = array_repr.replace("\n", "\n" + " " * nspaces) + return f"{cls_name}({array_repr}, dtype={self.dtype})" + + @classmethod + def create_from_array(cls, mem_mgr: SharedMemoryManager, arr: npt.NDArray[SharedT]) -> SharedNDArray[SharedT]: + """Create a SharedNDArray from a SharedMemoryManager and an existing numpy array. + Parameters + ---------- + mem_mgr + Running `multiprocessing.managers.SharedMemoryManager` instance from which to create the + SharedMemory for the SharedNDArray + arr + NumPy `ndarray` object to copy into the created SharedNDArray upon initialization. + """ + # Simply use from_shape() to create the SharedNDArray and copy the data into it. + shared_arr = cls.create_from_shape(mem_mgr, arr.shape, arr.dtype) + shared_arr.get()[:] = arr[:] + return shared_arr + + @classmethod + def create_from_shape(cls, mem_mgr: SharedMemoryManager, shape: Tuple, dtype: npt.DTypeLike) -> SharedNDArray: + """Create a SharedNDArray directly from a SharedMemoryManager + Parameters + ---------- + mem_mgr + SharedMemoryManager instance that has been started + shape + Shape of the array + dtype + Data type for the NumPy array to be represented in shared memory. Any valid argument for + `np.dtype` may be used as it will be converted to an actual `dtype` object. 
+ """ + dtype = np.dtype(dtype) # Convert to dtype if possible + shm = mem_mgr.SharedMemory(np.prod(shape) * dtype.itemsize) + return cls(shm=shm, shape=shape, dtype=dtype) + + @property + def shape(self) -> Tuple[int, ...]: + return self._shape + + def get(self) -> npt.NDArray[SharedT]: + """Get a numpy array with access to the shared memory""" + return np.ndarray(self.shape, dtype=self.dtype, buffer=self.shm.buf) + + def __del__(self): + self.shm.close() diff --git a/r2sVLA/algos/DP/diffusion_policy/workspace/base_workspace.py b/r2sVLA/algos/DP/diffusion_policy/workspace/base_workspace.py new file mode 100644 index 0000000..d11c157 --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/workspace/base_workspace.py @@ -0,0 +1,138 @@ +from typing import Optional +import os +import pathlib +import hydra +import copy +from hydra.core.hydra_config import HydraConfig +from omegaconf import OmegaConf +import dill +import torch +import threading + + +class BaseWorkspace: + include_keys = tuple() + exclude_keys = tuple() + + def __init__(self, cfg: OmegaConf, output_dir: Optional[str] = None): + self.cfg = cfg + self._output_dir = output_dir + self._saving_thread = None + + @property + def output_dir(self): + output_dir = self._output_dir + if output_dir is None: + output_dir = HydraConfig.get().runtime.output_dir + return output_dir + + def run(self): + """ + Create any resource shouldn't be serialized as local variables + """ + pass + + def save_checkpoint( + self, + path=None, + tag="latest", + exclude_keys=None, + include_keys=None, + use_thread=True, + ): + if path is None: + path = pathlib.Path(self.output_dir).joinpath("checkpoints", f"{tag}.ckpt") + else: + path = pathlib.Path(path) + if exclude_keys is None: + exclude_keys = tuple(self.exclude_keys) + if include_keys is None: + include_keys = tuple(self.include_keys) + ("_output_dir", ) + + path.parent.mkdir(parents=True, exist_ok=True) + payload = {"cfg": self.cfg, "state_dicts": dict(), "pickles": dict()} + + for key, value in self.__dict__.items(): + if hasattr(value, "state_dict") and hasattr(value, "load_state_dict"): + # modules, optimizers and samplers etc + if key not in exclude_keys: + if use_thread: + payload["state_dicts"][key] = _copy_to_cpu(value.state_dict()) + else: + payload["state_dicts"][key] = value.state_dict() + elif key in include_keys: + payload["pickles"][key] = dill.dumps(value) + if use_thread: + self._saving_thread = threading.Thread( + target=lambda: torch.save(payload, path.open("wb"), pickle_module=dill)) + self._saving_thread.start() + else: + torch.save(payload, path.open("wb"), pickle_module=dill) + return str(path.absolute()) + + def get_checkpoint_path(self, tag="latest"): + return pathlib.Path(self.output_dir).joinpath("checkpoints", f"{tag}.ckpt") + + def load_payload(self, payload, exclude_keys=None, include_keys=None, **kwargs): + if exclude_keys is None: + exclude_keys = tuple() + if include_keys is None: + include_keys = payload["pickles"].keys() + + for key, value in payload["state_dicts"].items(): + if key not in exclude_keys: + self.__dict__[key].load_state_dict(value, **kwargs) + for key in include_keys: + if key in payload["pickles"]: + self.__dict__[key] = dill.loads(payload["pickles"][key]) + + def load_checkpoint(self, path=None, tag="latest", exclude_keys=None, include_keys=None, **kwargs): + if path is None: + path = self.get_checkpoint_path(tag=tag) + else: + path = pathlib.Path(path) + payload = torch.load(path.open("rb"), pickle_module=dill, **kwargs) + self.load_payload(payload, 
exclude_keys=exclude_keys, include_keys=include_keys) + return payload + + @classmethod + def create_from_checkpoint(cls, path, exclude_keys=None, include_keys=None, **kwargs): + payload = torch.load(open(path, "rb"), pickle_module=dill) + instance = cls(payload["cfg"]) + instance.load_payload( + payload=payload, + exclude_keys=exclude_keys, + include_keys=include_keys, + **kwargs, + ) + return instance + + def save_snapshot(self, tag="latest"): + """ + Quick loading and saving for reserach, saves full state of the workspace. + + However, loading a snapshot assumes the code stays exactly the same. + Use save_checkpoint for long-term storage. + """ + path = pathlib.Path(self.output_dir).joinpath("snapshots", f"{tag}.pkl") + path.parent.mkdir(parents=False, exist_ok=True) + torch.save(self, path.open("wb"), pickle_module=dill) + return str(path.absolute()) + + @classmethod + def create_from_snapshot(cls, path): + return torch.load(open(path, "rb"), pickle_module=dill) + + +def _copy_to_cpu(x): + if isinstance(x, torch.Tensor): + return x.detach().to("cpu") + elif isinstance(x, dict): + result = dict() + for k, v in x.items(): + result[k] = _copy_to_cpu(v) + return result + elif isinstance(x, list): + return [_copy_to_cpu(k) for k in x] + else: + return copy.deepcopy(x) diff --git a/r2sVLA/algos/DP/diffusion_policy/workspace/robotworkspace.py b/r2sVLA/algos/DP/diffusion_policy/workspace/robotworkspace.py new file mode 100644 index 0000000..8575a7e --- /dev/null +++ b/r2sVLA/algos/DP/diffusion_policy/workspace/robotworkspace.py @@ -0,0 +1,348 @@ +if __name__ == "__main__": + import sys + import os + import pathlib + + ROOT_DIR = str(pathlib.Path(__file__).parent.parent.parent) + sys.path.append(ROOT_DIR) + os.chdir(ROOT_DIR) + +import os +import hydra +import torch +from omegaconf import OmegaConf +import pathlib +from torch.utils.data import DataLoader +import copy + +import tqdm, random +import numpy as np +from diffusion_policy.workspace.base_workspace import BaseWorkspace +from diffusion_policy.policy.diffusion_unet_image_policy import DiffusionUnetImagePolicy +from diffusion_policy.dataset.base_dataset import BaseImageDataset +from diffusion_policy.common.checkpoint_util import TopKCheckpointManager +from diffusion_policy.common.json_logger import JsonLogger +from diffusion_policy.common.pytorch_util import dict_apply, optimizer_to +from diffusion_policy.model.diffusion.ema_model import EMAModel +from diffusion_policy.model.common.lr_scheduler import get_scheduler + +OmegaConf.register_new_resolver("eval", eval, replace=True) + + +class RobotWorkspace(BaseWorkspace): + include_keys = ["global_step", "epoch"] + + def __init__(self, cfg: OmegaConf, output_dir=None): + super().__init__(cfg, output_dir=output_dir) + + # set seed + seed = cfg.training.seed + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + # configure model + self.model: DiffusionUnetImagePolicy = hydra.utils.instantiate(cfg.policy) + + self.ema_model: DiffusionUnetImagePolicy = None + if cfg.training.use_ema: + self.ema_model = copy.deepcopy(self.model) + + # configure training state + self.optimizer = hydra.utils.instantiate(cfg.optimizer, params=self.model.parameters()) + + # configure training state + self.global_step = 0 + self.epoch = 0 + + def run(self): + cfg = copy.deepcopy(self.cfg) + seed = cfg.training.seed + head_camera_type = cfg.head_camera_type + + # resume training + if cfg.training.resume: + lastest_ckpt_path = self.get_checkpoint_path() + if lastest_ckpt_path.is_file(): + 
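A minimal sketch of the BaseWorkspace checkpoint contract above (ToyWorkspace and its fields are hypothetical): anything exposing state_dict()/load_state_dict() lands in payload["state_dicts"], while attributes named in include_keys are dill-pickled:

import torch
from omegaconf import OmegaConf

from diffusion_policy.workspace.base_workspace import BaseWorkspace


class ToyWorkspace(BaseWorkspace):
    include_keys = ("global_step",)  # plain attributes to dill-pickle

    def __init__(self, cfg, output_dir=None):
        super().__init__(cfg, output_dir=output_dir)
        self.model = torch.nn.Linear(2, 2)  # saved via its state_dict
        self.global_step = 0


ws = ToyWorkspace(OmegaConf.create({"name": "toy"}), output_dir="/tmp/toy_ws")
ckpt_path = ws.save_checkpoint(use_thread=False)  # .../checkpoints/latest.ckpt
ws.load_checkpoint(path=ckpt_path)  # restores state_dicts and pickled keys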
print(f"Resuming from checkpoint {lastest_ckpt_path}") + self.load_checkpoint(path=lastest_ckpt_path) + + # configure dataset + dataset: BaseImageDataset + dataset = hydra.utils.instantiate(cfg.task.dataset) + assert isinstance(dataset, BaseImageDataset) + train_dataloader = create_dataloader(dataset, **cfg.dataloader) + normalizer = dataset.get_normalizer() + + # configure validation dataset + val_dataset = dataset.get_validation_dataset() + val_dataloader = create_dataloader(val_dataset, **cfg.val_dataloader) + + self.model.set_normalizer(normalizer) + if cfg.training.use_ema: + self.ema_model.set_normalizer(normalizer) + + # configure lr scheduler + lr_scheduler = get_scheduler( + cfg.training.lr_scheduler, + optimizer=self.optimizer, + num_warmup_steps=cfg.training.lr_warmup_steps, + num_training_steps=(len(train_dataloader) * cfg.training.num_epochs) // + cfg.training.gradient_accumulate_every, + # pytorch assumes stepping LRScheduler every epoch + # however huggingface diffusers steps it every batch + last_epoch=self.global_step - 1, + ) + + # configure ema + ema: EMAModel = None + if cfg.training.use_ema: + ema = hydra.utils.instantiate(cfg.ema, model=self.ema_model) + + # configure env + # env_runner: BaseImageRunner + # env_runner = hydra.utils.instantiate( + # cfg.task.env_runner, + # output_dir=self.output_dir) + # assert isinstance(env_runner, BaseImageRunner) + env_runner = None + + # configure logging + # wandb_run = wandb.init( + # dir=str(self.output_dir), + # config=OmegaConf.to_container(cfg, resolve=True), + # **cfg.logging + # ) + # wandb.config.update( + # { + # "output_dir": self.output_dir, + # } + # ) + + # configure checkpoint + topk_manager = TopKCheckpointManager(save_dir=os.path.join(self.output_dir, "checkpoints"), + **cfg.checkpoint.topk) + + # device transfer + device = torch.device(cfg.training.device) + self.model.to(device) + if self.ema_model is not None: + self.ema_model.to(device) + optimizer_to(self.optimizer, device) + + # save batch for sampling + train_sampling_batch = None + + if cfg.training.debug: + cfg.training.num_epochs = 2 + cfg.training.max_train_steps = 3 + cfg.training.max_val_steps = 3 + cfg.training.rollout_every = 1 + cfg.training.checkpoint_every = 1 + cfg.training.val_every = 1 + cfg.training.sample_every = 1 + + # training loop + log_path = os.path.join(self.output_dir, "logs.json.txt") + + with JsonLogger(log_path) as json_logger: + for local_epoch_idx in range(cfg.training.num_epochs): + step_log = dict() + # ========= train for this epoch ========== + if cfg.training.freeze_encoder: + self.model.obs_encoder.eval() + self.model.obs_encoder.requires_grad_(False) + + train_losses = list() + with tqdm.tqdm( + train_dataloader, + desc=f"Training epoch {self.epoch}", + leave=False, + mininterval=cfg.training.tqdm_interval_sec, + ) as tepoch: + for batch_idx, batch in enumerate(tepoch): + batch = dataset.postprocess(batch, device) + if train_sampling_batch is None: + train_sampling_batch = batch + # compute loss + raw_loss = self.model.compute_loss(batch) + loss = raw_loss / cfg.training.gradient_accumulate_every + loss.backward() + + # step optimizer + if (self.global_step % cfg.training.gradient_accumulate_every == 0): + self.optimizer.step() + self.optimizer.zero_grad() + lr_scheduler.step() + + # update ema + if cfg.training.use_ema: + ema.step(self.model) + + # logging + raw_loss_cpu = raw_loss.item() + tepoch.set_postfix(loss=raw_loss_cpu, refresh=False) + train_losses.append(raw_loss_cpu) + step_log = { + "train_loss": 
raw_loss_cpu, + "global_step": self.global_step, + "epoch": self.epoch, + "lr": lr_scheduler.get_last_lr()[0], + } + + is_last_batch = batch_idx == (len(train_dataloader) - 1) + if not is_last_batch: + # log of last step is combined with validation and rollout + json_logger.log(step_log) + self.global_step += 1 + + if (cfg.training.max_train_steps + is not None) and batch_idx >= (cfg.training.max_train_steps - 1): + break + + # at the end of each epoch + # replace train_loss with epoch average + train_loss = np.mean(train_losses) + step_log["train_loss"] = train_loss + + # ========= eval for this epoch ========== + policy = self.model + if cfg.training.use_ema: + policy = self.ema_model + policy.eval() + + # run rollout + # if (self.epoch % cfg.training.rollout_every) == 0: + # runner_log = env_runner.run(policy) + # # log all + # step_log.update(runner_log) + + # run validation + if (self.epoch % cfg.training.val_every) == 0: + with torch.no_grad(): + val_losses = list() + with tqdm.tqdm( + val_dataloader, + desc=f"Validation epoch {self.epoch}", + leave=False, + mininterval=cfg.training.tqdm_interval_sec, + ) as tepoch: + for batch_idx, batch in enumerate(tepoch): + batch = dataset.postprocess(batch, device) + loss = self.model.compute_loss(batch) + val_losses.append(loss) + if (cfg.training.max_val_steps + is not None) and batch_idx >= (cfg.training.max_val_steps - 1): + break + if len(val_losses) > 0: + val_loss = torch.mean(torch.tensor(val_losses)).item() + # log epoch average validation loss + step_log["val_loss"] = val_loss + + # run diffusion sampling on a training batch + if (self.epoch % cfg.training.sample_every) == 0: + with torch.no_grad(): + # sample trajectory from training set, and evaluate difference + batch = train_sampling_batch + obs_dict = batch["obs"] + gt_action = batch["action"] + + result = policy.predict_action(obs_dict) + pred_action = result["action_pred"] + mse = torch.nn.functional.mse_loss(pred_action, gt_action) + step_log["train_action_mse_error"] = mse.item() + del batch + del obs_dict + del gt_action + del result + del pred_action + del mse + + # checkpoint + if ((self.epoch + 1) % cfg.training.checkpoint_every) == 0: + # checkpointing + save_name = pathlib.Path(self.cfg.task.dataset.zarr_path).stem + self.save_checkpoint(f"checkpoints/{save_name}-{seed}/{self.epoch + 1}.ckpt") # TODO + + # ========= eval end for this epoch ========== + policy.train() + + # end of epoch + # log of last step is combined with validation and rollout + json_logger.log(step_log) + self.global_step += 1 + self.epoch += 1 + + +class BatchSampler: + + def __init__( + self, + data_size: int, + batch_size: int, + shuffle: bool = False, + seed: int = 0, + drop_last: bool = True, + ): + assert drop_last + self.data_size = data_size + self.batch_size = batch_size + self.num_batch = data_size // batch_size + self.discard = data_size - batch_size * self.num_batch + self.shuffle = shuffle + self.rng = np.random.default_rng(seed) if shuffle else None + + def __iter__(self): + if self.shuffle: + perm = self.rng.permutation(self.data_size) + else: + perm = np.arange(self.data_size) + if self.discard > 0: + perm = perm[:-self.discard] + perm = perm.reshape(self.num_batch, self.batch_size) + for i in range(self.num_batch): + yield perm[i] + + def __len__(self): + return self.num_batch + + +def create_dataloader( + dataset, + *, + batch_size: int, + shuffle: bool, + num_workers: int, + pin_memory: bool, + persistent_workers: bool, + seed: int = 0, +): + batch_sampler = 
BatchSampler(len(dataset), batch_size, shuffle=shuffle, seed=seed, drop_last=True) + + def collate(x): + assert len(x) == 1 + return x[0] + + dataloader = DataLoader( + dataset, + collate_fn=collate, + sampler=batch_sampler, + num_workers=num_workers, + pin_memory=False, + persistent_workers=persistent_workers, + ) + return dataloader + + +@hydra.main( + version_base=None, + config_path=str(pathlib.Path(__file__).parent.parent.joinpath("config")), + config_name=pathlib.Path(__file__).stem, +) +def main(cfg): + workspace = RobotWorkspace(cfg) + workspace.run() + + +if __name__ == "__main__": + main() diff --git a/r2sVLA/algos/DP/dp_model.py b/r2sVLA/algos/DP/dp_model.py new file mode 100644 index 0000000..7a84fd4 --- /dev/null +++ b/r2sVLA/algos/DP/dp_model.py @@ -0,0 +1,51 @@ +import numpy as np +import torch +import hydra +import dill +import sys, os + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) +sys.path.append(parent_dir) + +from diffusion_policy.workspace.robotworkspace import RobotWorkspace +from diffusion_policy.env_runner.dp_runner import DPRunner + +class DP: + + def __init__(self, ckpt_file: str, n_obs_steps, n_action_steps): + self.policy = self.get_policy(ckpt_file, None, "cuda:0") + self.runner = DPRunner(n_obs_steps=n_obs_steps, n_action_steps=n_action_steps) + + def update_obs(self, observation): + self.runner.update_obs(observation) + + def reset_obs(self): + self.runner.reset_obs() + + def get_action(self, observation=None): + action = self.runner.get_action(self.policy, observation) + return action + + def get_last_obs(self): + return self.runner.obs[-1] + + def get_policy(self, checkpoint, output_dir, device): + # load checkpoint + payload = torch.load(open(checkpoint, "rb"), pickle_module=dill) + cfg = payload["cfg"] + cls = hydra.utils.get_class(cfg._target_) + workspace = cls(cfg, output_dir=output_dir) + workspace: RobotWorkspace + workspace.load_payload(payload, exclude_keys=None, include_keys=None) + + # get policy from workspace + policy = workspace.model + if cfg.training.use_ema: + policy = workspace.ema_model + + device = torch.device(device) + policy.to(device) + policy.eval() + + return policy diff --git a/r2sVLA/algos/DP/eval.sh b/r2sVLA/algos/DP/eval.sh new file mode 100644 index 0000000..75365f9 --- /dev/null +++ b/r2sVLA/algos/DP/eval.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# == keep unchanged == +policy_name=DP +task_name=${1} +task_config=${2} +ckpt_setting=${3} +expert_data_num=${4} +seed=${5} +gpu_id=${6} +DEBUG=False + +export CUDA_VISIBLE_DEVICES=${gpu_id} +echo -e "\033[33mgpu id (to use): ${gpu_id}\033[0m" + +cd ../.. + +PYTHONWARNINGS=ignore::UserWarning \ +python script/eval_policy.py --config policy/$policy_name/deploy_policy.yml \ + --overrides \ + --task_name ${task_name} \ + --task_config ${task_config} \ + --ckpt_setting ${ckpt_setting} \ + --expert_data_num ${expert_data_num} \ + --seed ${seed} \ No newline at end of file diff --git a/r2sVLA/algos/DP/eval_double_env.sh b/r2sVLA/algos/DP/eval_double_env.sh new file mode 100644 index 0000000..7c4e630 --- /dev/null +++ b/r2sVLA/algos/DP/eval_double_env.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# == keep unchanged == +policy_name=DP +task_name=${1} +task_config=${2} +ckpt_setting=${3} +expert_data_num=${4} +seed=${5} +gpu_id=${6} +policy_conda_env=${7} + +export CUDA_VISIBLE_DEVICES=${gpu_id} +echo -e "\033[33mgpu id (to use): ${gpu_id}\033[0m" + +cd ../.. 
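The BatchSampler/create_dataloader pair in robotworkspace.py above yields whole index batches, so the dataset gathers one batch per __getitem__ call and collate() merely unwraps it. A pure-numpy mirror of the index arithmetic, with toy sizes:

import numpy as np

# The tail that does not fill a batch is discarded (drop_last=True), and
# each yielded element is a full batch of indices.
data_size, batch_size = 10, 4
num_batch = data_size // batch_size
discard = data_size - batch_size * num_batch
perm = np.arange(data_size)
if discard > 0:
    perm = perm[:-discard]
batches = perm.reshape(num_batch, batch_size)
assert [b.tolist() for b in batches] == [[0, 1, 2, 3], [4, 5, 6, 7]]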
+ +yaml_file="policy/${policy_name}/deploy_policy.yml" + +echo "policy_conda_env is '$policy_conda_env'" + +# Find an available port +FREE_PORT=$(python3 - << 'EOF' +import socket +with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + print(s.getsockname()[1]) +EOF +) +echo -e "\033[33mUsing socket port: ${FREE_PORT}\033[0m" + +# Start the server in the background +echo -e "\033[32m[server] Activating Conda environment: ${policy_conda_env}\033[0m" +source "$(conda info --base)/etc/profile.d/conda.sh" +conda activate "${policy_conda_env}" + +echo -e "\033[32m[server] Launching policy_model_server (PID will be recorded)...\033[0m" +PYTHONWARNINGS=ignore::UserWarning \ +python script/policy_model_server.py \ + --port ${FREE_PORT} \ + --policy_conda_env ${policy_conda_env} \ + --config policy/${policy_name}/deploy_policy.yml \ + --overrides \ + --task_name ${task_name} \ + --task_config ${task_config} \ + --ckpt_setting ${ckpt_setting} \ + --expert_data_num ${expert_data_num} \ + --seed ${seed} & +SERVER_PID=$! + +# Ensure the server is killed when this script exits +trap "echo -e '\033[31m[cleanup] Killing server (PID=${SERVER_PID})\033[0m'; kill ${SERVER_PID} 2>/dev/null" EXIT + +conda deactivate + +# Start the client in the foreground +echo -e "\033[34m[client] Starting eval_policy_client on port ${FREE_PORT}...\033[0m" +PYTHONWARNINGS=ignore::UserWarning \ +python script/eval_policy_client.py \ + --port ${FREE_PORT} \ + --policy_conda_env ${policy_conda_env} \ + --config policy/${policy_name}/deploy_policy.yml \ + --overrides \ + --task_name ${task_name} \ + --task_config ${task_config} \ + --ckpt_setting ${ckpt_setting} \ + --expert_data_num ${expert_data_num} \ + --seed ${seed} + +echo -e "\033[33m[main] eval_policy_client has finished; the server will be terminated.\033[0m" diff --git a/r2sVLA/algos/DP/process_data.py b/r2sVLA/algos/DP/process_data.py new file mode 100644 index 0000000..fb3b6b7 --- /dev/null +++ b/r2sVLA/algos/DP/process_data.py @@ -0,0 +1,260 @@ +import pickle, os +import numpy as np +import pdb +from copy import deepcopy +import zarr +import shutil +import argparse +import yaml +import cv2 +import h5py + + +def load_hdf5(dataset_path, use_franka_format=False): + """ + Load HDF5 data. 
+ + Args: + dataset_path: Path to HDF5 file + use_franka_format: If True, load from Franka format (joint_pos + gripper_cmd) + If False, load from ViperX format (left/right arm + gripper) + """ + if not os.path.isfile(dataset_path): + print(f"Dataset does not exist at \n{dataset_path}\n") + exit() + + with h5py.File(dataset_path, "r") as root: + if use_franka_format or '/joint_action/joint_pos' in root: + # Franka format: single arm robot + joint_pos = root["/joint_action/joint_pos"][()] # (T, 7) + gripper_cmd = root["/joint_action/gripper_cmd"][()] # (T, 2) + + # Normalize gripper: 0.0 = close, 1.0 = open + FRANKA_GRIPPER_OPEN = 0.04 + FRANKA_GRIPPER_CLOSE = 0.00 + gripper_normalized = (gripper_cmd[:, 0] - FRANKA_GRIPPER_CLOSE) / (FRANKA_GRIPPER_OPEN - FRANKA_GRIPPER_CLOSE) + gripper_normalized = gripper_normalized.reshape(-1, 1).astype(np.float32) + + # Combine to 8D: [7 joints, 1 normalized gripper] + vector = np.concatenate([joint_pos, gripper_normalized], axis=1).astype(np.float32) # (T, 8) + + # For compatibility, return empty arrays for left/right + left_gripper = np.array([]) + left_arm = np.array([]) + right_gripper = np.array([]) + right_arm = np.array([]) + else: + # ViperX format: dual arm robot + left_gripper, left_arm = ( + root["/joint_action/left_gripper"][()], + root["/joint_action/left_arm"][()], + ) + right_gripper, right_arm = ( + root["/joint_action/right_gripper"][()], + root["/joint_action/right_arm"][()], + ) + vector = root["/joint_action/vector"][()] + + image_dict = dict() + camera_config = dict() # Store camera intrinsics/extrinsics and computed fovy + for cam_name in root[f"/observation/"].keys(): + image_dict[cam_name] = root[f"/observation/{cam_name}/rgb"][()] + # Store camera config if available + cam_group = root[f"/observation/{cam_name}"] + if 'intrinsics' in cam_group: + intrinsics = cam_group['intrinsics'][:] + camera_config[f"{cam_name}_intrinsics"] = intrinsics + # Calculate fovy from intrinsics (will need image resolution) + # fovy = 2 * arctan(h / (2 * fy)) * 180 / π + # We'll calculate this after we decode the first image + camera_config[f"{cam_name}_fy"] = float(intrinsics[1, 1]) # Store fy for later calculation + if 'extrinsics' in cam_group: + camera_config[f"{cam_name}_extrinsics"] = cam_group['extrinsics'][:] + + return left_gripper, left_arm, right_gripper, right_arm, vector, image_dict, camera_config + + +def main(): + parser = argparse.ArgumentParser(description="Process some episodes.") + parser.add_argument( + "task_name", + type=str, + help="The name of the task (e.g., beat_block_hammer)", + ) + parser.add_argument("task_config", type=str) + parser.add_argument( + "expert_data_num", + type=int, + help="Number of episodes to process (e.g., 50)", + ) + parser.add_argument( + "--use_franka_format", + action="store_true", + help="Use Franka format (single arm, 8D) instead of ViperX format (dual arm, 14D)", + ) + parser.add_argument( + "--data_dir", + type=str, + default=None, + help="Custom data directory path (default: ../../data/{task_name}/{task_config})", + ) + args = parser.parse_args() + + task_name = args.task_name + num = args.expert_data_num + task_config = args.task_config + use_franka_format = args.use_franka_format + + if args.data_dir: + load_dir = args.data_dir + else: + load_dir = "../../data/" + str(task_name) + "/" + str(task_config) + + total_count = 0 + + save_dir = f"./data/{task_name}-{task_config}-{num}.zarr" + + if os.path.exists(save_dir): + shutil.rmtree(save_dir) + + current_ep = 0 + + zarr_root = 
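A worked example of the gripper normalization in load_hdf5 above: 0.00 m (fully closed) maps to 0.0 and 0.04 m (fully open) maps to 1.0:

import numpy as np

FRANKA_GRIPPER_OPEN, FRANKA_GRIPPER_CLOSE = 0.04, 0.00
gripper_cmd = np.array([[0.00, 0.0], [0.02, 0.0], [0.04, 0.0]], dtype=np.float32)
g = (gripper_cmd[:, 0] - FRANKA_GRIPPER_CLOSE) / (FRANKA_GRIPPER_OPEN - FRANKA_GRIPPER_CLOSE)
assert np.allclose(g, [0.0, 0.5, 1.0])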
zarr.group(save_dir)
+    zarr_data = zarr_root.create_group("data")
+    zarr_meta = zarr_root.create_group("meta")
+    zarr_camera = zarr_root.create_group("camera")  # Store camera config
+
+    head_camera_arrays, front_camera_arrays, left_camera_arrays, right_camera_arrays = (
+        [],
+        [],
+        [],
+        [],
+    )
+    episode_ends_arrays, action_arrays, state_arrays, joint_action_arrays = (
+        [],
+        [],
+        [],
+        [],
+    )
+
+    while current_ep < num:
+        print(f"processing episode: {current_ep + 1} / {num}", end="\r")
+
+        # Support both episode naming formats
+        if use_franka_format:
+            # Franka format: episode_000.hdf5, episode_001.hdf5, etc.
+            load_path = os.path.join(load_dir, f"episode_{current_ep:03d}.hdf5")
+            if not os.path.exists(load_path):
+                load_path = os.path.join(load_dir, f"episode_{current_ep}.hdf5")
+        else:
+            # ViperX format: episode0.hdf5, episode1.hdf5, etc.
+            load_path = os.path.join(load_dir, f"data/episode{current_ep}.hdf5")
+
+        (
+            left_gripper_all,
+            left_arm_all,
+            right_gripper_all,
+            right_arm_all,
+            vector_all,
+            image_dict_all,
+            camera_config_all,
+        ) = load_hdf5(load_path, use_franka_format=use_franka_format)
+
+        # Store camera config from first episode
+        if current_ep == 0 and camera_config_all:
+            for key, value in camera_config_all.items():
+                zarr_camera.create_dataset(key, data=value, overwrite=True)
+
+        # Determine episode length from vector_all
+        episode_len = vector_all.shape[0]
+
+        for j in range(0, episode_len):
+            head_img_bit = image_dict_all["head_camera"][j]
+            joint_state = vector_all[j]
+
+            if j != episode_len - 1:
+                head_img = cv2.imdecode(np.frombuffer(head_img_bit, np.uint8), cv2.IMREAD_COLOR)
+                if head_img is None:
+                    print(f"\nWarning: Failed to decode image at episode {current_ep}, timestep {j}")
+                    # Fall back to a black 640x480 frame; the true resolution is
+                    # unknown until at least one frame decodes successfully.
+                    fallback_h, fallback_w = 480, 640
+                    head_img = np.zeros((fallback_h, fallback_w, 3), dtype=np.uint8)
+                # Store resolution and calculate fovy from first image
+                if current_ep == 0 and j == 0 and head_img is not None:
+                    img_h, img_w = head_img.shape[:2]
+                    zarr_meta.attrs['head_camera_h'] = int(img_h)
+                    zarr_meta.attrs['head_camera_w'] = int(img_w)
+
+                    # Calculate fovy from intrinsics if available
+                    if 'head_camera_fy' in camera_config_all:
+                        fy = camera_config_all['head_camera_fy']
+                        # fovy = 2 * arctan(h / (2 * fy)) * 180 / π
+                        fovy_rad = 2 * np.arctan(img_h / (2 * fy))
+                        fovy_deg = np.degrees(fovy_rad)
+                        zarr_meta.attrs['head_camera_fovy'] = float(fovy_deg)
+                        print(f"\n[Camera Config] Detected image resolution: {img_w}x{img_h}")
+                        print(f"[Camera Config] Calculated fovy: {fovy_deg:.2f} degrees (from fy={fy:.2f})")
+                    else:
+                        print(f"\n[Camera Config] Detected image resolution: {img_w}x{img_h}")
+                        print(f"[Camera Config] Warning: No intrinsics found, cannot calculate fovy")
+                head_camera_arrays.append(head_img)
+                state_arrays.append(joint_state)
+            if j != 0:
+                joint_action_arrays.append(joint_state)
+
+        current_ep += 1
+        total_count += episode_len - 1
+        episode_ends_arrays.append(total_count)
+
+    print()
+    episode_ends_arrays = np.array(episode_ends_arrays)
+    # action_arrays = np.array(action_arrays)
+    state_arrays = np.array(state_arrays)
+    head_camera_arrays = np.array(head_camera_arrays)
+    joint_action_arrays = np.array(joint_action_arrays)
+
+    head_camera_arrays = np.moveaxis(head_camera_arrays, -1, 1)  # NHWC -> NCHW
+
+    compressor = zarr.Blosc(cname="zstd", clevel=3,
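A worked example of the fovy formula above, with an assumed focal length fy of 614 px and a 480 px tall image:

import numpy as np

# fovy = 2 * arctan(h / (2 * fy)), converted to degrees
fy, img_h = 614.0, 480
fovy_deg = np.degrees(2 * np.arctan(img_h / (2 * fy)))
assert abs(fovy_deg - 42.7) < 0.1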
shuffle=1) + # action_chunk_size = (100, action_arrays.shape[1]) + state_chunk_size = (100, state_arrays.shape[1]) + joint_chunk_size = (100, joint_action_arrays.shape[1]) + head_camera_chunk_size = (100, *head_camera_arrays.shape[1:]) + zarr_data.create_dataset( + "head_camera", + data=head_camera_arrays, + chunks=head_camera_chunk_size, + overwrite=True, + compressor=compressor, + ) + zarr_data.create_dataset( + "state", + data=state_arrays, + chunks=state_chunk_size, + dtype="float32", + overwrite=True, + compressor=compressor, + ) + zarr_data.create_dataset( + "action", + data=joint_action_arrays, + chunks=joint_chunk_size, + dtype="float32", + overwrite=True, + compressor=compressor, + ) + zarr_meta.create_dataset( + "episode_ends", + data=episode_ends_arrays, + dtype="int64", + overwrite=True, + compressor=compressor, + ) + + +if __name__ == "__main__": + main() diff --git a/r2sVLA/algos/DP/process_data.sh b/r2sVLA/algos/DP/process_data.sh new file mode 100644 index 0000000..986b317 --- /dev/null +++ b/r2sVLA/algos/DP/process_data.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +task_name=${1} +task_config=${2} +expert_data_num=${3} +use_franka_format=${4:-false} # Default to false (ViperX format) +data_dir=${5} # Optional custom data directory + +# Build command +cmd="python process_data.py $task_name $task_config $expert_data_num" + +if [ "$use_franka_format" = "true" ] || [ "$use_franka_format" = "True" ]; then + cmd="$cmd --use_franka_format" +fi + +if [ -n "$data_dir" ]; then + cmd="$cmd --data_dir $data_dir" +fi + +echo "Running: $cmd" +eval $cmd \ No newline at end of file diff --git a/r2sVLA/algos/DP/pyproject.toml b/r2sVLA/algos/DP/pyproject.toml new file mode 100644 index 0000000..ba2028f --- /dev/null +++ b/r2sVLA/algos/DP/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] +requires = ["flit_core >=3.7,<4"] +build-backend = "flit_core.buildapi" + +[project] +name = "diffusion_policy" +version = "0.1.0" +description = "Diffusion policy for RoboTwin" +requires-python = ">=3.8" +dependencies = [ + "hydra-core==1.2.0", + "numba" +] \ No newline at end of file diff --git a/r2sVLA/algos/DP/train.py b/r2sVLA/algos/DP/train.py new file mode 100644 index 0000000..9d05fd8 --- /dev/null +++ b/r2sVLA/algos/DP/train.py @@ -0,0 +1,147 @@ +""" +Usage: +Training: +python train.py --config-name=train_diffusion_lowdim_workspace +""" + +import sys + +# use line-buffering for both stdout and stderr +sys.stdout = open(sys.stdout.fileno(), mode="w", buffering=1) +sys.stderr = open(sys.stderr.fileno(), mode="w", buffering=1) + +import hydra, pdb +from omegaconf import OmegaConf +import pathlib, yaml +from diffusion_policy.workspace.base_workspace import BaseWorkspace + +import os + +current_file_path = os.path.abspath(__file__) +parent_directory = os.path.dirname(current_file_path) + + +def get_camera_config_from_data(zarr_path): + """ + Dynamically get camera config from zarr dataset. + Returns dict with 'h', 'w', and 'fovy' keys. 
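The zarr layout just written follows this pattern, shown here as a self-contained sketch (toy path and shape): time-major chunks of 100 steps, compressed with Blosc zstd level 3 and byte shuffle:

import numpy as np
import zarr

compressor = zarr.Blosc(cname="zstd", clevel=3, shuffle=1)
state = np.zeros((500, 8), dtype="float32")
root = zarr.group("/tmp/toy_dp.zarr")
data = root.create_group("data")
data.create_dataset("state", data=state, chunks=(100, 8),
                    dtype="float32", overwrite=True, compressor=compressor)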
+ """ + import zarr + try: + root = zarr.open(zarr_path, mode='r') + config = {} + + # Method 1: Try to get from meta attributes (stored during processing) + if 'meta' in root and hasattr(root['meta'], 'attrs'): + attrs = root['meta'].attrs + if 'head_camera_h' in attrs and 'head_camera_w' in attrs: + h = int(attrs['head_camera_h']) + w = int(attrs['head_camera_w']) + config['h'] = h + config['w'] = w + + # Get fovy if available + if 'head_camera_fovy' in attrs: + config['fovy'] = float(attrs['head_camera_fovy']) + print(f"[Camera Config] Detected from meta: {w}x{h}, fovy={config['fovy']:.2f}°") + else: + print(f"[Camera Config] Detected from meta: {w}x{h} (fovy not available)") + + if config: + return config + + # Method 2: Try to read from actual image data + if 'data' in root and 'head_camera' in root['data']: + head_cam = root['data']['head_camera'] + if len(head_cam) > 0: + first_img = head_cam[0] + # Handle CHW format (3, H, W) + if len(first_img.shape) == 3 and first_img.shape[0] == 3: + h, w = first_img.shape[1], first_img.shape[2] + config['h'] = int(h) + config['w'] = int(w) + print(f"[Camera Config] Detected from image data: {w}x{h}") + return config + except Exception as e: + print(f"[Camera Config] Warning: Could not read from zarr: {e}") + + return None + + +def get_camera_config(camera_type, zarr_path=None): + """ + Get camera config, either from data (preferred) or from config file (fallback). + + Args: + camera_type: Camera type name (for fallback) + zarr_path: Path to zarr dataset (optional, for dynamic detection) + """ + # Try to get from data first + if zarr_path and os.path.exists(zarr_path): + config = get_camera_config_from_data(zarr_path) + if config is not None: + return config + + # Fallback to config file + camera_config_path = os.path.join(parent_directory, "../../task_config/_camera_config.yml") + + if not os.path.isfile(camera_config_path): + raise FileNotFoundError(f"Camera config file not found: {camera_config_path}") + + with open(camera_config_path, "r", encoding="utf-8") as f: + args = yaml.load(f.read(), Loader=yaml.FullLoader) + + if camera_type not in args: + raise KeyError(f"Camera type '{camera_type}' not found in config. Available: {list(args.keys())}") + + print(f"[Camera Config] Using from config file: {camera_type}") + return args[camera_type] + + +# allows arbitrary python code execution in configs using the ${eval:''} resolver +OmegaConf.register_new_resolver("eval", eval, replace=True) + + +@hydra.main( + version_base=None, + config_path=str(pathlib.Path(__file__).parent.joinpath("diffusion_policy", "config")), +) +def main(cfg: OmegaConf): + # resolve immediately so all the ${now:} resolvers + # will use the same time. 
+ head_camera_type = cfg.head_camera_type + zarr_path = cfg.task.dataset.zarr_path if hasattr(cfg.task, 'dataset') and hasattr(cfg.task.dataset, 'zarr_path') else None + + # Get camera config dynamically from data, fallback to config file + head_camera_cfg = get_camera_config(head_camera_type, zarr_path=zarr_path) + + # Ensure we have h and w + if "h" not in head_camera_cfg or "w" not in head_camera_cfg: + raise ValueError(f"Camera config missing h or w: {head_camera_cfg}") + + cfg.task.image_shape = [3, head_camera_cfg["h"], head_camera_cfg["w"]] + cfg.task.shape_meta.obs.head_cam.shape = [ + 3, + head_camera_cfg["h"], + head_camera_cfg["w"], + ] + OmegaConf.resolve(cfg) + cfg.task.image_shape = [3, head_camera_cfg["h"], head_camera_cfg["w"]] + cfg.task.shape_meta.obs.head_cam.shape = [ + 3, + head_camera_cfg["h"], + head_camera_cfg["w"], + ] + + # Log fovy if available + if "fovy" in head_camera_cfg: + print(f"[Camera Config] Using fovy: {head_camera_cfg['fovy']:.2f}°") + + cls = hydra.utils.get_class(cfg._target_) + workspace: BaseWorkspace = cls(cfg) + print(cfg.task.dataset.zarr_path, cfg.task_name) + workspace.run() + + +if __name__ == "__main__": + main() diff --git a/r2sVLA/algos/DP/train.sh b/r2sVLA/algos/DP/train.sh new file mode 100644 index 0000000..783b02d --- /dev/null +++ b/r2sVLA/algos/DP/train.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +task_name=${1} +task_config=${2} +expert_data_num=${3} +seed=${4} +action_dim=${5} +gpu_id=${6} + +head_camera_type=D435 + +DEBUG=False +save_ckpt=True + +alg_name=robot_dp_$action_dim +config_name=${alg_name} +addition_info=train +exp_name=${task_name}-robot_dp-${addition_info} +run_dir="data/outputs/${exp_name}_seed${seed}" + +echo -e "\033[33mgpu id (to use): ${gpu_id}\033[0m" + + +if [ $DEBUG = True ]; then + wandb_mode=offline + # wandb_mode=online + echo -e "\033[33mDebug mode!\033[0m" + echo -e "\033[33mDebug mode!\033[0m" + echo -e "\033[33mDebug mode!\033[0m" +else + wandb_mode=online + echo -e "\033[33mTrain mode\033[0m" +fi + +export HYDRA_FULL_ERROR=1 +export CUDA_VISIBLE_DEVICES=${gpu_id} + +if [ ! 
-d "./data/${task_name}-${task_config}-${expert_data_num}.zarr" ]; then + bash process_data.sh ${task_name} ${task_config} ${expert_data_num} +fi + +python train.py --config-name=${config_name}.yaml \ + task.name=${task_name} \ + task.dataset.zarr_path="data/${task_name}-${task_config}-${expert_data_num}.zarr" \ + training.debug=$DEBUG \ + training.seed=${seed} \ + training.device="cuda:0" \ + exp_name=${exp_name} \ + logging.mode=${wandb_mode} \ + setting=${task_config} \ + expert_data_num=${expert_data_num} \ + head_camera_type=$head_camera_type + # checkpoint.save_ckpt=${save_ckpt} + # hydra.run.dir=${run_dir} \ \ No newline at end of file diff --git a/r2sVLA/algos/DP/train_demo_video.sh b/r2sVLA/algos/DP/train_demo_video.sh new file mode 100755 index 0000000..0a03696 --- /dev/null +++ b/r2sVLA/algos/DP/train_demo_video.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Training script for DP on demo_video dataset (Franka, 8D) + +task_name=${1:-demo_video} +task_config=${2:-config} +expert_data_num=${3:-15} +seed=${4:-1} +action_dim=${5:-8} # Franka: 8D (7 joints + 1 gripper) +gpu_id=${6:-0} + +head_camera_type=D435 + +DEBUG=False +save_ckpt=True + +alg_name=robot_dp_$action_dim +config_name=${alg_name} +addition_info=train +exp_name=${task_name}-robot_dp-${addition_info} +run_dir="data/outputs/${exp_name}_seed${seed}" + +echo -e "\033[33mgpu id (to use): ${gpu_id}\033[0m" +echo -e "\033[33maction_dim: ${action_dim} (Franka: 8D)\033[0m" + +if [ $DEBUG = True ]; then + wandb_mode=offline + echo -e "\033[33mDebug mode!\033[0m" +else + wandb_mode=online + echo -e "\033[33mTrain mode\033[0m" +fi + +export HYDRA_FULL_ERROR=1 +export CUDA_VISIBLE_DEVICES=${gpu_id} + +# Data directory for Franka format +DATA_DIR="/home/peiqiduan/OpenReal2Sim/h5py/demo_video" + +# Process data if zarr doesn't exist +if [ ! -d "./data/${task_name}-${task_config}-${expert_data_num}.zarr" ]; then + echo "Processing data from ${DATA_DIR}..." + bash process_data.sh ${task_name} ${task_config} ${expert_data_num} true ${DATA_DIR} +fi + +python train.py --config-name=${config_name}.yaml \ + task.name=${task_name} \ + task.dataset.zarr_path="data/${task_name}-${task_config}-${expert_data_num}.zarr" \ + training.debug=$DEBUG \ + training.seed=${seed} \ + training.device="cuda:0" \ + exp_name=${exp_name} \ + logging.mode=${wandb_mode} \ + setting=${task_config} \ + expert_data_num=${expert_data_num} \ + head_camera_type=$head_camera_type + diff --git a/r2sVLA/algos/act b/r2sVLA/algos/act new file mode 160000 index 0000000..742c753 --- /dev/null +++ b/r2sVLA/algos/act @@ -0,0 +1 @@ +Subproject commit 742c753c0d4a5d87076c8f69e5628c79a8cc5488 diff --git a/r2sVLA/envs/cfgs/eval_cfg.py b/r2sVLA/envs/cfgs/eval_cfg.py new file mode 100644 index 0000000..eb7ac21 --- /dev/null +++ b/r2sVLA/envs/cfgs/eval_cfg.py @@ -0,0 +1,53 @@ +""" +Evaluation configuration for policy evaluation. 
+""" +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional, Dict, Any, List +import yaml + + +@dataclass +class EvaluationConfig: + """Configuration for policy evaluation.""" + max_steps: int = 1000 # Maximum number of steps per episode + success_threshold: Optional[float] = None # Optional success threshold + record_video: bool = False # Whether to record video + video_save_dir: Optional[Path] = None # Directory to save videos + use_verified_randomization: bool = False # Whether to use verified randomization + success_keys: List[str] = field(default_factory=lambda: ["grasping", "strict", "metric"]) + pose_dist_threshold: float = 0.05 # Threshold for pose distance + angle_dist_threshold: float = 0.1 # Threshold for angle distance + lift_height_threshold: float = 0.02 # Threshold for lift height + num_trials: int = 10 # Number of environments to evaluate + physics_freq: int = 100 # Frequency to update the physics + decimation: int = 1 + save_interval: int = 1 + + + + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> 'EvaluationConfig': + """Create EvaluationConfig from dictionary.""" + return cls(**{k: v for k, v in config_dict.items() if k in cls.__dataclass_fields__}) + + @classmethod + def from_yaml(cls, yaml_path: Path, task_name: Optional[str] = None) -> 'EvaluationConfig': + """Load EvaluationConfig from YAML file.""" + if not yaml_path.exists(): + print(f"Warning: Evaluation config file not found at {yaml_path}, using defaults") + return cls() + + with open(yaml_path, 'r') as f: + data = yaml.safe_load(f) + + if task_name and task_name in data: + return cls.from_dict(data[task_name]) + elif 'default' in data: + return cls.from_dict(data['default']) + else: + return cls.from_dict(data) + diff --git a/r2sVLA/envs/cfgs/policy_interface.py b/r2sVLA/envs/cfgs/policy_interface.py new file mode 100644 index 0000000..43fb0e2 --- /dev/null +++ b/r2sVLA/envs/cfgs/policy_interface.py @@ -0,0 +1,187 @@ +""" +Policy interface definitions for Isaac Lab simulation. +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Dict, Any, Literal, Union, Optional, List +import torch +import torch.nn.functional as F + + +class Action: + """ + Action class supporting both joint position (qpos) and end-effector (ee) control. + + Attributes: + action_type: 'qpos' for joint position control, 'ee' for end-effector control + qpos: Joint positions [B, n_joints] (for action_type='qpos') + ee_pose: End-effector pose [B, 7] (pos [3] + quat [4] in wxyz format) (for action_type='ee') + gripper_open: Gripper state (True = open, False = closed) + """ + def __init__( + self, + action_type: Literal['qpos', 'ee_direct', 'ee_cam', 'ee_l'], + qpos: Optional[torch.Tensor] = None, + ee_pose: Optional[torch.Tensor] = None, + gripper_open: Union[bool, torch.Tensor] = True, + ): + self.action_type = action_type + + if action_type == 'qpos': + if qpos is None: + raise ValueError("qpos must be provided when action_type='qpos'") + self.qpos = qpos if torch.is_tensor(qpos) else torch.tensor(qpos, dtype=torch.float32) + self.ee_pose = None + elif action_type == 'ee_direct' or action_type == 'ee_l' or action_type == 'ee_cam': + if ee_pose is None: + raise ValueError("ee_pose must be provided when action_type='ee'") + self.ee_pose = ee_pose if torch.is_tensor(ee_pose) else torch.tensor(ee_pose, dtype=torch.float32) + self.qpos = None + else: + raise ValueError(f"Invalid action_type: {action_type}. 
Must be 'qpos' or 'ee'") + + # Handle gripper_open: can be bool or tensor + if isinstance(gripper_open, bool): + self.gripper_open = gripper_open + elif torch.is_tensor(gripper_open): + self.gripper_open = gripper_open + else: + self.gripper_open = torch.tensor(gripper_open, dtype=torch.bool) + + def __repr__(self): + if self.action_type == 'qpos': + return f"Action(type='qpos', qpos_shape={self.qpos.shape}, gripper_open={self.gripper_open})" + else: + return f"Action(type='ee', ee_pose_shape={self.ee_pose.shape}, gripper_open={self.gripper_open})" + + +from abc import ABC, abstractmethod +from typing import List + +class BasePolicy(ABC): + """ + Abstract base class for policies. + All policies should inherit from this class and implement the required methods. + + Required attributes: + - observation_keys: List[str], specifies what observation fields the policy needs + - action_type: str, 'qpos' or 'ee' + - image_resolution: List[int], e.g., [128, 128] + """ + + # Should be set by the derived class (not enforced at construction) + observation_keys: List[str] + action_type: str + image_resolution: List[int] + + @abstractmethod + def __init__(self, observation_keys: List[str], action_type: str, image_resolution: List[int]): + self.observation_keys = observation_keys + self.action_type = action_type + self.image_resolution = image_resolution + + @abstractmethod + def get_action(self, observation: Dict[str, Any]) -> Union[Action, List[Action]]: + """ + Get action from policy given observation. + + Args: + observation: Dictionary containing observation data (images, states, etc.) + + Returns: + Either a single Action object or a list of Action objects (action sequence/chunk). + If returning a list, each Action will be executed sequentially in the environment. + This allows policies to return action chunks (like ACT, RDT, etc.) for better temporal consistency. + """ + + + @abstractmethod + def reset(self) -> None: + """ + Reset policy state (e.g., clear observation history, reset temporal aggregation). + Called at the beginning of each episode. + """ + pass + + def update_obs(self, observation: Dict[str, Any]) -> None: + """ + Update policy's internal observation buffer (optional). + Useful for policies that maintain observation history. + + Args: + observation: New observation to add to history + """ + pass + + def preprocess_observation(self, observation: Dict[str, Any]) -> Dict[str, Any]: + """ + Preprocess observation before passing to policy. + Default implementation resizes images to policy's required resolution. + Only resizes keys that are in observation_keys and are image tensors. 
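A round-trip sketch for the pose_to_mat / mat_to_pose helpers above: transforms3d uses (w, x, y, z) quaternion order, and a pose survives both directions:

import numpy as np
import transforms3d

pos = np.array([0.1, 0.2, 0.3], dtype=np.float32)
quat = np.array([1.0, 0.0, 0.0, 0.0])  # identity rotation, wxyz order
m = np.eye(4, dtype=np.float32)
m[:3, :3] = transforms3d.quaternions.quat2mat(quat)
m[:3, 3] = pos
assert np.allclose(m[:3, 3], pos)
assert np.allclose(transforms3d.quaternions.mat2quat(m[:3, :3]), quat)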
+ + Args: + observation: Raw observation dictionary + + Returns: + Preprocessed observation dictionary with images resized to image_resolution + """ + processed_obs = {} + + # Image keys that typically need resizing (can be extended) + image_keys = ['rgb', 'composed_rgb', 'depth', 'mask', 'fg_mask', 'obj_mask'] + + for key, value in observation.items(): + # Only process keys that are in observation_keys and are image-like + if key in self.observation_keys and key in image_keys and torch.is_tensor(value): + # Check if this is an image tensor (4D: [B, H, W, C] or [B, C, H, W]) + if value.ndim == 4: + # Determine format and resize + # Case 1: [B, H, W, C] format (channels last) + if value.shape[-1] in [1, 3, 4]: + B, H, W, C = value.shape + # Convert to [B, C, H, W] for F.interpolate + value = value.permute(0, 3, 1, 2) + # Convert to float if needed (F.interpolate requires float) + if value.dtype != torch.float32: + value = value.float() + # Normalize if values are in [0, 255] range + if value.max() > 1.0: + value = value / 255.0 + # Resize + mode = 'bilinear' if C in [3, 4] else 'nearest' + value = F.interpolate( + value, + size=self.image_resolution, + mode=mode, + align_corners=False if mode == 'bilinear' else None + ) + # Convert back to [B, H, W, C] + value = value.permute(0, 2, 3, 1) + + # Case 2: [B, C, H, W] format (channels first) + elif value.shape[1] in [1, 3, 4]: + # Convert to float if needed (F.interpolate requires float) + if value.dtype != torch.float32: + value = value.float() + # Normalize if values are in [0, 255] range + if value.max() > 1.0: + value = value / 255.0 + # Resize directly + mode = 'bilinear' if value.shape[1] in [3, 4] else 'nearest' + value = F.interpolate( + value, + size=self.image_resolution, + mode=mode, + align_corners=False if mode == 'bilinear' else None + ) + + processed_obs[key] = value + else: + # Not a 4D tensor, keep as is + processed_obs[key] = value + else: + # Not in observation_keys or not an image key, keep as is + processed_obs[key] = value + + return processed_obs diff --git a/r2sVLA/envs/cfgs/randomizer.py b/r2sVLA/envs/cfgs/randomizer.py new file mode 100755 index 0000000..67e48be --- /dev/null +++ b/r2sVLA/envs/cfgs/randomizer.py @@ -0,0 +1,366 @@ +import numpy as np +import random +from scipy.spatial.transform import Rotation as R +from scipy.spatial.transform import Slerp +from .task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg + +import torch +import transforms3d + +def pose_to_mat(pos, quat): + if torch.is_tensor(pos): pos = pos.cpu().numpy() + if torch.is_tensor(quat): quat = quat.cpu().numpy() + m = np.eye(4, dtype=np.float32) + m[:3, :3] = transforms3d.quaternions.quat2mat(quat) + m[:3, 3] = pos + return m + +def mat_to_pose(mat): + pos = mat[:3, 3] + quat = transforms3d.quaternions.mat2quat(mat[:3, :3]) + return pos, quat + + + +class Randomizer(TaskCfg): + def __init__(self, task_cfg: TaskCfg): + self.task_cfg = task_cfg + + + def generate_randomized_scene_cfg(self, grid_dist: float, grid_num: int, angle_random_range: float, angle_random_num: int, traj_randomize_num:int, scene_randomize_num: int, robot_pose_randomize_range, robot_pose_randomize_angle, robot_pose_randomize_num, fix_end_pose: bool = False): + # Step 1: Generate candidate transforms + + candidate_transforms = [] + for i in range(-grid_num, grid_num): + for j in range(-grid_num, grid_num): + random_angles = np.random.uniform(-angle_random_range, angle_random_range, angle_random_num) + for angle in random_angles: + orig = np.eye(4) + 
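# Each candidate transform is a planar perturbation embedded in SE(3):
+                    # a rotation by `angle` about z plus an (i, j) grid offset in x/y.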
+                    orig[0, 3] = i * grid_dist
+                    orig[1, 3] = j * grid_dist
+                    orig[0, 0] = np.cos(angle)
+                    orig[0, 1] = -np.sin(angle)
+                    orig[1, 0] = np.sin(angle)
+                    orig[1, 1] = np.cos(angle)
+                    candidate_transforms.append(orig)
+
+        # Step 2: Generate traj_randomize_num combinations of (start_pose, end_pose)
+        traj_pose_pairs = []
+        for _ in range(traj_randomize_num):
+            start_pose = random.choice(candidate_transforms)
+            if fix_end_pose:
+                end_pose = np.eye(4)
+            else:
+                if self.task_cfg.task_type == TaskType.SIMPLE_PICK:
+                    end_pose = start_pose.copy()  # For SIMPLE_PICK, end same as start
+                else:
+                    end_pose = random.choice(candidate_transforms)
+            traj_pose_pairs.append((start_pose, end_pose))
+
+        # Step 3: Generate scene_randomize_num combinations for other objects
+        other_object_ids = [obj.object_id for obj in self.task_cfg.objects
+                            if obj.object_id != self.task_cfg.manipulated_oid and obj.object_id not in self.task_cfg.start_related and obj.object_id not in self.task_cfg.end_related]
+
+        scene_poses_combinations = []
+        for _ in range(scene_randomize_num):
+            # Create list of (oid, pose) pairs
+            other_object_poses = [(oid, random.choice(candidate_transforms))
+                                  for oid in other_object_ids]
+            scene_poses_combinations.append(other_object_poses)
+
+        # Step 4: Generate robot_pose_randomize_num random robot poses
+        # robot_pose format: [x, y, z, w, x, y, z] (position + quaternion wxyz)
+        ## FIXME: hacking
+        ref_traj = self.task_cfg.reference_trajectory[-1]
+        assert ref_traj is not None, "Reference trajectory not found"
+        ref_robot_pose = np.array(ref_traj.robot_pose)
+        robot_pose_mat = pose_to_mat(ref_robot_pose[:3], ref_robot_pose[3:7])
+
+        robot_poses = []
+        for _ in range(robot_pose_randomize_num):
+            # Random translation within range
+            random_trans = np.random.uniform(
+                -robot_pose_randomize_range,
+                robot_pose_randomize_range,
+                3
+            )
+            random_rot = np.random.uniform(
+                -robot_pose_randomize_angle,
+                robot_pose_randomize_angle,
+            )
+
+            rotate_matrix = np.eye(4)
+            rotate_matrix[:3, :3] = R.from_euler('z', random_rot).as_matrix()
+            new_robot_pose = robot_pose_mat @ rotate_matrix
+            new_robot_pose[:3, 3] += random_trans
+            # Combine position and quaternion [x, y, z, w, x, y, z]
+            pos, quat = mat_to_pose(new_robot_pose)
+            robot_pose_7d = np.concatenate([pos, quat])
+            robot_poses.append(robot_pose_7d.tolist())
+
+        # Step 5: Generate trajectories for all combinations
+        generated_trajectories = []
+        ref_traj = self.task_cfg.reference_trajectory[-1]
+        for start_pose, end_pose in traj_pose_pairs:
+            for other_object_poses in scene_poses_combinations:
+                for robot_pose in robot_poses:
+                    # Generate trajectory for manipulated object
+                    if self.task_cfg.task_type == TaskType.SIMPLE_PICK:
+                        new_traj_mats = []
+                        for traj_pose_7d in ref_traj.object_trajectory:
+                            pose_mat = pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7]))
+                            new_traj_mats.append(start_pose @ pose_mat)
+                        # Convert back to 7D format
+                        new_traj_7d = []
+                        for mat in new_traj_mats:
+                            pos, quat = mat_to_pose(mat)
+                            new_traj_7d.append(np.concatenate([pos, quat]).tolist())
+
+                        # Transform pregrasp and grasp poses
+                        if ref_traj.pregrasp_pose:
+                            pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7]))
+                            pos, quat = mat_to_pose(pregrasp_mat)
+                            pregrasp_pose = np.concatenate([pos, quat]).tolist()
+                        else:
+                            pregrasp_pose = None
+
+                        if ref_traj.grasp_pose:
+                            grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7]))
+                            
pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + else: + # Convert reference trajectory to mat format + ref_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + ref_traj_mats.append(pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7]))) + ref_traj_mats = np.array(ref_traj_mats) + + new_traj_mats = self.compute_new_traj(start_pose, end_pose, ref_traj_mats) + new_traj_mats = self.lift_traj(ref_traj_mats, new_traj_mats) + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + #new_traj_7d = lift_traj(new_traj_7d) + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + + # Apply scene randomization to other objects (if needed, update object_poses here) + # other_object_poses is now a list of (oid, pose) pairs + + # Create success metric + + ref_end_pose_mat = pose_to_mat( + np.array(ref_traj.success_metric.end_pose[:3]), + np.array(ref_traj.success_metric.end_pose[3:7]) + ) + end_pose_metric_mat = end_pose @ ref_end_pose_mat + pos, quat = mat_to_pose(end_pose_metric_mat) + end_pose_metric_7d = np.concatenate([pos, quat]).tolist() + + success_metric_type = ref_traj.success_metric.success_metric_type + if success_metric_type == SuccessMetricType.SIMPLE_LIFT: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.SIMPLE_LIFT, + lift_height=ref_traj.success_metric.lift_height, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_POINT: + # Convert end_pose from 7D to mat, transform, then back to 7D + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_POINT, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_PLANE: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_PLANE, + ground_value=ref_traj.success_metric.ground_value, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + + # Store object poses in 7D format + object_poses = {} + for oid, pose_mat in other_object_poses: + pos, quat = mat_to_pose(pose_mat) + object_poses[oid] = np.concatenate([pos, quat]).tolist() + + # Add start and end related objects + start_pos, start_quat = mat_to_pose(start_pose) + start_pose_7d = np.concatenate([start_pos, start_quat]).tolist() + for oid in self.task_cfg.start_related: + object_poses[oid] = start_pose_7d + + end_pos, end_quat = mat_to_pose(end_pose) + end_pose_7d = np.concatenate([end_pos, end_quat]).tolist() + for oid in self.task_cfg.end_related: + object_poses[oid] = end_pose_7d + + object_poses[self.task_cfg.manipulated_oid] = start_pose_7d + + robot_type = ref_traj.robot_type + generated_trajectories.append(TrajectoryCfg( + robot_pose=robot_pose, + 
object_poses=object_poses, + object_trajectory=new_traj_7d, + final_gripper_close=ref_traj.final_gripper_close, + pregrasp_pose=pregrasp_pose, + grasp_pose=grasp_pose, + success_metric=success_metric, + robot_type=robot_type + )) + + # Total trajectories = traj_randomize_num * scene_randomize_num * robot_pose_randomize_num + assert len(generated_trajectories) == traj_randomize_num * scene_randomize_num * robot_pose_randomize_num, \ + f"Expected {traj_randomize_num * scene_randomize_num * robot_pose_randomize_num} trajectories, got {len(generated_trajectories)}" + + random.shuffle(generated_trajectories) + return generated_trajectories + + @staticmethod + def compute_new_traj(start_trans: np.ndarray, end_trans: np.ndarray, reference_traj: np.ndarray, + interp_mode: str = 'slerp') -> np.ndarray: + """ + Compute interpolated trajectory using Real2Render2Real's relative shape preservation method. + + This maintains the relative motion pattern of the reference trajectory by normalizing + the progress along the reference and mapping it to the new start and end positions. + + Args: + start_trans: 4x4 transformation matrix from original start to new start + end_trans: 4x4 transformation matrix from original end to new end + reference_traj: Reference trajectory in SE(3), shape (N, 4, 4) or (N, 16) + interp_mode: 'linear' or 'slerp' for rotation interpolation (default: 'slerp') + + Returns: + New interpolated trajectory with same shape as reference_traj + """ + # Ensure reference_traj is in (N, 4, 4) format + if reference_traj.ndim == 2 and reference_traj.shape[1] == 16: + reference_traj = reference_traj.reshape(-1, 4, 4) + elif reference_traj.ndim == 3 and reference_traj.shape[1] == 4 and reference_traj.shape[2] == 4: + reference_traj = reference_traj.copy() + else: + raise ValueError(f"Invalid reference_traj shape: {reference_traj.shape}, expected (N, 4, 4) or (N, 16)") + + N = len(reference_traj) + if N == 0: + return reference_traj.copy() + + # Get start and end poses from reference trajectory + ref_start = reference_traj[0].copy() + ref_end = reference_traj[-1].copy() + + # Compute new start and end poses + new_start_mat = start_trans @ ref_start + new_end_mat = end_trans @ ref_end + + # Convert to 7D format using mat_to_pose + ref_traj_7d = [] + for pose_mat in reference_traj: + pos, quat = mat_to_pose(pose_mat) + ref_traj_7d.append(np.concatenate([pos, quat])) + ref_traj_7d = np.array(ref_traj_7d) + + pos_start, quat_start = mat_to_pose(new_start_mat) + pos_end, quat_end = mat_to_pose(new_end_mat) + + # Initialize new trajectory + new_traj_7d = np.zeros_like(ref_traj_7d) + + # Normalize time steps + t = np.linspace(0, 1, N) + + # Split into position and rotation components + pos_orig = ref_traj_7d[:, :3] + quat_orig = ref_traj_7d[:, 3:7] # wxyz format (from transforms3d) + + ref_start_pos, ref_start_quat = mat_to_pose(ref_start) + ref_end_pos, ref_end_quat = mat_to_pose(ref_end) + + # Interpolate positions: preserve relative shape from reference trajectory + for axis in range(3): + ref_range = ref_end_pos[axis] - ref_start_pos[axis] + if np.abs(ref_range) > 1e-10: + # Normalize progress along reference trajectory + normalized_progress = (pos_orig[:, axis] - ref_start_pos[axis]) / ref_range + # Map to new range + new_traj_7d[:, axis] = pos_start[axis] + (pos_end[axis] - pos_start[axis]) * normalized_progress + else: + # If no change in reference, use direct transformation + new_traj_7d[:, axis] = pos_orig[:, axis] + (pos_start[axis] - ref_start_pos[axis]) + + # Interpolate rotations using 
SLERP + if interp_mode == 'slerp': + # Use scipy Slerp for spherical linear interpolation + # Convert wxyz to xyzw for scipy + quat_start_xyzw = np.array([quat_start[1], quat_start[2], quat_start[3], quat_start[0]]) + quat_end_xyzw = np.array([quat_end[1], quat_end[2], quat_end[3], quat_end[0]]) + + # Create Slerp interpolator + key_rots = R.from_quat([quat_start_xyzw, quat_end_xyzw]) + key_times = [0, 1] + slerp = Slerp(key_times, key_rots) + + # Interpolate for all time steps + interp_rots = slerp(t) + quat_interp_xyzw = interp_rots.as_quat() + + # Convert back to wxyz format + new_traj_7d[:, 3] = quat_interp_xyzw[:, 3] # w + new_traj_7d[:, 4] = quat_interp_xyzw[:, 0] # x + new_traj_7d[:, 5] = quat_interp_xyzw[:, 1] # y + new_traj_7d[:, 6] = quat_interp_xyzw[:, 2] # z + else: # linear + # Linear interpolation (needs normalization) + for i in range(N): + new_traj_7d[i, 3:7] = (1 - t[i]) * quat_start + t[i] * quat_end + new_traj_7d[i, 3:7] /= np.linalg.norm(new_traj_7d[i, 3:7]) + + # Convert back to SE(3) matrices using pose_to_mat + new_traj = [] + for pose_7d in new_traj_7d: + pose_mat = pose_to_mat(pose_7d[:3], pose_7d[3:7]) + new_traj.append(pose_mat) + new_traj = np.array(new_traj) + + return new_traj + + + def lift_traj(self, old_traj, new_traj): + T = len(old_traj) + renewed_traj = [] + for t in range(T): + old_pose = old_traj[t] + new_pose = new_traj[t] + old_pose_z = old_pose[2,3] + new_pose_z = new_pose[2,3] + if old_pose_z > new_pose_z: + new_pose[2,3] = old_pose_z + renewed_traj.append(new_pose) + return renewed_traj + + def select_randomized_cfg(self, train_set_size: int): + return random.choice(self.task_cfg.generated_trajectories[train_set_size:]) + + + + diff --git a/r2sVLA/envs/cfgs/randomizer_cfg.py b/r2sVLA/envs/cfgs/randomizer_cfg.py new file mode 100644 index 0000000..3bcb88c --- /dev/null +++ b/r2sVLA/envs/cfgs/randomizer_cfg.py @@ -0,0 +1,87 @@ +""" +Randomizer configuration for scene and trajectory randomization. +""" +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Dict, Any +import yaml + + +@dataclass +class RandomizerConfig: + """ + Configuration for scene and trajectory randomization. + + This config controls how randomized scenes and trajectories are generated + for data collection and training. 
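+
+    Example YAML layout consumed by ``from_yaml`` below (a sketch; the task name
+    and values are hypothetical, and the ``default`` block is used when no
+    matching task block is found):
+
+        default:
+          grid_dist: 0.05
+          grid_num: 2
+        my_pick_task:
+          traj_randomize_num: 10
+          fix_end_pose: true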
+    """
+    # Grid-based position randomization
+    grid_dist: float = 0.05  # Distance between grid points (meters)
+    grid_num: int = 2  # Grid half-width; the generator samples offsets in [-grid_num, grid_num) per axis
+
+    # Rotation randomization
+    angle_random_range: float = 0.5  # Random angle range in radians
+    angle_random_num: int = 3  # Number of random angles per grid point
+
+    # Trajectory randomization
+    traj_randomize_num: int = 5  # Number of different trajectory start/end pose combinations
+
+    # Scene randomization (other objects)
+    scene_randomize_num: int = 3  # Number of different scene configurations for other objects
+
+    # Robot pose randomization
+    robot_pose_randomize_range: float = 0.1  # Random translation range for robot base (meters)
+    robot_pose_randomize_angle: float = 0.2  # Random rotation angle for robot base (radians)
+    robot_pose_randomize_num: int = 2  # Number of different robot poses
+
+    # Trajectory options
+    fix_end_pose: bool = False  # Whether to fix the end pose (useful for simple pick tasks)
+
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any]) -> 'RandomizerConfig':
+        """Create RandomizerConfig from dictionary."""
+        return cls(**{k: v for k, v in config_dict.items() if k in cls.__dataclass_fields__})
+
+    @classmethod
+    def from_yaml(cls, yaml_path: Path, task_name: Optional[str] = None) -> 'RandomizerConfig':
+        """Load RandomizerConfig from YAML file."""
+        if not yaml_path.exists():
+            print(f"Warning: Randomizer config file not found at {yaml_path}, using defaults")
+            return cls()
+
+        with open(yaml_path, 'r') as f:
+            data = yaml.safe_load(f)
+
+        if task_name and task_name in data:
+            return cls.from_dict(data[task_name])
+        elif 'default' in data:
+            return cls.from_dict(data['default'])
+        else:
+            return cls.from_dict(data)
+
+    def get_total_trajectories(self) -> int:
+        """
+        Calculate total number of generated trajectories.
+
+        Returns:
+            Total number of trajectories = traj_randomize_num * scene_randomize_num * robot_pose_randomize_num
+        """
+        return self.traj_randomize_num * self.scene_randomize_num * self.robot_pose_randomize_num
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert config to dictionary."""
+        return {
+            'grid_dist': self.grid_dist,
+            'grid_num': self.grid_num,
+            'angle_random_range': self.angle_random_range,
+            'angle_random_num': self.angle_random_num,
+            'traj_randomize_num': self.traj_randomize_num,
+            'scene_randomize_num': self.scene_randomize_num,
+            'robot_pose_randomize_range': self.robot_pose_randomize_range,
+            'robot_pose_randomize_angle': self.robot_pose_randomize_angle,
+            'robot_pose_randomize_num': self.robot_pose_randomize_num,
+            'fix_end_pose': self.fix_end_pose,
+        }
+
diff --git a/r2sVLA/envs/cfgs/task_cfg.py b/r2sVLA/envs/cfgs/task_cfg.py
new file mode 100644
index 0000000..be0a04b
--- /dev/null
+++ b/r2sVLA/envs/cfgs/task_cfg.py
@@ -0,0 +1,336 @@
+### construct the task config from scene dict.
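+# Expected scene_dict layout (a sketch inferred from construct_task_config below;
+# keys are read verbatim, values are illustrative):
+#   {
+#     "task_desc": [...],
+#     "task_type": "simple_pick" | "simple_pick_place" | "targetted_pick_place",
+#     "background": {"registered": <glb path>, "usd": <usd path>},
+#     "background_image": <jpg path>,
+#     "groundplane_in_cam": {"point": [x, y, z]},
+#     "camera": {"width", "height", "fx", "fy", "cx", "cy",
+#                "camera_opencv_to_world", "camera_position", "camera_heading_wxyz"},
+#     "objects": {oid: {"name", "optimized", "usd"}},
+#     "manipulated_oid": ..., "start_related": [...], "end_related": [...],
+#   }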
+ +import json +import numpy as np +from enum import Enum, auto +import shutil +from pathlib import Path +import os +import sys +from typing import List, Dict, Optional +from dataclasses import dataclass, field +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) + +class SuccessMetricType(Enum): + SIMPLE_LIFT = auto() + TARGET_POINT = auto() + TARGET_PLANE = auto() + +class TaskType(Enum): + SIMPLE_PICK = auto() + SIMPLE_PICK_PLACE = auto() + TARGETTED_PICK_PLACE = auto() + +class RobotType(Enum): + FRANKA = auto() + UR5 = auto() + + +@dataclass +class BackgroundCfg: + background_rgb_path: str + background_mesh_path: str + background_usd_path: str + background_point: List[float] + + + +@dataclass +class ObjectCfg: + object_id: int + object_name: str + mesh_path: str + usd_path: str + + +@dataclass +class SuccessMetric: + success_metric_type: SuccessMetricType + final_gripper_close:bool + lift_height: Optional[float] = None + ground_value: Optional[float] = None + end_pose: Optional[List[float]] = None + + +@dataclass +class TrajectoryCfg: + robot_pose: List[float] # quat wxyz + object_poses: Dict[int, List[float]] # quat + object_trajectory: List[List[float]] # quat + final_gripper_close: bool + success_metric: SuccessMetric + pregrasp_pose: Optional[List[float]] = None ### This is eef in world frame. quat + grasp_pose: Optional[List[float]] = None ### This is eef in world frame. quat + robot_type: Optional[RobotType] = None + init_manip_object_com: Optional[List[float]] = None # com in world frame + + + +@dataclass +class CameraInfo: + width: float + height: float + fx: float + fy: float + cx: float + cy: float + camera_opencv_to_world: List[List[float]] + camera_position: List[float] + camera_heading_wxyz: List[float] + +@dataclass +class TaskCfg: + task_key: str + task_id: int + task_desc: List[str] + task_type: TaskType + background_cfg: BackgroundCfg + camera_info: CameraInfo + manipulated_oid: int + start_related: List[int] + end_related: List[int] + objects: List[ObjectCfg] + reference_trajectory: Optional[List[TrajectoryCfg]] = None + generated_trajectories: Optional[List[TrajectoryCfg]] = None + + + + + + + + + + +def get_next_id(folder: Path) -> int: + if not folder.exists(): + os.makedirs(folder, exist_ok=True) + return 0 + subfolders = [f for f in folder.iterdir() if f.is_dir()] + task_num = len(subfolders) + return task_num + +def get_task_cfg(key: str, base_folder: Path) -> TaskCfg: + json_path = base_folder / "task.json" + return load_task_cfg(json_path) + +def construct_task_config(key, scene_dict: dict, base_folder: Path): + task_key = key + task_id = get_next_id(base_folder) + task_desc = scene_dict["task_desc"] + base_folder = base_folder / key + if base_folder.exists(): + shutil.rmtree(base_folder) + base_folder.mkdir(parents=True, exist_ok=True) # Create directory before copying files + background_mesh_path = scene_dict["background"]["registered"] + background_usd_path = scene_dict["background"]["usd"] + shutil.copy(background_mesh_path, base_folder / "background.glb") + shutil.copy(background_usd_path, base_folder / "background.usd") + background_mesh_path = base_folder / "background.glb" + background_usd_path = base_folder / "background.usd" + background_rgb_path = scene_dict["background_image"] + shutil.copy(background_rgb_path, base_folder / "bg_rgb.jpg") + background_rgb_path = base_folder / "bg_rgb.jpg" + background_point = scene_dict["groundplane_in_cam"]["point"] + background_cfg = BackgroundCfg(str(background_rgb_path), 
str(background_mesh_path), str(background_usd_path), background_point)
+    width = scene_dict["camera"]["width"]
+    height = scene_dict["camera"]["height"]
+    fx = scene_dict["camera"]["fx"]
+    fy = scene_dict["camera"]["fy"]
+    cx = scene_dict["camera"]["cx"]
+    cy = scene_dict["camera"]["cy"]
+    camera_opencv_to_world = scene_dict["camera"]["camera_opencv_to_world"]
+    camera_position = scene_dict["camera"]["camera_position"]
+    camera_heading_wxyz = scene_dict["camera"]["camera_heading_wxyz"]
+    camera_info = CameraInfo(width, height, fx, fy, cx, cy, camera_opencv_to_world, camera_position, camera_heading_wxyz)
+
+    objects = []
+    for oid, obj in scene_dict["objects"].items():
+        object_id = oid
+        object_name = obj["name"]
+        mesh_path = obj["optimized"]
+        shutil.copy(mesh_path, base_folder / f"object_{object_id}.glb")
+        mesh_path = base_folder / f"object_{object_id}.glb"
+        usd_path = obj['usd']
+
+        shutil.copy(usd_path, base_folder / Path(usd_path).name)
+
+        cfg_path = Path(usd_path).parent / "config.yaml"
+        asset_hash_path = Path(usd_path).parent / ".asset_hash"
+        usd_path = base_folder / Path(usd_path).name
+        shutil.copy(cfg_path, base_folder / "config.yaml")
+        shutil.copy(asset_hash_path, base_folder / ".asset_hash")
+        object_cfg = ObjectCfg(object_id, object_name, str(mesh_path), str(usd_path))
+        objects.append(object_cfg)
+
+    manipulated_oid = scene_dict["manipulated_oid"]
+    start_related = scene_dict["start_related"]
+    end_related = scene_dict["end_related"]
+
+    if scene_dict["task_type"] == "targetted_pick_place":
+        task_type = TaskType.TARGETTED_PICK_PLACE
+    elif scene_dict["task_type"] == "simple_pick_place":
+        task_type = TaskType.SIMPLE_PICK_PLACE
+    elif scene_dict["task_type"] == "simple_pick":
+        task_type = TaskType.SIMPLE_PICK
+    else:
+        raise ValueError(f"Invalid task type: {scene_dict['task_type']}")
+    task_config = TaskCfg(task_key, task_id, task_desc, task_type, background_cfg, camera_info, manipulated_oid, start_related, end_related, objects)
+    json_path = base_folder / "task.json"
+    with open(json_path, "w") as f:
+        json.dump(serialize_task_cfg(task_config), f)
+    return task_config, base_folder
+
+
+def serialize_task_cfg(task_cfg):
+    """
+    Serialize TaskCfg and all nested fields (including numpy arrays) into pure Python dict/list/primitive types,
+    so it can be safely saved as JSON.
+    """
+
+    def serialize(obj):
+        # Handle None
+        if obj is None:
+            return None
+        # Handle basic types
+        elif isinstance(obj, (int, float, str, bool)):
+            return obj
+        # Handle numpy array
+        elif isinstance(obj, np.ndarray):
+            return obj.tolist()
+        # Handle enum (serialized by name)
+        elif isinstance(obj, Enum):
+            return obj.name
+        # Handle dict
+        elif isinstance(obj, dict):
+            return {serialize(k): serialize(v) for k, v in obj.items()}
+        # Handle list/tuple
+        elif isinstance(obj, (list, tuple)):
+            return [serialize(i) for i in obj]
+        # Handle dataclass/object with __dict__
+        elif hasattr(obj, '__dict__'):
+            data = {}
+            for key, value in obj.__dict__.items():
+                data[key] = serialize(value)
+            return data
+        # Handle class with __slots__
+        elif hasattr(obj, '__slots__'):
+            data = {}
+            for slot in obj.__slots__:
+                data[slot] = serialize(getattr(obj, slot))
+            return data
+        # Fallback (e.g. Path objects)
+        elif hasattr(obj, '__str__'):
+            return str(obj)
+        else:
+            raise TypeError(f"Cannot serialize object of type {type(obj)}: {repr(obj)}")
+
+    return serialize(task_cfg)
+
+
+def add_reference_trajectory(task_cfg: TaskCfg, reference_trajectory: List[TrajectoryCfg], base_folder: Path):
+    task_cfg.reference_trajectory = reference_trajectory
+    json_path = base_folder / "task.json"
+    with open(json_path, "w") as f:
+        json.dump(serialize_task_cfg(task_cfg), f)
+    return task_cfg
+
+def add_generated_trajectories(task_cfg: TaskCfg, generated_trajectories: List[TrajectoryCfg], base_folder: Path):
+    task_cfg.generated_trajectories = generated_trajectories
+    json_path = base_folder / "task.json"
+    with open(json_path, "w") as f:
+        json.dump(serialize_task_cfg(task_cfg), f)
+    return task_cfg
+
+
+def load_task_cfg(json_path: Path) -> TaskCfg:
+    """
+    Load a TaskCfg from the given JSON file path and construct a TaskCfg instance.
+    """
+    with open(json_path, "r") as f:
+        cfg_dict = json.load(f)
+
+    # Handle all fields and reconstruct proper datatypes
+    # Helper to reconstruct enums
+    def parse_enum(enum_cls, val):
+        if isinstance(val, enum_cls):
+            return val
+        elif isinstance(val, str):
+            return enum_cls[val]
+        else:
+            raise ValueError(f"Unknown value {val} for enum {enum_cls}")
+
+    # Parse SuccessMetric(s)
+    def parse_success_metric(metric_dict):
+        return SuccessMetric(
+            success_metric_type=parse_enum(SuccessMetricType, metric_dict["success_metric_type"]),
+            final_gripper_close=metric_dict["final_gripper_close"],
+            lift_height=metric_dict.get("lift_height", None),
+            ground_value=metric_dict.get("ground_value", None),
+            end_pose=metric_dict.get("end_pose", None)
+        )
+
+    # Parse TrajectoryCfg(s)
+    def parse_traj_cfg(traj_dict):
+        return TrajectoryCfg(
+            robot_pose=np.array(traj_dict["robot_pose"], dtype=np.float32).tolist(),
+            object_poses={oid: np.array(pose, dtype=np.float32).tolist() for oid, pose in traj_dict["object_poses"].items()},
+            object_trajectory=[np.array(m, dtype=np.float32).tolist() for m in traj_dict["object_trajectory"]],
+            final_gripper_close=traj_dict.get("final_gripper_close", None),
+            # success_metric and robot_type may be absent; guard before parsing
+            success_metric=parse_success_metric(traj_dict["success_metric"]) if traj_dict.get("success_metric") else None,
+            pregrasp_pose=traj_dict.get("pregrasp_pose", None),
+            grasp_pose=traj_dict.get("grasp_pose", None),
+            robot_type=parse_enum(RobotType, traj_dict["robot_type"]) if traj_dict.get("robot_type") else None,
+            init_manip_object_com=traj_dict.get("init_manip_object_com", None)
+        )
+
+    def parse_camera_info(camera_dict):
+        return CameraInfo(
+            width=camera_dict["width"],
+            height=camera_dict["height"],
+            fx=camera_dict["fx"],
+            fy=camera_dict["fy"],
+            cx=camera_dict["cx"],
+            cy=camera_dict["cy"],
+            camera_opencv_to_world=np.array(camera_dict["camera_opencv_to_world"], dtype=np.float32).tolist(),
+            camera_position=np.array(camera_dict["camera_position"], dtype=np.float32).tolist(),
+            camera_heading_wxyz=np.array(camera_dict["camera_heading_wxyz"], dtype=np.float32).tolist(),
+        )
+    def parse_object_cfg(object_dict):
+        return ObjectCfg(
+            object_id=object_dict["object_id"],
+            object_name=object_dict["object_name"],
+            mesh_path=object_dict["mesh_path"],
+            usd_path=object_dict["usd_path"]
+        )
+    def parse_background_cfg(background_dict):
+        return BackgroundCfg(
+            background_rgb_path=background_dict["background_rgb_path"],
+            background_mesh_path=background_dict["background_mesh_path"],
+            background_usd_path=background_dict["background_usd_path"],
+            background_point=np.array(background_dict["background_point"], dtype=np.float32).tolist()
+        )
+    # Compose TaskCfg
+    
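# Note: enums were serialized by name (see serialize_task_cfg), so parse_enum
+    # rebuilds them via Enum[name]; absent trajectory lists default to [] below.
+    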
task_cfg = TaskCfg( + task_id=cfg_dict["task_id"], + task_desc=cfg_dict["task_desc"], + task_key=cfg_dict["task_key"], + task_type=parse_enum(TaskType, cfg_dict["task_type"]), + background_cfg=parse_background_cfg(cfg_dict["background_cfg"]), + camera_info=parse_camera_info(cfg_dict["camera_info"]), + manipulated_oid=cfg_dict["manipulated_oid"], + start_related=cfg_dict["start_related"], + end_related=cfg_dict["end_related"], + objects=[parse_object_cfg(obj) for obj in cfg_dict["objects"]], + reference_trajectory=[ + parse_traj_cfg(traj) for traj in (cfg_dict.get("reference_trajectory") or []) + ], + generated_trajectories=[ + parse_traj_cfg(traj) for traj in (cfg_dict.get("generated_trajectories") or []) + ] + ) + + return task_cfg \ No newline at end of file diff --git a/r2sVLA/envs/make_env_isaac.py b/r2sVLA/envs/make_env_isaac.py new file mode 100644 index 0000000..98a8960 --- /dev/null +++ b/r2sVLA/envs/make_env_isaac.py @@ -0,0 +1,681 @@ +# -*- coding: utf-8 -*- +""" +Isaac Lab-based simulation environment factory. +""" + +from __future__ import annotations +import copy +import json +import random +from pathlib import Path +from typing import Tuple, Optional, List, Dict + +import numpy as np +import transforms3d + +# Isaac Lab core +from isaaclab.utils import configclass +from isaaclab.scene import InteractiveSceneCfg +from isaaclab.assets import AssetBaseCfg, RigidObjectCfg +from isaaclab.sim.schemas import schemas_cfg +import isaaclab.sim as sim_utils +from isaaclab.sensors.camera import CameraCfg +from isaaclab_assets import FRANKA_PANDA_HIGH_PD_CFG + +# Manager-based API (terms/configs) +from isaaclab.managers import ( + TerminationTermCfg as DoneTerm, + EventTermCfg as EventTerm, + ObservationGroupCfg as ObsGroup, + ObservationTermCfg as ObsTerm, + RewardTermCfg as RewTerm, + CurriculumTermCfg as CurrTerm, + SceneEntityCfg, +) + +# Task-specific MDP helpers (adjust path if needed) +import isaaclab_tasks.manager_based.manipulation.lift.mdp as mdp + +from dataclasses import dataclass, MISSING + +from dataclasses import dataclass +from typing import Optional, List, Dict + +import sys +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) + + +# Task config +from envs.cfgs.task_cfg import TaskCfg + + +@dataclass +class SceneCtx: + cam_dict: Dict + obj_paths: List[str] + background_path: str + robot_pos: List[float] + robot_rot: List[float] + bg_physics: Optional[Dict] = None + obj_physics: List[Dict] = None + use_ground_plane: bool = False + ground_z: Optional[float] = None + oid_to_index: Optional[Dict[str, int]] = None # Maps oid (str) to prim index + index_to_oid: Optional[Dict[int, str]] = None # Maps prim index to oid (str) + + +_SCENE_CTX: Optional[SceneCtx] = None + +# ---- default physx presets ---- +DEFAULT_BG_PHYSICS = { + "mass_props": {"mass": 100.0}, + "rigid_props": {"disable_gravity": True, "kinematic_enabled": True}, + "collision_props": { + "collision_enabled": True, + "contact_offset": 0.0015, + "rest_offset": 0.0003, + "torsional_patch_radius": 0.02, + "min_torsional_patch_radius": 0.005, + }, +} +DEFAULT_OBJ_PHYSICS = { + "mass_props": {"mass": 0.5}, + "rigid_props": {"disable_gravity": False, "kinematic_enabled": False}, + "collision_props": { + "collision_enabled": True, + "contact_offset": 0.0015, + "rest_offset": 0.0003, + "torsional_patch_radius": 0.02, + "min_torsional_patch_radius": 0.005, + }, +} + + +def _deep_update(dst: dict, src: dict | None) -> dict: + """Recursive dict update 
without touching the original.""" + out = copy.deepcopy(dst) + if not src: + return out + for k, v in src.items(): + if isinstance(v, dict) and isinstance(out.get(k), dict): + out[k] = _deep_update(out[k], v) + else: + out[k] = v + return out + + +# -------------------------------------------------------------------------------------- +# Camera / Robot builders (use _SCENE_CTX) +# -------------------------------------------------------------------------------------- +def create_camera(): + """Return a CameraCfg using the global SceneCtx.""" + assert _SCENE_CTX is not None, ( + "init_scene_configs/init_scene_from_scene_dict must be called first." + ) + C = _SCENE_CTX.cam_dict + width = int(C["width"]) + height = int(C["height"]) + fx, fy, cx, cy = C["fx"], C["fy"], C["cx"], C["cy"] + # Ensure quaternion has 4 elements (w, x, y, z) + cam_orientation_list = list(C["camera_heading_wxyz"]) + if len(cam_orientation_list) != 4: + raise ValueError(f"camera_heading_wxyz must have 4 elements (w, x, y, z), got {len(cam_orientation_list)}") + cam_orientation = tuple(cam_orientation_list) + # Ensure position has 3 elements (x, y, z) + cam_pos_list = list(C["camera_position"]) + if len(cam_pos_list) != 3: + raise ValueError(f"camera_position must have 3 elements (x, y, z), got {len(cam_pos_list)}") + cam_pos = tuple(cam_pos_list) + # Create intrinsic matrix as a simple list (row-major format) + # Format: [fx, 0, cx, 0, fy, cy, 0, 0, 1] + # Using simple Python list to match reference implementation + intrinsic_matrix = [float(fx), 0.0, float(cx), 0.0, float(fy), float(cy), 0.0, 0.0, 1.0] + return CameraCfg( + prim_path="{ENV_REGEX_NS}/Camera", + offset=CameraCfg.OffsetCfg(pos=cam_pos, rot=cam_orientation, convention="ros"), + data_types=[ + "rgb", + "distance_to_image_plane", + "distance_to_camera", + "instance_id_segmentation_fast", + ], + colorize_instance_id_segmentation=False, + spawn=sim_utils.PinholeCameraCfg.from_intrinsic_matrix( + intrinsic_matrix=intrinsic_matrix, + width=width, + height=height, + ), + width=width, + height=height, + ) + + +def create_robot(): + """Return a configured Franka Panda config using the global SceneCtx.""" + assert _SCENE_CTX is not None, ( + "init_scene_configs/init_scene_from_scene_dict must be called first." + ) + robot = FRANKA_PANDA_HIGH_PD_CFG.replace(prim_path="{ENV_REGEX_NS}/Robot") + robot.init_state.pos = tuple(_SCENE_CTX.robot_pos) + robot.init_state.rot = tuple(_SCENE_CTX.robot_rot) + return robot + + +# -------------------------------------------------------------------------------------- +# Dynamic InteractiveSceneCfg builder +# -------------------------------------------------------------------------------------- +def build_tabletop_scene_cfg(): + """ + Auto-generate a multi-object InteractiveSceneCfg subclass: + - background, camera, robot + - object_00, object_01, ... based on _SCENE_CTX.obj_paths + """ + assert _SCENE_CTX is not None, ( + "init_scene_configs/init_scene_from_scene_dict must be called first." 
+    )
+    C = _SCENE_CTX
+
+    base_attrs = {}
+
+    # Light
+    base_attrs["light"] = AssetBaseCfg(
+        prim_path="/World/lightDome",
+        spawn=sim_utils.DomeLightCfg(intensity=4000.0, color=(1.0, 1.0, 1.0)),
+    )
+
+    _bg = _deep_update(DEFAULT_BG_PHYSICS, C.bg_physics)
+    _objs = [
+        _deep_update(DEFAULT_OBJ_PHYSICS, obj_physics) for obj_physics in C.obj_physics
+    ]
+
+    bg_mass_cfg = schemas_cfg.MassPropertiesCfg(**_bg["mass_props"])
+    bg_rigid_cfg = schemas_cfg.RigidBodyPropertiesCfg(**_bg["rigid_props"])
+    bg_colli_cfg = schemas_cfg.CollisionPropertiesCfg(**_bg["collision_props"])
+
+    # Add a second, purely visual ground plane slightly below the scene.
+    # It gets its own prim path so it cannot clash with the "background"
+    # ground plane spawned below when use_ground_plane=True.
+    base_attrs["background_viz"] = AssetBaseCfg(
+        prim_path="/World/vizGroundPlane",
+        spawn=sim_utils.GroundPlaneCfg(),
+    )
+    z = float(C.ground_z if C.ground_z is not None else 0.0)
+    base_attrs["background_viz"].init_state.pos = (0.0, 0.0, z - 0.2)
+
+    # ---------- Background ----------
+    if C.use_ground_plane:
+        # Simple horizontal ground; only z is customized.
+        # Note: GroundPlaneCfg doesn't take mass/rigid/collision configs (it's a helper),
+        # so we only set pose here.
+        base_attrs["background"] = AssetBaseCfg(
+            prim_path="/World/defaultGroundPlane",
+            spawn=sim_utils.GroundPlaneCfg(),
+        )
+        z = float(C.ground_z if C.ground_z is not None else 0.0)
+        base_attrs["background"].init_state.pos = (0.0, 0.0, z)
+        # no usd_path assignment in __post_init__ when using ground
+    else:
+        # original USD background
+        base_attrs["background"] = RigidObjectCfg(
+            prim_path="{ENV_REGEX_NS}/Background",
+            spawn=sim_utils.UsdFileCfg(
+                usd_path="",
+                mass_props=bg_mass_cfg,
+                rigid_props=bg_rigid_cfg,
+                collision_props=bg_colli_cfg,
+            ),
+        )
+
+    # Placeholder entries to be replaced in __post_init__
+    base_attrs["camera"] = CameraCfg(prim_path="{ENV_REGEX_NS}/Camera")
+    base_attrs["robot"] = FRANKA_PANDA_HIGH_PD_CFG.replace(
+        prim_path="{ENV_REGEX_NS}/Robot"
+    )
+
+    # Instantiate objects
+    for i, usd_path in enumerate(C.obj_paths):
+        obj_mass_cfg = schemas_cfg.MassPropertiesCfg(**_objs[i]["mass_props"])
+        obj_rigid_cfg = schemas_cfg.RigidBodyPropertiesCfg(**_objs[i]["rigid_props"])
+        obj_colli_cfg = schemas_cfg.CollisionPropertiesCfg(
+            **_objs[i]["collision_props"]
+        )
+
+        obj_template = RigidObjectCfg(
+            prim_path="{ENV_REGEX_NS}/Object",
+            spawn=sim_utils.UsdFileCfg(
+                usd_path="",
+                mass_props=obj_mass_cfg,
+                rigid_props=obj_rigid_cfg,
+                collision_props=obj_colli_cfg,
+            ),
+        )
+
+        name = f"object_{i:02d}"
+        cfg_i = copy.deepcopy(obj_template)
+        cfg_i.prim_path = f"{{ENV_REGEX_NS}}/{name}"
+        cfg_i.spawn.usd_path = usd_path
+        base_attrs[name] = cfg_i
+
+    # Inject background path + replace camera/robot on finalize
+    def __post_init__(self):
+        if not C.use_ground_plane:
+            self.background.spawn.usd_path = C.background_path
+        self.camera = create_camera()
+        self.robot = create_robot()
+
+    attrs = dict(base_attrs)
+    attrs["__doc__"] = "Auto-generated multi-object TableTop scene cfg."
+    attrs["__post_init__"] = __post_init__
+
+    DynamicSceneCfg = configclass(
+        type("TableTopSceneCfgAuto", (InteractiveSceneCfg,), attrs)
+    )
+    return DynamicSceneCfg
+
+
+# --------------------------------------------------------------------------------------
+# Build cam_dict & init directly from a TaskCfg
+# --------------------------------------------------------------------------------------
+def init_scene_from_task_cfg(
+    task_cfg: TaskCfg,
+    *,
+    use_ground_plane: bool = False,
+):
+    """
+    Initialize SceneCtx from a TaskCfg object.
+    The robot pose is taken from the first reference trajectory in task_cfg.
+    """
+    # Build cam_dict from task_cfg.camera_info
+    cam_dict = {
+        "width": task_cfg.camera_info.width,
+        "height": task_cfg.camera_info.height,
+        "fx": task_cfg.camera_info.fx,
+        "fy": task_cfg.camera_info.fy,
+        "cx": task_cfg.camera_info.cx,
+        "cy": task_cfg.camera_info.cy,
+        "camera_heading_wxyz": task_cfg.camera_info.camera_heading_wxyz,
+        "camera_position": task_cfg.camera_info.camera_position,
+    }
+
+    # Sort objects by object_id to ensure consistent ordering
+    sorted_objects = sorted(task_cfg.objects, key=lambda obj: obj.object_id)
+
+    # Build oid <-> index mapping
+    oid_to_index = {}
+    index_to_oid = {}
+    obj_paths = []
+
+    for index, obj_cfg in enumerate(sorted_objects):
+        oid_str = str(obj_cfg.object_id)
+        oid_to_index[oid_str] = index
+        index_to_oid[index] = oid_str
+        obj_paths.append(obj_cfg.usd_path)
+
+    background_path = task_cfg.background_cfg.background_usd_path
+
+    # Get physics from cfgs (TaskCfg doesn't contain physics info)
+    # Priority: args > default
+    obj_physics = [DEFAULT_OBJ_PHYSICS for _ in range(len(obj_paths))]
+    bg_physics = DEFAULT_BG_PHYSICS
+
+    robot_pos = task_cfg.reference_trajectory[0].robot_pose[:3]
+    robot_rot = task_cfg.reference_trajectory[0].robot_pose[3:]
+
+    ground_z = None
+    if use_ground_plane:
+        try:
+            # Use background_point from TaskCfg as ground plane reference
+            ground_z = float(task_cfg.background_cfg.background_point[2])
+        except (IndexError, TypeError, AttributeError) as e:
+            raise ValueError(
+                f"use_ground_plane=True but task_cfg.background_cfg.background_point missing/invalid: {e}"
+            ) from e
+
+    # write global ctx (keep old fields the same)
+    global _SCENE_CTX
+    _SCENE_CTX = SceneCtx(
+        cam_dict=cam_dict,
+        obj_paths=obj_paths,
+        background_path=background_path,
+        robot_pos=list(robot_pos),
+        robot_rot=list(robot_rot),
+        bg_physics=bg_physics,
+        obj_physics=list(obj_physics),
+        use_ground_plane=use_ground_plane,
+        ground_z=ground_z,
+        oid_to_index=oid_to_index,
+        index_to_oid=index_to_oid,
+    )
+    # Verify initialization
+    assert _SCENE_CTX is not None, "Failed to initialize _SCENE_CTX"
+    assert _SCENE_CTX.oid_to_index is not None, "Failed to initialize oid_to_index"
+
+    return {
+        "cam_dict": cam_dict,
+        "obj_usd_paths": obj_paths,
+        "background_usd": background_path,
+        "robot_pos": robot_pos,
+        "robot_rot": robot_rot,
+        "use_ground_plane": use_ground_plane,
+        "ground_z": ground_z,
+    }
+
+
+# --------------------------------------------------------------------------------------
+# Env factory
+# --------------------------------------------------------------------------------------
+def _build_manip_env_cfg(scene_cfg_cls, *, num_envs: int, env_spacing: float = 2.5, eval_cfg: Optional[EvaluationConfig] = None):
+    """Return a ManagerBasedRLEnvCfg subclass stitched together from sub-Cfgs.
+
+    Note: eval_cfg is accepted for signature stability; the evaluation settings
+    are currently applied in make_env() after the cfg is instantiated.
+    """
+    from isaaclab.envs import ManagerBasedRLEnvCfg
+    from isaaclab.controllers.differential_ik_cfg import DifferentialIKControllerCfg
+    from isaaclab.envs.mdp.actions.actions_cfg import (
+        DifferentialInverseKinematicsActionCfg,
+    )
+    from envs.cfgs.eval_cfg import EvaluationConfig
+
+    @configclass
+    class ManipEnvCfg(ManagerBasedRLEnvCfg):
+        scene = scene_cfg_cls(num_envs=num_envs, env_spacing=env_spacing)
+        commands = CommandsCfg()
+        actions = ActionsCfg()
+        observations = ObservationsCfg()
+        events = EventCfg()
+        rewards = RewardsCfg()
+        terminations = TerminationsCfg()
+        curriculum = CurriculumCfg()
+
+        def __post_init__(self):
+            # ---- Sim & PhysX ----
+            self.decimation = 1
+            
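# NOTE: decimation and sim.dt set in this __post_init__ are placeholders;
+            # make_env() overrides them from EvaluationConfig (decimation, 1/physics_freq).
+            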
self.episode_length_s = 5.0 + self.sim.dt = 1 + self.sim.render_interval = 1 + + physx = self.sim.physx + physx.enable_ccd = True + physx.solver_type = 1 # TGS + physx.num_position_iterations = 16 + physx.num_velocity_iterations = 2 + physx.contact_offset = 0.003 + physx.rest_offset = 0.0 + physx.max_depenetration_velocity = 0.5 + physx.enable_stabilization = True + physx.enable_sleeping = True + + # ---- IK arm & binary gripper ---- + self.actions.arm_action = DifferentialInverseKinematicsActionCfg( + asset_name="robot", + joint_names=["panda_joint.*"], + body_name="panda_hand", + controller=DifferentialIKControllerCfg( + command_type="pose", use_relative_mode=True, ik_method="dls" + ), + scale=0.5, + body_offset=DifferentialInverseKinematicsActionCfg.OffsetCfg( + pos=[0.0, 0.0, 0.107] + ), + ) + self.actions.gripper_action = mdp.BinaryJointPositionActionCfg( + asset_name="robot", + joint_names=["panda_finger.*"], + open_command_expr={"panda_finger_.*": 0.04}, + close_command_expr={"panda_finger_.*": 0.0}, + ) + self.commands.object_pose.body_name = "panda_hand" + + return ManipEnvCfg + + +def load_scene_json(key: str) -> dict: + """Return the raw scene dict from outputs//simulation/scene.json.""" + scene_path = Path.cwd() / "outputs" / key / "simulation" / "scene.json" + if not scene_path.exists(): + raise FileNotFoundError(scene_path) + return json.load(open(scene_path)) + + +def get_prim_name_from_oid(oid: str | int) -> str: + """ + Get the prim name (e.g., "object_00") from object id. + + Args: + oid: Object ID as string (e.g., "1") or integer (e.g., 1) + + Returns: + Prim name string (e.g., "object_00") + """ + global _SCENE_CTX + if _SCENE_CTX is None: + raise RuntimeError( + "Scene context not initialized. Please call make_env() first, which will " + "initialize the scene context via init_scene_from_task_cfg(). " + "This error usually means make_env() was not called, or the module was reloaded." + ) + oid_str = str(oid) + if _SCENE_CTX.oid_to_index is None: + raise ValueError("oid_to_index mapping not available. Scene may not have been initialized properly.") + index = _SCENE_CTX.oid_to_index.get(oid_str) + if index is None: + raise ValueError(f"OID '{oid_str}' not found in scene. Available OIDs: {list(_SCENE_CTX.oid_to_index.keys())}") + return f"object_{index:02d}" + + +def get_oid_from_prim_name(prim_name: str) -> str: + """ + Get the object id from prim name (e.g., "object_00" -> "1"). + + Args: + prim_name: Prim name string (e.g., "object_00") + + Returns: + OID as string (e.g., "1") + """ + global _SCENE_CTX + if _SCENE_CTX is None: + raise RuntimeError( + "Scene context not initialized. Please call make_env() first, which will " + "initialize the scene context via init_scene_from_task_cfg()." + ) + if not prim_name.startswith("object_"): + raise ValueError(f"Invalid prim name format: {prim_name}") + try: + index = int(prim_name.split("_")[1]) + except (IndexError, ValueError): + raise ValueError(f"Invalid prim name format: {prim_name}") + if _SCENE_CTX.index_to_oid is None: + raise ValueError("index_to_oid mapping not available. Scene may not have been initialized properly.") + oid = _SCENE_CTX.index_to_oid.get(index) + if oid is None: + raise ValueError(f"Index {index} not found in scene. 
Available indices: {list(_SCENE_CTX.index_to_oid.keys())}")
+    return oid
+
+
+def make_env(
+    task_cfg: TaskCfg,
+    eval_cfg: Optional[EvaluationConfig] = None,
+    cfgs: Optional[dict] = None,
+    num_envs: int = 1,
+    device: str = "cuda:0",
+    bg_simplify: bool = False,
+) -> Tuple["ManagerBasedRLEnv", "ManagerBasedRLEnvCfg"]:
+    """
+    Public entry to construct a ManagerBasedRLEnv from TaskCfg.
+
+    Args:
+        task_cfg: TaskCfg object loaded from task.json
+        eval_cfg: Optional EvaluationConfig; supplies decimation and physics frequency
+        cfgs: Optional config dict with physics_cfg, etc.
+        num_envs: Number of parallel environments
+        device: Device to run simulation on (e.g., "cuda:0")
+        bg_simplify: Whether to use simplified ground plane instead of background mesh
+
+    Returns: (env, env_cfg)
+    """
+    from isaaclab.envs import ManagerBasedRLEnv
+
+    # Default cfgs if not provided
+    if cfgs is None:
+        cfgs = {}
+
+    # Initialize the global SceneCtx from the task config
+    init_scene_from_task_cfg(
+        task_cfg,
+        use_ground_plane=bg_simplify,
+    )
+
+    # Build scene & env cfg
+    SceneCfg = build_tabletop_scene_cfg()
+    ManipEnvCfg = _build_manip_env_cfg(SceneCfg, num_envs=num_envs, env_spacing=2.5, eval_cfg=eval_cfg)
+    env_cfg = ManipEnvCfg()
+    env_cfg.sim.device = device
+    env_cfg.scene.num_envs = num_envs  # double safety
+    env_cfg.decimation = eval_cfg.decimation if eval_cfg is not None else 1
+    env_cfg.sim.dt = 1 / eval_cfg.physics_freq if eval_cfg is not None else 0.01
+    env_cfg.sim.render_interval = env_cfg.decimation
+
+    env = ManagerBasedRLEnv(cfg=env_cfg)
+    return env, env_cfg
+
+
+# --------------------------------------------------------------------------------------
+# Observation/Action/Reward/Termination/Curriculum config classes
+# --------------------------------------------------------------------------------------
+@configclass
+class CommandsCfg:
+    """Command terms for the MDP."""
+
+    object_pose = mdp.UniformPoseCommandCfg(
+        asset_name="robot",
+        body_name=MISSING,
+        resampling_time_range=(5.0, 5.0),
+        debug_vis=False,
+        ranges=mdp.UniformPoseCommandCfg.Ranges(
+            pos_x=(0.4, 0.6),
+            pos_y=(-0.25, 0.25),
+            pos_z=(0.25, 0.5),
+            roll=(0.0, 0.0),
+            pitch=(0.0, 0.0),
+            yaw=(0.0, 0.0),
+        ),
+    )
+
+
+@configclass
+class ActionsCfg:
+    """Action specifications for the MDP."""
+
+    arm_action: (
+        mdp.JointPositionActionCfg | mdp.DifferentialInverseKinematicsActionCfg
+    ) = MISSING  # type: ignore
+    gripper_action: mdp.BinaryJointPositionActionCfg = MISSING  # type: ignore
+
+
+@configclass
+class ObservationsCfg:
+    """Observation specifications for the MDP."""
+
+    @configclass
+    class PolicyCfg(ObsGroup):
+        joint_pos_rel = ObsTerm(func=mdp.joint_pos_rel)
+        joint_pos = ObsTerm(func=mdp.joint_pos)
+        joint_vel_rel = ObsTerm(func=mdp.joint_vel_rel)
+        joint_vel = ObsTerm(func=mdp.joint_vel)
+        target_object_position = ObsTerm(
+            func=mdp.generated_commands, params={"command_name": "object_pose"}
+        )
+        actions = ObsTerm(func=mdp.last_action)
+        rgb = ObsTerm(
+            func=mdp.image,
+            params={
+                "sensor_cfg": SceneEntityCfg(name="camera"),
+                "data_type": "rgb",
+                "convert_perspective_to_orthogonal": False,
+                "normalize": False,
+            },
+        )
+        depth = ObsTerm(
+            func=mdp.image,
+            params={
+                "sensor_cfg": SceneEntityCfg(name="camera"),
+                "data_type": "distance_to_image_plane",
+                "convert_perspective_to_orthogonal": False,
+                "normalize": False,
+            },
+        )
+        segmask = ObsTerm(
+            func=mdp.image,
+            params={
+                "sensor_cfg": SceneEntityCfg(name="camera"),
+                "data_type": "instance_id_segmentation_fast",
+                "convert_perspective_to_orthogonal": False,
+                "normalize": False,
+            },
+        )
+
+        def __post_init__(self):
+            self.enable_corruption = True
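+            # Keep terms as a dict (not one concatenated tensor) so rgb/depth/segmask
+            # remain individually addressable downstream.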
+            self.concatenate_terms = False
+
+    policy: PolicyCfg = PolicyCfg()
+
+
+@configclass
+class EventCfg:
+    """Configuration for events."""
+
+    reset_all = EventTerm(func=mdp.reset_scene_to_default, mode="reset")
+
+
+@configclass
+class RewardsCfg:
+    """Reward terms for the MDP (kept simple)."""
+
+    action_rate = RewTerm(func=mdp.action_rate_l2, weight=-1e-4)
+    joint_vel = RewTerm(
+        func=mdp.joint_vel_l2,
+        weight=-1e-4,
+        params={"asset_cfg": SceneEntityCfg("robot")},
+    )
+
+
+@configclass
+class TerminationsCfg:
+    """Termination terms for the MDP."""
+
+    time_out = DoneTerm(func=mdp.time_out, time_out=True)
+
+
+@configclass
+class CurriculumCfg:
+    """Curriculum terms for the MDP."""
+
+    action_rate = CurrTerm(
+        func=mdp.modify_reward_weight,
+        params={"term_name": "action_rate", "weight": -1e-1, "num_steps": 10000},
+    )
+    joint_vel = CurrTerm(
+        func=mdp.modify_reward_weight,
+        params={"term_name": "joint_vel", "weight": -1e-1, "num_steps": 10000},
+    )
+
+
+# Public symbols
+__all__ = [
+    # Scene init
+    "init_scene_from_task_cfg",
+    # Scene builders
+    "create_camera",
+    "create_robot",
+    "build_tabletop_scene_cfg",
+    # Manager config groups
+    "CommandsCfg",
+    "ActionsCfg",
+    "ObservationsCfg",
+    "EventCfg",
+    "RewardsCfg",
+    "TerminationsCfg",
+    "CurriculumCfg",
+    # Env factory
+    "make_env",
+]
diff --git a/r2sVLA/envs/sim_base_isaac.py b/r2sVLA/envs/sim_base_isaac.py
new file mode 100644
index 0000000..f6c4adb
--- /dev/null
+++ b/r2sVLA/envs/sim_base_isaac.py
@@ -0,0 +1,1252 @@
+from __future__ import annotations
+
+import json
+import os
+import random
+import shutil
+from pathlib import Path
+from typing import Any, Optional, Dict, List, Sequence, Tuple
+import copy
+import numpy as np
+import torch
+import imageio
+import cv2
+import h5py
+import sys
+file_path = Path(__file__).resolve()
+sys.path.append(str(file_path.parent))
+sys.path.append(str(file_path.parent.parent))
+from envs.cfgs.task_cfg import CameraInfo, TaskCfg, TrajectoryCfg
+from envs.cfgs.eval_cfg import EvaluationConfig
+# Isaac Lab
+import isaaclab.sim as sim_utils
+from isaaclab.sensors.camera import Camera
+from isaaclab.managers import SceneEntityCfg
+from isaaclab.controllers import DifferentialIKController, DifferentialIKControllerCfg
+from isaaclab.utils.math import subtract_frame_transforms, transform_points, unproject_depth
+from isaaclab.markers import VisualizationMarkers, VisualizationMarkersCfg
+from isaaclab.utils.assets import ISAAC_NUCLEUS_DIR
+
+# Patch inspect.getfile to handle namespace packages (like isaaclab) BEFORE
+# importing curobo. This is needed because warp's set_module_options() uses
+# inspect.stack(), which tries to get file paths for all modules in the call
+# stack, including namespace packages that don't have a __file__ attribute.
+import inspect
+_original_getfile = inspect.getfile
+def _patched_getfile(object):
+    """Patched getfile that handles namespace packages."""
+    try:
+        return _original_getfile(object)
+    except TypeError as e:
+        if "is a built-in module" in str(e) or "namespace" in str(e).lower():
+            # For namespace packages, return a dummy path to avoid errors.
+            # This allows warp's inspect.stack() to work even when isaaclab is in the call stack.
+            if hasattr(object, '__name__'):
+                return f'<namespace:{object.__name__}>'
+            return '<unknown>'
+        raise
+inspect.getfile = _patched_getfile
+
+import curobo
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import JointState, RobotConfig
+from curobo.util.logger import setup_curobo_logger
+from curobo.util_file import get_robot_configs_path, join_path, load_yaml
+from curobo.wrap.reacher.motion_gen import (
+    MotionGen,
+    MotionGenConfig,
+    MotionGenPlanConfig,
+)
+
+##-- for sim background to real background video composition --
+from PIL import Image
+
+
+def get_next_demo_id(demo_root: Path) -> int:
+    if not demo_root.exists():
+        return 0
+    demo_ids = []
+    for name in os.listdir(demo_root):
+        if name.startswith("demo_"):
+            try:
+                demo_ids.append(int(name.split("_")[1]))
+            except Exception:
+                pass
+    return max(demo_ids) + 1 if demo_ids else 0
+
+
+class BaseSimulator:
+
+    def __init__(
+        self,
+        sim: sim_utils.SimulationContext,
+        scene: Any,  # InteractiveScene
+        *,
+        out_dir: Optional[Path] = None,
+        set_physics_props: bool = True,
+        enable_motion_planning: bool = True,
+        debug_level: int = 1,
+        task_cfg: Optional[TaskCfg] = None,
+        eval_cfg: Optional[EvaluationConfig] = None,
+    ) -> None:
+        # basic simulation setup
+        self.sim: sim_utils.SimulationContext = sim
+        self.scene = scene
+        self.sim_dt = sim.get_physics_dt()  # Single physics substep dt
+        self.eval_cfg = eval_cfg
+        self.decimation = eval_cfg.decimation if eval_cfg is not None else 1
+        self.save_interval = eval_cfg.save_interval if eval_cfg is not None else 1
+
+        # Task step dt = physics_dt * decimation (time for one task step)
+        self.task_dt = self.sim_dt * self.decimation
+        self.num_envs: int = int(scene.num_envs)
+        self._all_env_ids = torch.arange(
+            self.num_envs, device=sim.device, dtype=torch.long
+        )
+
+        self.out_dir: Optional[Path] = out_dir
+
+        # scene entities
+        self.task_cfg = task_cfg
+        self.robot = scene["robot"]
+
+        robot_pose = torch.tensor(np.array(self.task_cfg.reference_trajectory[0].robot_pose))
+
+        if robot_pose.ndim == 1:
+            self.robot_pose = (
+                robot_pose.view(1, -1).repeat(self.num_envs, 1).to(self.robot.device)
+            )
+        else:
+            assert robot_pose.shape[0] == self.num_envs and robot_pose.shape[1] == 7, (
+                f"robot_pose must be [B,7], got {robot_pose.shape}"
+            )
+            self.robot_pose = robot_pose.to(self.robot.device).contiguous()
+
+        # Get object prim based on selected_object_id
+        # Default to object_00 for backward compatibility
+        # Note: selected_object_id is set in subclasses after super().__init__()
+        # So we use a helper method that can be called later
+        self._selected_object_id = None  # Will be set by subclasses
+        self.object_prim = scene["object_00"]  # Default, will be updated if needed
+        self._update_object_prim()
+        self.joint_pos_des_list = []
+
+        self.gripper_cmd_list = []
+        # Get all other object prims (excluding the main object)
+        self.other_object_prims = [scene[key] for key in scene.keys()
+                                   if "object_" in key and key != "object_00"]
+        self.background_prim = scene["background"]
+        self.camera: Camera = scene["camera"]
+
+        # physics properties
+        if set_physics_props:
+            static_friction = 5.0
+            dynamic_friction = 5.0
+            restitution = 0.0
+
+            # object: rigid prim -> has root_physx_view
+            if (
+                hasattr(self.object_prim, 
"root_physx_view") + and self.object_prim.root_physx_view is not None + ): + obj_view = self.object_prim.root_physx_view + obj_mats = obj_view.get_material_properties() + vals_obj = torch.tensor( + [static_friction, dynamic_friction, restitution], + device=obj_mats.device, + dtype=obj_mats.dtype, + ) + obj_mats[:] = vals_obj + obj_view.set_material_properties( + obj_mats, self._all_env_ids.to(obj_mats.device) + ) + + # background: GroundPlaneCfg -> XFormPrim (no root_physx_view); skip if unavailable + if ( + hasattr(self.background_prim, "root_physx_view") + and self.background_prim.root_physx_view is not None + ): + bg_view = self.background_prim.root_physx_view + bg_mats = bg_view.get_material_properties() + vals_bg = torch.tensor( + [static_friction, dynamic_friction, restitution], + device=bg_mats.device, + dtype=bg_mats.dtype, + ) + bg_mats[:] = vals_bg + bg_view.set_material_properties( + bg_mats, self._all_env_ids.to(bg_mats.device) + ) + + # ik controller + self.diff_ik_cfg = DifferentialIKControllerCfg( + command_type="pose", use_relative_mode=False, ik_method="dls" + ) + self.diff_ik_controller = DifferentialIKController( + self.diff_ik_cfg, num_envs=self.num_envs, device=sim.device + ) + + # robot: joints / gripper / jacobian indices + self.robot_entity_cfg = SceneEntityCfg( + "robot", joint_names=["panda_joint.*"], body_names=["panda_hand"] + ) + self.robot_gripper_cfg = SceneEntityCfg( + "robot", joint_names=["panda_finger_joint.*"], body_names=["panda_hand"] + ) + self.robot_entity_cfg.resolve(scene) + self.robot_gripper_cfg.resolve(scene) + self.gripper_open_tensor = 0.04 * torch.ones( + (self.num_envs, len(self.robot_gripper_cfg.joint_ids)), + device=self.robot.device, + ) + self.gripper_close_tensor = torch.zeros( + (self.num_envs, len(self.robot_gripper_cfg.joint_ids)), + device=self.robot.device, + ) + if self.robot.is_fixed_base: + self.ee_jacobi_idx = self.robot_entity_cfg.body_ids[0] - 1 + else: + self.ee_jacobi_idx = self.robot_entity_cfg.body_ids[0] + + # demo count and data saving + self.count = 0 # Physical step counter + self.task_step_count = 0 # Task step counter (count // decimation) + self.demo_id = 0 + self.save_dict = { + "rgb": [], "depth": [], "segmask": [], + "joint_pos": [], "joint_vel": [], "actions": [], + "gripper_pos": [], "gripper_cmd": [], "ee_pose_cam": [], + "composed_rgb": [] # composed rgb image with background and foreground + } + + # visualization + self.selected_object_id = 0 + self._selected_object_id = 0 + self.debug_level = debug_level + + self.goal_vis_list = [] + + if self.debug_level > 0: + for b in range(self.num_envs): + cfg = VisualizationMarkersCfg( + prim_path=f"/Visuals/ee_goal/env_{b:03d}", + markers={ + "frame": sim_utils.UsdFileCfg( + usd_path=f"{ISAAC_NUCLEUS_DIR}/Props/UIElements/frame_prim.usd", + scale=(0.06, 0.06, 0.06), + visual_material=sim_utils.PreviewSurfaceCfg( + diffuse_color=(1.0, 0.0, 0.0) + ), + ), + }, + ) + self.goal_vis_list.append(VisualizationMarkers(cfg)) + + # curobo motion planning + self.enable_motion_planning = enable_motion_planning + if self.enable_motion_planning: + print(f"prepare curobo motion planning: {enable_motion_planning}") + self.prepare_curobo() + print("curobo motion planning ready.") + + def _update_object_prim(self): + """Update object_prim based on selected_object_id. 
Called after selected_object_id is set.""" + if self._selected_object_id is None: + return + try: + from sim_task_isaac import get_prim_name_from_oid + oid_str = str(self._selected_object_id) + prim_name = get_prim_name_from_oid(oid_str) + if prim_name in self.scene: + self.object_prim = self.scene[prim_name] + # Update other_object_prims + self.other_object_prims = [self.scene[key] for key in self.scene.keys() + if f"object_" in key and key != prim_name] + except (ImportError, ValueError, KeyError) as e: + # Fallback to object_00 if mapping not available + pass + + # -------- Curobo Motion Planning ---------- + def prepare_curobo(self): + + setup_curobo_logger("error") + # tensor_args = TensorDeviceType() + tensor_args = TensorDeviceType(device=self.sim.device, dtype=torch.float32) + curobo_path = curobo.__file__.split("/__init__")[0] + robot_file = f"{curobo_path}/content/configs/robot/franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg=robot_file, + world_model=None, + tensor_args=tensor_args, + interpolation_dt=self.sim_dt, + use_cuda_graph=True if self.num_envs == 1 else False, + ) + self.motion_gen = MotionGen(motion_gen_config) + if self.num_envs == 1: + self.motion_gen.warmup(enable_graph=True) + _ = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"], + tensor_args, + ) + + # ---------- Helpers ---------- + def _ensure_batch_pose(self, p, q): + """Ensure position [B,3], quaternion [B,4] on device.""" + B = self.scene.num_envs + p = torch.as_tensor(p, dtype=torch.float32, device=self.sim.device) + q = torch.as_tensor(q, dtype=torch.float32, device=self.sim.device) + if p.ndim == 1: + p = p.view(1, -1).repeat(B, 1) + if q.ndim == 1: + q = q.view(1, -1).repeat(B, 1) + return p.contiguous(), q.contiguous() + + def _traj_to_BT7(self, traj): + """Normalize various curobo traj.position shapes to [B, T, 7].""" + B = self.scene.num_envs + pos = traj.position # torch or numpy + pos = torch.as_tensor(pos, device=self.sim.device, dtype=torch.float32) + + if pos.ndim == 3: + # candidate shapes: [B,T,7] or [T,B,7] + if pos.shape[0] == B and pos.shape[-1] == 7: + return pos # [B,T,7] + if pos.shape[1] == B and pos.shape[-1] == 7: + return pos.permute(1, 0, 2).contiguous() # [B,T,7] + elif pos.ndim == 2 and pos.shape[-1] == 7: + # [T,7] → broadcast to all envs + return pos.unsqueeze(0).repeat(B, 1, 1) + # Fallback: flatten and infer + flat = pos.reshape(-1, 7) # [B*T,7] + T = flat.shape[0] // B + return flat.view(B, T, 7).contiguous() + + # ---------- Planning / Execution (Single) ---------- + def reinitialize_motion_gen(self): + """ + Reinitialize the motion generation object. + Call this after a crash to restore a clean state. 
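+
+        Example (a sketch of the recover-and-retry pattern used by the planning
+        methods below; `start_state`, `goal_pose`, and `plan_cfg` are assumed to
+        be constructed already):
+
+            result = None
+            try:
+                result = self.motion_gen.plan_single(start_state, goal_pose, plan_cfg)
+            except AttributeError:
+                # curobo can be left in a broken state after a CUDA-graph error
+                if self.reinitialize_motion_gen():
+                    result = self.motion_gen.plan_single(start_state, goal_pose, plan_cfg)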
+ """ + print("[INFO] Reinitializing motion planner...") + try: + # Clear CUDA cache + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Recreate the motion planner + from curobo.types.base import TensorDeviceType + from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig + from curobo.util_file import get_robot_configs_path, join_path, load_yaml + from curobo.types.robot import RobotConfig + import curobo + + tensor_args = TensorDeviceType(device=self.sim.device, dtype=torch.float32) + curobo_path = curobo.__file__.split("/__init__")[0] + robot_file = f"{curobo_path}/content/configs/robot/franka.yml" + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg=robot_file, + world_model=None, + tensor_args=tensor_args, + interpolation_dt=self.sim_dt, + use_cuda_graph=True if self.num_envs == 1 else False, + ) + + self.motion_gen = MotionGen(motion_gen_config) + + if self.num_envs == 1: + self.motion_gen.warmup(enable_graph=True) + + print("[INFO] Motion planner reinitialized successfully") + return True + + except Exception as e: + print(f"[ERROR] Failed to reinitialize motion planner: {e}") + return False + + + def motion_planning_single( + self, position, quaternion, max_attempts=1, use_graph=True, max_retries=1 + ): + """ + Single environment planning with automatic recovery from crashes. + Returns None on complete failure to signal restart needed. + """ + joint_pos0 = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids][ + 0:1 + ].contiguous() + + pos_b, quat_b = self._ensure_batch_pose(position, quaternion) + pos_b = pos_b[0:1] + quat_b = quat_b[0:1] + + for retry in range(max_retries): + try: + start_state = JointState.from_position(joint_pos0) + goal_pose = Pose(position=pos_b, quaternion=quat_b) + plan_cfg = MotionGenPlanConfig( + max_attempts=max_attempts, enable_graph=use_graph + ) + + result = self.motion_gen.plan_single(start_state, goal_pose, plan_cfg) + + # Check if result is valid + if result is None: + print(f"[ERROR] Motion planning returned None result on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + break + + traj = result.get_interpolated_plan() + + # Check if trajectory is valid + if traj is None: + print(f"[ERROR] Motion planning returned None trajectory on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + break + + if result.success[0] == True: + BT7 = ( + traj.position.to(self.sim.device).to(torch.float32).unsqueeze(0) + ) + else: + print(f"[WARN] Motion planning failed.") + BT7 = joint_pos0.unsqueeze(1) + + return BT7, result.success + + except AttributeError as e: + print(f"[ERROR] Motion planner crash on attempt {retry+1}/{max_retries}: {e}") + + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + else: + break + else: + print("[ERROR] Max retries reached") + + except Exception as e: + # Safe error message extraction + try: + error_msg = str(e) + error_type = type(e).__name__ + except: + error_msg = "Unknown error" + error_type = "Exception" + + print(f"[ERROR] Unexpected error: {error_type}: {error_msg}") + + # Check for recoverable errors + is_recoverable = False + try: + is_recoverable = ("cuda graph" in 
error_msg.lower() or + "NoneType" in error_msg or + "has no len()" in error_msg) + except: + pass + + if retry < max_retries - 1 and is_recoverable: + if self.reinitialize_motion_gen(): + continue + break + + # Complete failure - return dummy trajectory with False success + print("[ERROR] Motion planning failed critically - returning dummy trajectory") + joint_pos0 = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids][0:1].contiguous() + # Return current position as 1-step trajectory with failure + dummy_traj = joint_pos0.unsqueeze(1) # (1, 1, dof) + dummy_success = torch.zeros(1, dtype=torch.bool, device=self.sim.device) + return dummy_traj, dummy_success + + # ---------- Planning / Execution (Batched) ---------- + + + def motion_planning_batch( + self, position, quaternion, max_attempts=1, allow_graph=False, max_retries=1 + ): + """ + Multi-environment planning with automatic recovery from crashes. + Returns None on complete failure to signal restart needed. + """ + B = self.scene.num_envs + joint_pos = self.robot.data.joint_pos[ + :, self.robot_entity_cfg.joint_ids + ].contiguous() + + pos_b, quat_b = self._ensure_batch_pose(position, quaternion) + + for retry in range(max_retries): + try: + # Attempt planning + start_state = JointState.from_position(joint_pos) + goal_pose = Pose(position=pos_b, quaternion=quat_b) + plan_cfg = MotionGenPlanConfig( + max_attempts=max_attempts, enable_graph=allow_graph + ) + + try: + result = self.motion_gen.plan_batch(start_state, goal_pose, plan_cfg) + except Exception as plan_err: + print(f"[ERROR] curobo.plan_batch raised exception: {plan_err}") + raise plan_err + + # Check if result is valid + if result is None: + print(f"[ERROR] Motion planning returned None result on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + break + + # Process results + paths = result.get_paths() + + # Check if paths is valid - use try-except to safely check if it's iterable + paths_valid = False + try: + if paths is not None: + # Try to get length to verify it's iterable and not empty + _ = len(paths) + if len(paths) > 0: + paths_valid = True + except: + pass + + if not paths_valid: + print(f"[ERROR] Motion planning returned invalid paths on attempt {retry+1}/{max_retries}") + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + else: + print("[ERROR] Failed to recover motion planner") + # Skip to next retry iteration or exit loop + continue + + # Double-check paths is still valid (defensive programming) + if paths is None: + print(f"[ERROR] paths became None after validation check") + continue + + T_max = 1 + + # Check if result.success is valid + if result.success is None: + print(f"[WARN] result.success is None. 
Assuming failure for all envs.") + # Create dummy failure tensor + result.success = torch.zeros(B, dtype=torch.bool, device=self.sim.device) + + try: + for i, p in enumerate(paths): + if not result.success[i]: + print(f"[WARN] Motion planning failed for env {i}.") + else: + T_max = max(T_max, int(p.position.shape[-2])) + except TypeError as te: + print(f"[ERROR] TypeError when processing paths: {te}") + print(f"[DEBUG] paths type: {type(paths)}, paths value: {paths}") + continue + + dof = joint_pos.shape[-1] + BT7 = torch.zeros( + (B, T_max, dof), device=self.sim.device, dtype=torch.float32 + ) + + for i, p in enumerate(paths): + if result.success[i] == False: + BT7[i, :, :] = ( + joint_pos[i : i + 1, :].unsqueeze(1).repeat(1, T_max, 1) + ) + else: + Ti = p.position.shape[-2] + BT7[i, :Ti, :] = p.position.to(self.sim.device).to(torch.float32) + if Ti < T_max: + BT7[i, Ti:, :] = BT7[i, Ti - 1 : Ti, :] + + success = result.success if result.success is not None else torch.zeros( + B, dtype=torch.bool, device=self.sim.device + ) + + # Success! Return the trajectory + return BT7, success + + except AttributeError as e: + print(f"[ERROR] Motion planner crash on attempt {retry+1}/{max_retries}: {e}") + + if retry < max_retries - 1: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying motion planning (attempt {retry+2}/{max_retries})...") + continue + else: + print("[ERROR] Failed to recover motion planner") + break + else: + print("[ERROR] Max retries reached, motion planning failed critically") + + except Exception as e: + # Safe error message extraction + try: + error_msg = str(e) + error_type = type(e).__name__ + except: + error_msg = "Unknown error" + error_type = "Exception" + + print(f"[ERROR] Unexpected error in motion planning: {error_type}: {error_msg}") + + # Check for recoverable errors + is_recoverable = False + try: + is_recoverable = ("cuda graph" in error_msg.lower() or + "NoneType" in error_msg or + "has no len()" in error_msg) + except: + pass + + if retry < max_retries - 1 and is_recoverable: + if self.reinitialize_motion_gen(): + print(f"[INFO] Retrying after error (attempt {retry+2}/{max_retries})...") + continue + break + + # If we get here, all retries failed - return dummy trajectory with all False success + print("[ERROR] Motion planning failed critically - returning dummy trajectory") + B = self.scene.num_envs + joint_pos = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids].contiguous() + dof = joint_pos.shape[-1] + # Return current position as 1-step trajectory with all failures + dummy_traj = joint_pos.unsqueeze(1) # (B, 1, dof) + dummy_success = torch.zeros(B, dtype=torch.bool, device=self.sim.device) + return dummy_traj, dummy_success + + + + def motion_planning(self, position, quaternion, max_attempts=1): + if self.scene.num_envs == 1: + return self.motion_planning_single( + position, quaternion, max_attempts=max_attempts, use_graph=True + ) + else: + return self.motion_planning_batch( + position, quaternion, max_attempts=max_attempts, allow_graph=False + ) + + def move_to_motion_planning( + self, + position: torch.Tensor, + quaternion: torch.Tensor, + gripper_open: bool = True, + record: bool = True, + ) -> torch.Tensor: + """ + Cartesian space control: Move the end effector to the desired position and orientation using motion planning. + Works with batched envs. If inputs are 1D, they will be broadcast to all envs. 
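+
+        Example (illustrative values only; quaternions are wxyz, as elsewhere in this file):
+
+            pos = torch.tensor([0.4, 0.0, 0.3])         # broadcast to all envs
+            quat = torch.tensor([0.0, 1.0, 0.0, 0.0])   # e.g. a top-down gripper orientation
+            last_qpos, success = self.move_to_motion_planning(pos, quat, gripper_open=True)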
+ """ + traj, success = self.motion_planning(position, quaternion) + BT7 = traj + T = BT7.shape[1] + last = None + for i in range(T): + joint_pos_des = BT7[:, i, :] # [B,7] + self.apply_actions(joint_pos_des, gripper_open=gripper_open, record=record) + last = joint_pos_des + return last, success + + def set_robot_pose(self, robot_pose: torch.Tensor): + if robot_pose.ndim == 1: + self.robot_pose = ( + robot_pose.view(1, -1).repeat(self.num_envs, 1).to(self.robot.device) + ) + else: + assert robot_pose.shape[0] == self.num_envs and robot_pose.shape[1] == 7, ( + f"robot_pose must be [B,7], got {robot_pose.shape}" + ) + self.robot_pose = robot_pose.to(self.robot.device).contiguous() + + + # ---------- Environment Step ---------- + def step(self): + self.scene.write_data_to_sim() + self.sim.step() + # Camera update should use task_dt (decimation * sim_dt) since camera + # is typically updated at task step frequency, not physics substep frequency + # This matches the render_interval setting + if self.count % self.decimation == 0: + self.camera.update(dt=self.task_dt) + self.task_step_count += 1 # Increment task step counter + self.count += 1 # Increment physical step counter + self.scene.update(self.sim_dt) + + # ---------- Apply actions to robot joints ---------- + def apply_actions(self, joint_pos_des, gripper_open: bool = True, record: bool = True): + # joint_pos_des: [B, n_joints] + self.gripper_cmd_list.append(gripper_open) + self.joint_pos_des_list.append(joint_pos_des) + self.robot.set_joint_position_target( + joint_pos_des, joint_ids=self.robot_entity_cfg.joint_ids + ) + ### FIXME: HACK + if gripper_open: + self.robot.set_joint_position_target( + self.gripper_open_tensor, joint_ids=self.robot_gripper_cfg.joint_ids + ) + else: + self.robot.set_joint_position_target( + self.gripper_close_tensor, joint_ids=self.robot_gripper_cfg.joint_ids + ) + # Execute decimation number of physics steps to complete one task step + for _ in range(self.decimation): + self.step() + obs = self.get_observation(gripper_open=gripper_open) + if record: + self.record_data(obs) + return obs + + # ---------- EE control ---------- + def move_to( + self, + position: torch.Tensor, + quaternion: torch.Tensor, + gripper_open: bool = True, + record: bool = True, + ) -> torch.Tensor: + if self.enable_motion_planning: + return self.move_to_motion_planning( + position, quaternion, gripper_open=gripper_open, record=record + ) + else: + return self.move_to_ik( + position, quaternion, gripper_open=gripper_open, record=record + ) + + def move_to_ik( + self, + position: torch.Tensor, + quaternion: torch.Tensor, + steps: int = 3, + gripper_open: bool = True, + record: bool = True, + ) -> torch.Tensor: + """ + Cartesian space control: Move the end effector to the desired position and orientation using inverse kinematics. + Works with batched envs. If inputs are 1D, they will be broadcast to all envs. + + Early-stop when both position and orientation errors are within tolerances. + 'steps' now acts as a max-iteration cap; the loop breaks earlier on convergence. 
+ """ + # Ensure [B,3]/[B,4] tensors on device + position, quaternion = self._ensure_batch_pose(position, quaternion) + + # IK command (world frame goals) + ee_goals = torch.cat([position, quaternion], dim=1).to(self.sim.device).float() + self.diff_ik_controller.reset() + self.diff_ik_controller.set_command(ee_goals) + + # Tolerances (you can tune if needed) + pos_tol = 1e-3 # meters + ori_tol = 3.0 * np.pi / 180.0 # radians (~3 degrees) + + # Interpret 'steps' as max iterations; early-stop on convergence + max_steps = int(steps) if steps is not None and steps > 0 else 10_000 + + joint_pos_des = None + for _ in range(max_steps): + # Current EE pose (world) and Jacobian + jacobian = self.robot.root_physx_view.get_jacobians()[ + :, self.ee_jacobi_idx, :, self.robot_entity_cfg.joint_ids + ] + ee_pose_w = self.robot.data.body_state_w[ + :, self.robot_entity_cfg.body_ids[0], 0:7 + ] + root_pose_w = self.robot.data.root_state_w[:, 0:7] + joint_pos = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + + # Current EE pose expressed in robot base + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_pose_w[:, 0:3], + root_pose_w[:, 3:7], + ee_pose_w[:, 0:3], + ee_pose_w[:, 3:7], + ) + + # Compute next joint command + joint_pos_des = self.diff_ik_controller.compute( + ee_pos_b, ee_quat_b, jacobian, joint_pos + ) + + # Apply + self.apply_actions(joint_pos_des, gripper_open=gripper_open, record=record) + # --- Early-stop check --- + # Desired EE pose in base frame (convert world goal -> base) + des_pos_b, des_quat_b = subtract_frame_transforms( + root_pose_w[:, 0:3], root_pose_w[:, 3:7], position, quaternion + ) + # Position error [B] + pos_err = torch.norm(des_pos_b - ee_pos_b, dim=1) + # Orientation error [B]: angle between quaternions + # Note: q and -q are equivalent -> take |dot| + dot = torch.sum(des_quat_b * ee_quat_b, dim=1).abs().clamp(-1.0, 1.0) + ori_err = 2.0 * torch.acos(dot) + + done = (pos_err <= pos_tol) & (ori_err <= ori_tol) + if bool(torch.all(done)): + break + + return joint_pos_des + + # ---------- Robot Waiting ---------- + def wait(self, gripper_open, steps: int, record: bool = True): + joint_pos_des = self.robot.data.joint_pos[ + :, self.robot_entity_cfg.joint_ids + ].clone() + for _ in range(steps): + self.apply_actions(joint_pos_des, gripper_open=gripper_open) + return joint_pos_des + + # ---------- Reset Envs ---------- + def reset(self, env_ids=None): + """ + Reset all envs or only those in env_ids. 
+ Assumptions: + - self.robot_pose.shape == (B, 7) # base pose per env (wxyz in [:,3:]) + - self.robot.data.default_joint_pos == (B, 7) + - self.robot.data.default_joint_vel == (B, 7) + """ + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) # (B,) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view( + -1 + ) # (M,) + M = int(env_ids_t.shape[0]) + + # --- object pose/vel: set object at env origins with identity quat --- + env_origins = self.scene.env_origins.to(device)[env_ids_t] # (M,3) + object_pose = torch.zeros((M, 7), device=device, dtype=torch.float32) + object_pose[:, :3] = env_origins + object_pose[:, 3] = 1.0 # wxyz = [1,0,0,0] + self.object_prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t) + self.object_prim.write_root_velocity_to_sim( + torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t + ) + self.object_prim.write_data_to_sim() + for prim in self.other_object_prims: + prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t) + prim.write_root_velocity_to_sim( + torch.zeros((M, 6), device=device, dtype=torch.float32), + env_ids=env_ids_t, + ) + prim.write_data_to_sim() + + # --- robot base pose/vel --- + # robot_pose is (B,7) in *local* base frame; add env origin offset per env + rp_local = self.robot_pose.to(self.robot.device)[env_ids_t] # (M,7) + env_origins_robot = env_origins.to(self.robot.device) # (M,3) + robot_pose_world = rp_local.clone().float() + robot_pose_world[:, :3] = env_origins_robot + robot_pose_world[:, :3] + #print(f"[INFO] robot_pose_world: {robot_pose_world}") + self.robot.write_root_pose_to_sim(robot_pose_world, env_ids=env_ids_t) + self.robot.write_root_velocity_to_sim( + torch.zeros((M, 6), device=self.robot.device, dtype=torch.float32), + env_ids=env_ids_t, + ) + + # --- joints (B,7) -> select ids (M,7) --- + joint_pos = self.robot.data.default_joint_pos.to(self.robot.device)[ + env_ids_t + ] # (M,7) + joint_vel = self.robot.data.default_joint_vel.to(self.robot.device)[ + env_ids_t + ] # (M,7) + self.robot.write_joint_state_to_sim(joint_pos, joint_vel, env_ids=env_ids_t) + self.robot.write_data_to_sim() + + self.step() + # housekeeping + self.count = 0 + self.task_step_count = 0 + self.clear_data() + + # ---------- Get Observations ---------- + def get_observation(self, gripper_open) -> Dict[str, torch.Tensor]: + # camera outputs (already batched) + rgb = self.camera.data.output["rgb"] # [B,H,W,3] + depth = self.camera.data.output["distance_to_image_plane"] # [B,H,W] + ins_all = self.camera.data.output["instance_id_segmentation_fast"] # [B,H,W] + + B, H, W, _ = ins_all.shape + fg_mask_list = [] + obj_mask_list = [] + for b in range(B): + ins_id_seg = ins_all[b] + id_mapping = self.camera.data.info[b]["instance_id_segmentation_fast"][ + "idToLabels" + ] + fg_mask_b = torch.zeros_like( + ins_id_seg, dtype=torch.bool, device=ins_id_seg.device + ) + obj_mask_b = torch.zeros_like( + ins_id_seg, dtype=torch.bool, device=ins_id_seg.device + ) + for key, value in id_mapping.items(): + if "object" in value: + fg_mask_b |= ins_id_seg == key + obj_mask_b |= ins_id_seg == key + if "Robot" in value: + fg_mask_b |= ins_id_seg == key + fg_mask_list.append(fg_mask_b) + obj_mask_list.append(obj_mask_b) + fg_mask = torch.stack(fg_mask_list, dim=0) # [B,H,W] + obj_mask = torch.stack(obj_mask_list, dim=0) # [B,H,W] + + ee_pose_w = self.robot.data.body_state_w[ + :, self.robot_entity_cfg.body_ids[0], 0:7 + ] + joint_pos = self.robot.data.joint_pos[:, 
self.robot_entity_cfg.joint_ids] + joint_vel = self.robot.data.joint_vel[:, self.robot_entity_cfg.joint_ids] + gripper_pos = self.robot.data.joint_pos[:, self.robot_gripper_cfg.joint_ids] + gripper_cmd = ( + self.gripper_open_tensor if gripper_open else self.gripper_close_tensor + ) + + cam_pos_w = self.camera.data.pos_w + cam_quat_w = self.camera.data.quat_w_ros + ee_pos_cam, ee_quat_cam = subtract_frame_transforms( + cam_pos_w, cam_quat_w, ee_pose_w[:, 0:3], ee_pose_w[:, 3:7] + ) + ee_pose_cam = torch.cat([ee_pos_cam, ee_quat_cam], dim=1) + + points_3d_cam = unproject_depth( + self.camera.data.output["distance_to_image_plane"], + self.camera.data.intrinsic_matrices, + ) + points_3d_world = transform_points( + points_3d_cam, self.camera.data.pos_w, self.camera.data.quat_w_ros + ) + + object_center = self.object_prim.data.root_com_pos_w[:, :3] + + # Convert real background composition (batch-aware) + rgb_real = self.convert_real_batch(fg_mask, rgb) + return { + "rgb": rgb, + "composed_rgb": rgb_real, + "depth": depth, + "fg_mask": fg_mask, + "joint_pos": joint_pos, + "gripper_pos": gripper_pos, + "gripper_cmd": gripper_cmd, + "joint_vel": joint_vel, + "ee_pose_cam": ee_pose_cam, + "ee_pose_w": ee_pose_w, + "object_mask": obj_mask, + "points_cam": points_3d_cam, + "points_world": points_3d_world, + "object_center": object_center, + } + + # ---------- Task Completion Verifier ---------- + def is_success(self) -> bool: + raise NotImplementedError( + "BaseSimulator.is_success() should be implemented in subclass." + ) + + # ---------- Data Recording & Saving & Clearing ---------- + def record_data(self, obs: Dict[str, torch.Tensor]): + if self.task_step_count % self.save_interval == 0: + self.save_dict["rgb"].append(obs["rgb"].cpu().numpy()) # [B,H,W,3] + self.save_dict["composed_rgb"].append(obs["composed_rgb"].cpu().numpy()) # [B,H,W,3] + self.save_dict["depth"].append(obs["depth"].cpu().numpy()) # [B,H,W] + self.save_dict["segmask"].append(obs["fg_mask"].cpu().numpy()) # [B,H,W] + self.save_dict["joint_pos"].append(obs["joint_pos"].cpu().numpy()) # [B,nJ] + self.save_dict["gripper_pos"].append(obs["gripper_pos"].cpu().numpy()) # [B,3] + self.save_dict["gripper_cmd"].append(obs["gripper_cmd"].cpu().numpy()) # [B,1] + self.save_dict["joint_vel"].append(obs["joint_vel"].cpu().numpy()) + self.save_dict["ee_pose_cam"].append(obs["ee_pose_cam"].cpu().numpy()) + + def clear_data(self): + for key in self.save_dict.keys(): + self.save_dict[key] = [] + + def _env_dir(self, base: Path, b: int) -> Path: + d = base / f"env_{b:03d}" + d.mkdir(parents=True, exist_ok=True) + return d + + def _get_next_demo_dir(self, base: Path) -> Path: + already_existing_num = len(list(base.iterdir())) + return base / f"demo_{already_existing_num:03d}.mp4" + + def save_data(self, ignore_keys: List[str] = [], env_ids: Optional[List[int]] = None, export_hdf5: bool = False, save_other_things: bool = False): + stacked = self.save_dict.copy() + if env_ids is None: + env_ids = self._all_env_ids.cpu().numpy() + video_dir = self.out_dir + video_dir.mkdir(parents=True, exist_ok=True) + hdf5_names = [] + video_paths = [] + fps = 1 / (self.sim_dt * self.decimation * self.save_interval) + for b in env_ids: + demo_path = self._get_next_demo_dir(video_dir) + hdf5_names.append(demo_path.name.replace(".mp4", "")) + if save_other_things: + demo_dir = self.out_dir / self.img_folder/ "demos" + demo_dir = self._get_next_demo_dir(demo_dir).replace(".mp4", "") + env_dir = self._env_dir(demo_dir, b) + for key, arr in stacked.items(): + if key in 
ignore_keys: # skip the keys for storage + continue + if key == "rgb": + video_path = env_dir / "sim_video.mp4" + writer = imageio.get_writer( + video_path, fps=fps, macro_block_size=None + ) + for t in range(arr.shape[0]): + writer.append_data(arr[t][b]) + writer.close() + + elif key == "segmask": + video_path = env_dir / "mask_video.mp4" + writer = imageio.get_writer( + video_path, fps=fps, macro_block_size=None + ) + for t in range(arr.shape[0]): + writer.append_data((arr[t][b].astype(np.uint8) * 255)) + writer.close() + elif key == "depth": + depth_seq = arr[:, b] + flat = depth_seq[depth_seq > 0] + max_depth = np.percentile(flat, 99) if flat.size > 0 else 1.0 + depth_norm = np.clip(depth_seq / max_depth * 255.0, 0, 255).astype( + np.uint8 + ) + video_path = env_dir / "depth_video.mp4" + writer = imageio.get_writer( + video_path, fps=50, macro_block_size=None + ) + for t in range(depth_norm.shape[0]): + writer.append_data(depth_norm[t]) + writer.close() + np.save(env_dir / f"{key}.npy", depth_seq) + elif key != "composed_rgb": + np.save(env_dir / f"{key}.npy", arr[b]) + writer = imageio.get_writer(demo_path, fps=fps, macro_block_size=None) + for t in range(len(self.save_dict["composed_rgb"])): + writer.append_data(self.save_dict["composed_rgb"][t][b]) + writer.close() + video_paths.append(str(demo_path)) + print(f"[INFO]: Demonstration is saved at: {demo_path}") + if export_hdf5: + self.export_batch_data_to_hdf5(hdf5_names, video_paths) + + + # def convert_real(self, segmask, bg_rgb, fg_rgb): + # """Convert single image with background composition (non-batch version).""" + # segmask_2d = segmask[..., 0] if segmask.ndim > 2 else segmask + # composed = bg_rgb.copy() + # composed[segmask_2d] = fg_rgb[segmask_2d] + # return composed + + def convert_real_batch(self, fg_mask, rgb): + """ + Convert batch of images with background composition. 
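+
+        After both images are normalized to float32 in [0, 1], the core operation
+        is a per-pixel select (this is what the body below does):
+
+            composed = torch.where(fg_mask[..., None], sim_rgb, background_rgb)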
+ + Args: + fg_mask: [B, H, W] torch.Tensor, boolean mask for foreground + rgb: [B, H, W, 3] torch.Tensor, foreground RGB images + + Returns: + composed_rgb: [B, H, W, 3] torch.Tensor, composed images with background + """ + # Ensure bg_rgb is loaded + if not hasattr(self, 'bg_rgb') or self.bg_rgb is None: + bg_rgb_path = self.task_cfg.background_cfg.background_rgb_path + self.bg_rgb = imageio.imread(bg_rgb_path) + + # Convert to torch if needed + if isinstance(self.bg_rgb, np.ndarray): + bg_rgb_tensor = torch.from_numpy(self.bg_rgb).to(rgb.device) + else: + bg_rgb_tensor = self.bg_rgb.to(rgb.device) + + # Ensure bg_rgb has batch dimension [1, H, W, 3] and expand to [B, H, W, 3] + if bg_rgb_tensor.ndim == 3: + bg_rgb_tensor = bg_rgb_tensor.unsqueeze(0) # [1, H, W, 3] + B = rgb.shape[0] + bg_rgb_batch = bg_rgb_tensor.expand(B, -1, -1, -1) # [B, H, W, 3] + + # Normalize bg_rgb to same range as rgb if needed + # Isaac Lab typically outputs RGB in [0, 1] range (float32) + # imageio.imread typically outputs [0, 255] range (uint8) + if bg_rgb_batch.dtype == torch.uint8: + bg_rgb_batch = bg_rgb_batch.float() / 255.0 + elif bg_rgb_batch.max() > 1.0: + bg_rgb_batch = bg_rgb_batch.float() / 255.0 + + # Ensure rgb is in [0, 1] range + rgb_normalized = rgb.float() if rgb.dtype != torch.float32 else rgb + if rgb_normalized.max() > 1.0: + rgb_normalized = rgb_normalized / 255.0 + + # Ensure both are float32 + bg_rgb_batch = bg_rgb_batch.to(dtype=torch.float32) + rgb_normalized = rgb_normalized.to(dtype=torch.float32) + + # Ensure fg_mask is [B, H, W] then expand to [B, H, W, 1] for broadcasting + if fg_mask.ndim == 4: + fg_mask = fg_mask.squeeze(-1) # Remove last dimension if present + fg_mask_expanded = fg_mask.unsqueeze(-1) # [B, H, W, 1] + + # Compose: use fg_rgb where mask is True, bg_rgb where mask is False + composed = torch.where(fg_mask_expanded, rgb_normalized, bg_rgb_batch) + + # Draw count and joint_pos_des_list last value on images + # Convert to numpy for cv2 operations + composed_np = composed.cpu().numpy() # [B, H, W, 3] in [0, 1] range + # Convert to [0, 255] uint8 and BGR for cv2, ensure contiguous array + composed_np = (composed_np * 255.0).astype(np.uint8) + composed_np = composed_np[..., ::-1].copy() # RGB to BGR, ensure contiguous + + # # Get count value + # count_str = f"Count: {self.count}" + + # # Get last joint_pos_des value if available + # last_joint_pos = None + # gripper_cmd = None + # if hasattr(self, 'joint_pos_des_list') and len(self.joint_pos_des_list) > 0: + # last_joint_pos = self.joint_pos_des_list[-1] # [B, n_joints] + + # # Get gripper command if available + # if hasattr(self, 'gripper_cmd_list') and len(self.gripper_cmd_list) > 0: + # gripper_cmd = self.gripper_cmd_list[-1] # Last gripper command + + # # Draw text on each image in the batch + # for b in range(B): + # # Create a copy to ensure contiguous and writable array + # img = composed_np[b].copy() # [H, W, 3] BGR, contiguous copy + + # # Draw count + # cv2.putText(img, count_str, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, + # 0.7, (0, 255, 0), 2) # Green text + + # # Draw joint position + # if last_joint_pos is not None: + # if last_joint_pos.ndim == 2: + # # For batch, use b-th env's last joint value + # last_joint_val = last_joint_pos[b].cpu().numpy() # Last joint of b-th env + # else: + # last_joint_val = last_joint_pos[b].cpu().numpy() + # # Format joint values: first 4 on first line, last 3 on second line + # joint_pos_str1 = f"Joint: {', '.join([f'{last_joint_val[j]:.3f}' for j in range(4)])}" + # 
joint_pos_str2 = f" {', '.join([f'{last_joint_val[j]:.3f}' for j in range(4, 7)])}" + + # cv2.putText(img, joint_pos_str1, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, + # 0.7, (0, 255, 0), 2) # Green text + # cv2.putText(img, joint_pos_str2, (10, 85), cv2.FONT_HERSHEY_SIMPLEX, + # 0.7, (0, 255, 0), 2) # Green text + + # # Draw gripper command + # if gripper_cmd is not None: + # try: + # if isinstance(gripper_cmd, torch.Tensor): + # if gripper_cmd.ndim == 0: + # gripper_val = gripper_cmd.item() + # elif gripper_cmd.ndim == 1: + # gripper_val = gripper_cmd[b].item() + # else: + # # 2D or higher: take first element of b-th row + # gripper_val = gripper_cmd[b, 0].item() if gripper_cmd.shape[1] > 0 else gripper_cmd[b].item() + # else: + # # numpy array or list + # gripper_cmd_np = np.asarray(gripper_cmd) + # if gripper_cmd_np.ndim == 0: + # gripper_val = float(gripper_cmd_np) + # elif gripper_cmd_np.ndim == 1: + # gripper_val = float(gripper_cmd_np[b]) + # else: + # # 2D or higher: take first element of b-th row + # gripper_val = float(gripper_cmd_np[b, 0]) if gripper_cmd_np.shape[1] > 0 else float(gripper_cmd_np[b]) + # gripper_cmd_str = f"Gripper: {gripper_val:.3f}" + # cv2.putText(img, gripper_cmd_str, (10, 120), cv2.FONT_HERSHEY_SIMPLEX, + # 0.7, (0, 255, 0), 2) # Green text + # except (IndexError, AttributeError, TypeError) as e: + # # Skip gripper display if there's an error + # pass + + # # Write back the modified image + # composed_np[b] = img + + # Convert back to RGB and [0, 1] range + composed_np = composed_np[..., ::-1] # BGR to RGB + composed_np = composed_np.astype(np.float32) / 255.0 + composed = torch.from_numpy(composed_np).to(composed.device) + + return composed + + def _quat_to_rot(self, quat: Sequence[float]) -> np.ndarray: + w, x, y, z = quat + rot = np.array( + [ + [1 - 2 * (y ** 2 + z ** 2), 2 * (x * y - z * w), 2 * (x * z + y * w)], + [2 * (x * y + z * w), 1 - 2 * (x ** 2 + z ** 2), 2 * (y * z - x * w)], + [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x ** 2 + y ** 2)], + ], + dtype=np.float32, + ) + return rot + + def _get_camera_parameters(self) -> Optional[Tuple[np.ndarray, np.ndarray, Tuple[int, int]]]: + if self.task_cfg is None: + return None + camera_info = getattr(self.task_cfg, "camera_info", None) + if camera_info is None: + return None + + intrinsics = np.array( + [ + [camera_info.fx, 0.0, camera_info.cx], + [0.0, camera_info.fy, camera_info.cy], + [0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + + if getattr(camera_info, "camera_opencv_to_world", None) is not None: + extrinsics = np.array(camera_info.camera_opencv_to_world, dtype=np.float32) + else: + extrinsics = np.eye(4, dtype=np.float32) + if getattr(camera_info, "camera_heading_wxyz", None) is not None: + rot = self._quat_to_rot(camera_info.camera_heading_wxyz) + else: + rot = np.eye(3, dtype=np.float32) + extrinsics[:3, :3] = rot + if getattr(camera_info, "camera_position", None) is not None: + extrinsics[:3, 3] = np.array(camera_info.camera_position, dtype=np.float32) + resolution = ( + int(camera_info.width), + int(camera_info.height), + ) + return intrinsics, extrinsics, resolution diff --git a/r2sVLA/envs/sim_utils.py b/r2sVLA/envs/sim_utils.py new file mode 100644 index 0000000..6c115dd --- /dev/null +++ b/r2sVLA/envs/sim_utils.py @@ -0,0 +1,60 @@ +import json +from pathlib import Path +import yaml +import torch +import random +import numpy as np +import transforms3d +import numpy as np +import transforms3d + +def pose_to_mat(pos, quat): + if torch.is_tensor(pos): pos = pos.cpu().numpy() + if 
torch.is_tensor(quat): quat = quat.cpu().numpy() + m = np.eye(4, dtype=np.float32) + m[:3, :3] = transforms3d.quaternions.quat2mat(quat) + m[:3, 3] = pos + return m + +def mat_to_pose(mat): + pos = mat[:3, 3] + quat = transforms3d.quaternions.mat2quat(mat[:3, :3]) + return pos, quat + +def normalize_quaternion(q: np.ndarray) -> np.ndarray: + norm = np.linalg.norm(q, axis=1, keepdims=True) + 1e-9 + return q / norm + + +def pose_distance(T1, T2): + """ + Compute translation and rotation (angle) distance between two SE(3) transformation matrices in torch. + Args: + T1, T2: [..., 4, 4] torch.Tensor or np.ndarray, can be batched + Returns: + trans_dist: translation distance(s) + angle: rotational angle(s) + """ + if not torch.is_tensor(T1): + T1 = torch.tensor(T1, dtype=torch.float32) + if not torch.is_tensor(T2): + T2 = torch.tensor(T2, dtype=torch.float32) + + # Translation distance + t1 = T1[..., :3, 3] + t2 = T2[..., :3, 3] + trans_dist = torch.norm(t2 - t1, dim=-1) + + # Rotation difference (angle) + R1 = T1[..., :3, :3] + R2 = T2[..., :3, :3] + dR = torch.matmul(R2, R1.transpose(-2, -1)) + trace = dR[..., 0, 0] + dR[..., 1, 1] + dR[..., 2, 2] + cos_angle = (trace - 1) / 2 + + # Handle numerical precision + cos_angle = torch.clamp(cos_angle, -1.0, 1.0) + angle = torch.acos(cos_angle) + return trans_dist, angle + + \ No newline at end of file diff --git a/r2sVLA/envs/sim_wrapper_isaac.py b/r2sVLA/envs/sim_wrapper_isaac.py new file mode 100644 index 0000000..a4ff4e6 --- /dev/null +++ b/r2sVLA/envs/sim_wrapper_isaac.py @@ -0,0 +1,971 @@ +""" +Policy evaluation wrapper for Isaac Lab simulation. +Provides interface for evaluating policies in batched simulation environments. +""" +from __future__ import annotations + +from pathlib import Path +from typing import Optional, List, Dict, Union, Any, Tuple +import numpy as np +import torch +import argparse + +from isaaclab.utils.math import subtract_frame_transforms, transform_points, combine_frame_transforms +# Note: AppLauncher should be initialized by the caller, not at module level +# This prevents duplicate initialization when importing this module + +# Local imports +import sys +import os +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) + +from sim_base_isaac import BaseSimulator +from sim_utils import pose_to_mat, mat_to_pose, pose_distance +from envs.cfgs.task_cfg import TaskCfg, TrajectoryCfg, SuccessMetric, SuccessMetricType, load_task_cfg +from envs.cfgs.eval_cfg import EvaluationConfig +from envs.cfgs.policy_interface import Action, BasePolicy +from envs.cfgs.randomizer import Randomizer +from envs.cfgs.randomizer_cfg import RandomizerConfig +from sim_base_isaac import BaseSimulator +from envs.make_env_isaac import make_env, get_prim_name_from_oid +# Constants +BASE_DIR = Path.cwd() + +# ──────────────────────────── Policy Evaluation Wrapper ──────────────────────────── +class PolicyEvaluationWrapper(BaseSimulator): + """ + Policy evaluation wrapper for Isaac Lab simulation environments. + + Provides a clean interface for evaluating policies in batched simulation environments. + Supports both joint position (qpos) and end-effector (ee) action types. 
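+
+    Typical evaluation loop (a sketch; assumes sim/scene come from make_env and
+    that the policy implements BasePolicy with a get_action(obs) -> Action method):
+
+        wrapper = PolicyEvaluationWrapper(sim, scene, task_cfg, eval_cfg, num_envs)
+        wrapper.policy = my_policy
+        wrapper.reset(use_randomization=True)
+        obs = wrapper.get_obs()
+        while not bool(wrapper.episode_done.all()):
+            action = wrapper.policy.get_action(obs)
+            obs = wrapper.take_action(action)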
+ + Features: + • Batch environment support with per-environment state tracking + • Flexible action interface (qpos or ee control) + • Episode termination checking + • Policy integration via BasePolicy interface + • Observation formatting for policy consumption + """ + def __init__(self, sim, scene, task_cfg: TaskCfg, eval_cfg: EvaluationConfig, num_envs: int): + super().__init__( + sim=sim, scene=scene, + out_dir=eval_cfg.video_save_dir, + enable_motion_planning=True, + set_physics_props=True, debug_level=0, + task_cfg=task_cfg, + eval_cfg=eval_cfg, + ) + + self.selected_object_id = task_cfg.manipulated_oid + self._selected_object_id = str(self.selected_object_id) # Store as string for mapping + self._update_object_prim() # Update object_prim based on selected_object_id + self.record = eval_cfg.record_video # Store whether to record data + assert self.record, "Record must be True for evaluation" + #self.traj_cfg_list = traj_cfg_list + + self.task_type = task_cfg.task_type + + # Evaluation configuration + self.eval_cfg = eval_cfg + + # Store task_cfg for trajectory loading + self.task_cfg = task_cfg + + # Initialize randomizer (lazy initialization) + self.randomizer: Optional[Randomizer] = None + self.randomizer_cfg: Optional[RandomizerConfig] = None + + # Trajectory tracking + self.traj_cfg_list: Optional[List[TrajectoryCfg]] = None + self.current_trajectory_idx = 0 # For verified randomization mode + + # Success metric list - per environment [B] + self.success_metric_list: List[Optional[SuccessMetric]] = [None] * num_envs + + # Episode tracking - per environment [B] + device = self.robot.device + self.step_count = torch.zeros(self.num_envs, dtype=torch.long, device=device) # [B] + self.episode_done = torch.zeros(self.num_envs, dtype=torch.bool, device=device) # [B] + self.eval_success = torch.zeros(self.num_envs, dtype=torch.bool, device=device) # [B] + self._is_success = torch.zeros(self.num_envs, dtype=torch.bool, device=device) # [B] - Current success state + + # Grasping success tracking - per environment [B] + self.initial_object_height = torch.zeros(self.num_envs, dtype=torch.float32, device=device) # [B] - Initial object height (z coordinate) + self.grasping_success_achieved = torch.zeros(self.num_envs, dtype=torch.bool, device=device) # [B] - Whether grasping success has been achieved at least once + + # Gripper state tracking - per environment [B] + # True = open, False = closed (updated from actions) + self.gripper_state_list = torch.ones(self.num_envs, dtype=torch.bool, device=device) # [B] - Current gripper state (True=open, False=closed) + + # Policy (will be set externally) + self.policy: Optional[BasePolicy] = None + + def _compute_poses_from_traj_cfg(self, traj_cfg_list: List[TrajectoryCfg]): + """ + Extract poses and trajectories from a list of TrajectoryCfg objects. 
+ Reference: sim_randomize_rollout.py compute_poses_from_traj_cfg + + Args: + traj_cfg_list: List of TrajectoryCfg objects + + Returns: + robot_poses_list: List of robot poses [7] for each trajectory + object_poses_dict: Dict mapping oid -> list of (pos, quat) tuples + object_trajectory_list: List of object trajectories + final_gripper_state_list: List of final gripper states + pregrasp_pose_list: List of pregrasp poses + grasp_pose_list: List of grasp poses + end_pose_list: List of end poses from success_metric + success_metric_list: List of SuccessMetric objects + """ + robot_poses_list = [] + object_poses_dict = {} # {oid: [(pos, quat), ...]} + object_trajectory_list = [] + final_gripper_state_list = [] + pregrasp_pose_list = [] + grasp_pose_list = [] + end_pose_list = [] + success_metric_list = [] + + for traj_cfg in traj_cfg_list: + robot_poses_list.append(traj_cfg.robot_pose) + + # Extract object poses: traj_cfg.object_poses is a dict mapping oid -> pose + for oid in traj_cfg.object_poses.keys(): + pose = traj_cfg.object_poses[oid] + oid_str = str(oid) + if oid_str not in object_poses_dict: + object_poses_dict[oid_str] = [] + object_poses_dict[oid_str].append(np.array(pose, dtype=np.float32)) + + # Extract object trajectory + traj = [] + for i in range(len(traj_cfg.object_trajectory)): + mat = pose_to_mat(traj_cfg.object_trajectory[i][:3], traj_cfg.object_trajectory[i][3:7]) + traj.append(mat) + object_trajectory_list.append(np.array(traj, dtype=np.float32)) + + final_gripper_state_list.append(traj_cfg.final_gripper_close) + pregrasp_pose_list.append(np.array(traj_cfg.pregrasp_pose, dtype=np.float32) if traj_cfg.pregrasp_pose is not None else None) + grasp_pose_list.append(np.array(traj_cfg.grasp_pose, dtype=np.float32) if traj_cfg.grasp_pose is not None else None) + + # Extract success metric + success_metric_list.append(traj_cfg.success_metric) + if traj_cfg.success_metric.end_pose is not None: + end_pose_list.append(np.array(traj_cfg.success_metric.end_pose, dtype=np.float32)) + else: + end_pose_list.append(None) + + return robot_poses_list, object_poses_dict, object_trajectory_list, final_gripper_state_list, pregrasp_pose_list, grasp_pose_list, end_pose_list, success_metric_list + + def reset(self, env_ids=None, use_randomization: bool = False, use_verified_randomization: bool = False): + super().reset(env_ids) + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) # (B,) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) # (M,) + M = int(env_ids_t.shape[0]) + + # ──────────── Load trajectory configurations ──────────── + if not use_randomization: + print("[INFO] Using reference trajectory") + self.traj_cfg_list = self.task_cfg.reference_trajectory + else: + if use_verified_randomization: + # Mode 1: Load from generated_trajectories (verified randomization) + if self.task_cfg.generated_trajectories is None or len(self.task_cfg.generated_trajectories) == 0: + raise ValueError("No generated_trajectories available in task_cfg for verified randomization mode") + + # Load the last num_envs trajectories from generated_trajectories + num_available = len(self.task_cfg.generated_trajectories) + start_idx = max(0, num_available - self.num_envs) + end_idx = num_available + + # If we need more trajectories than available, cycle through them + if end_idx - start_idx < M: + # Repeat trajectories if needed + traj_cfg_list = [] + for i in range(M): + idx = (start_idx + i) % num_available + 
traj_cfg_list.append(self.task_cfg.generated_trajectories[idx]) + else: + traj_cfg_list = self.task_cfg.generated_trajectories[start_idx:end_idx][:M] + + self.traj_cfg_list = traj_cfg_list + else: + # Mode 2: Generate new trajectories using randomizer + if self.randomizer is None: + from envs.cfgs.randomizer import Randomizer + self.randomizer = Randomizer(self.task_cfg) + + # Get randomizer config (use default if not set) + if self.randomizer_cfg is None: + # Try to load from config file or use defaults + try: + from envs.cfgs.randomizer_cfg import RandomizerConfig + config_path = BASE_DIR / "config" / "randomizer_cfg.yaml" + if config_path.exists(): + self.randomizer_cfg = RandomizerConfig.from_yaml(config_path, task_name=self.task_cfg.task_key) + else: + self.randomizer_cfg = RandomizerConfig() # Use defaults + except Exception as e: + print(f"[WARN] Failed to load randomizer config, using defaults: {e}") + from envs.cfgs.randomizer_cfg import RandomizerConfig + self.randomizer_cfg = RandomizerConfig() + + # Generate randomized trajectories + # Calculate traj_randomize_num to get at least M trajectories + total_per_traj = self.randomizer_cfg.scene_randomize_num * self.randomizer_cfg.robot_pose_randomize_num + traj_randomize_num = max(1, (M + total_per_traj - 1) // total_per_traj) # Ceiling division + + random_traj_cfg_list = self.randomizer.generate_randomized_scene_cfg( + grid_dist=self.randomizer_cfg.grid_dist, + grid_num=self.randomizer_cfg.grid_num, + angle_random_range=self.randomizer_cfg.angle_random_range, + angle_random_num=self.randomizer_cfg.angle_random_num, + traj_randomize_num=traj_randomize_num, + scene_randomize_num=self.randomizer_cfg.scene_randomize_num, + robot_pose_randomize_range=self.randomizer_cfg.robot_pose_randomize_range, + robot_pose_randomize_angle=self.randomizer_cfg.robot_pose_randomize_angle, + robot_pose_randomize_num=self.randomizer_cfg.robot_pose_randomize_num, + fix_end_pose=self.randomizer_cfg.fix_end_pose, + ) + # Take first M trajectories + self.traj_cfg_list = random_traj_cfg_list[:M] + + # ──────────── Compute poses and update success metrics ──────────── + (self.robot_poses_list, self.object_poses_dict, self.object_trajectory_list, + self.final_gripper_state_list, self.pregrasp_pose_list, self.grasp_pose_list, + self.end_pose_list, success_metrics) = self._compute_poses_from_traj_cfg(self.traj_cfg_list) + + # Update success_metric_list for the environments being reset + for i, env_idx in enumerate(env_ids_t.cpu().numpy()): + if i < len(success_metrics): + self.success_metric_list[env_idx] = success_metrics[i] + + # ──────────── Set object poses ──────────── + env_origins = self.scene.env_origins.to(device)[env_ids_t] # (M,3) + + # Set poses for all objects from object_poses_dict + for oid in self.object_poses_dict.keys(): + # Get prim name from oid + prim_name = get_prim_name_from_oid(str(oid)) + + object_prim = self.scene[prim_name] + + # Get pose for this object + if len(self.object_poses_dict[oid]) == 0: + continue + + # Extract poses for the environments being reset + poses_array = np.array(self.object_poses_dict[oid], dtype=np.float32) + if poses_array.shape[0] < M: + # Pad with last pose if needed + last_pose = poses_array[-1] + poses_array = np.vstack([poses_array, np.tile(last_pose, (M - poses_array.shape[0], 1))]) + + pos = poses_array[env_ids_t.cpu().numpy(), :3] + quat = poses_array[env_ids_t.cpu().numpy(), 3:7] + + object_pose = torch.zeros((M, 7), device=device, dtype=torch.float32) + object_pose[:, :3] = env_origins + 
torch.tensor(pos, dtype=torch.float32, device=device)
+            object_pose[:, 3:7] = torch.tensor(quat, dtype=torch.float32, device=device)  # wxyz
+
+            object_prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t)
+            object_prim.write_root_velocity_to_sim(
+                torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t
+            )
+            object_prim.write_data_to_sim()
+
+        # ──────────── Set robot poses ────────────
+        rp_local = np.array(self.robot_poses_list, dtype=np.float32)
+        env_origins_robot = self.scene.env_origins.to(device)[env_ids_t]
+        robot_pose_world = rp_local.copy()
+
+        # Ensure we have enough robot poses
+        if robot_pose_world.shape[0] < M:
+            last_pose = robot_pose_world[-1]
+            robot_pose_world = np.vstack([robot_pose_world, np.tile(last_pose, (M - robot_pose_world.shape[0], 1))])
+
+        # Rows of robot_pose_world correspond 1:1 to the envs being reset
+        robot_pose_world[:, :3] = env_origins_robot.cpu().numpy() + robot_pose_world[:, :3]
+        self.robot.write_root_pose_to_sim(torch.tensor(robot_pose_world, dtype=torch.float32, device=device), env_ids=env_ids_t)
+        self.robot.write_root_velocity_to_sim(
+            torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t
+        )
+
+        joint_pos = self.robot.data.default_joint_pos.to(self.robot.device)[env_ids_t]  # (M,7)
+        joint_vel = self.robot.data.default_joint_vel.to(self.robot.device)[env_ids_t]  # (M,7)
+        self.robot.write_joint_state_to_sim(joint_pos, joint_vel, env_ids=env_ids_t)
+        self.robot.write_data_to_sim()
+
+        self.clear_data()
+
+        # Reset episode tracking for specified environments
+        device = self.object_prim.device
+        if env_ids is None:
+            env_ids_t = self._all_env_ids.to(device)
+        else:
+            env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1)
+
+        self.step_count[env_ids_t] = 0
+        self.episode_done[env_ids_t] = False
+        self.eval_success[env_ids_t] = False
+        self._is_success[env_ids_t] = False
+
+        # Record initial object height for grasping success check
+        # Need to step simulation once to get accurate positions after reset
+        # Index by env_ids_t and subtract into a fresh tensor; do not modify the sim data view in place
+        initial_object_pos = self.object_prim.data.root_com_pos_w[env_ids_t, :3] - self.scene.env_origins.to(device)[env_ids_t]  # [M, 3]
+        print("current robot pose", self.robot.data.root_state_w[:, :7])
+        print("scene.env_origins", self.scene.env_origins.to(device)[env_ids_t])
+        print("initial_object_pos", initial_object_pos)
+        self.initial_object_height[env_ids_t] = initial_object_pos[:, 2]  # Store z coordinate (height)
+
+        # Reset grasping success tracking
+        self.grasping_success_achieved[env_ids_t] = False
+
+        # Reset gripper state (default to open)
+        self.gripper_state_list[env_ids_t] = True
+
+        # Reset policy if available
+        if self.policy is not None:
+            self.policy.reset()
+
+    # ---------- Policy Evaluation Functions ----------
+    def get_obs(self) -> Dict[str, Any]:
+        """
+        Get observation from environment.
+        Returns a dictionary with observation data compatible with the policy interface.
+
+        Returns:
+            Dictionary containing:
+            - 'rgb': RGB image [B, H, W, 3]
+            - 'depth': Depth image [B, H, W]
+            - 'joint_pos': Joint positions [B, n_joints]
+            - 'joint_vel': Joint velocities [B, n_joints]
+            - 'ee_pose': End-effector pose [B, 7] (pos + quat)
+            - 'gripper_pos': Gripper position [B, 1]
+            - Other observation keys as needed
+        """
+        # Use actual gripper state for gripper_cmd (use first env as representative, or could use per-env logic)
+        # FIXME: Currently we DO NOT actually ALLOW PARALLELISM IN EVALUATION.
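+        # Note: a per-env variant would have to query get_observation() once per
+        # distinct gripper state and merge the results; the first-env shortcut below
+        # is only valid while evaluation runs a single logical episode at a time.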
+ obs = self.get_observation(gripper_open=self.gripper_state_list[0].item() if self.num_envs > 0 else True) + + # Convert to policy-friendly format + observation = { + 'rgb': obs['rgb'], # [B, H, W, 3] + 'depth': obs['depth'], # [B, H, W] + 'joint_pos': obs['joint_pos'], # [B, n_joints] + 'joint_vel': obs.get('joint_vel', torch.zeros_like(obs['joint_pos'])), # [B, n_joints] + 'ee_pose': obs['ee_pose_w'], # [B, 7] (pos + quat) + 'gripper_pos': obs['gripper_pos'], # [B, 1] or [B, 3] + 'gripper_cmd': obs.get('gripper_cmd', torch.zeros((self.num_envs, 1), device=self.robot.device)), + } + + return observation + + def take_action(self, action: Action, record: bool = True) -> Dict[str, Any]: + """ + Execute an action in the environment. + + Args: + action: Action object (either qpos or ee type) + record: Whether to record data + + Returns: + Updated observation dictionary + """ + device = self.robot.device + + # Find active (not done) environments + active_mask = ~self.episode_done # [B] + active_env_ids = self._all_env_ids[active_mask].to(device) # [M] where M <= B + + # If all environments are done, return observation + if len(active_env_ids) == 0: + return self.get_obs() + + # Check step limit for active environments + step_limit_reached = self.step_count[active_env_ids] >= self.eval_cfg.max_steps + self.episode_done[active_env_ids[step_limit_reached]] = True + + # Update active mask after step limit check + active_mask = ~self.episode_done + active_env_ids = self._all_env_ids[active_mask].to(device) + + if len(active_env_ids) == 0: + return self.get_obs() + + # Extract gripper state from action (for updating gripper_state_list after action execution) + gripper_open_from_action = action.gripper_open + if isinstance(gripper_open_from_action, torch.Tensor): + if gripper_open_from_action.ndim == 0: + # Scalar: apply to all active environments + gripper_open_value = gripper_open_from_action.item() + gripper_open_tensor = torch.full((len(active_env_ids),), gripper_open_value, dtype=torch.bool, device=device) + else: + # Per-env tensor + if gripper_open_from_action.shape[0] == len(active_env_ids): + gripper_open_tensor = gripper_open_from_action.to(device).bool() + elif gripper_open_from_action.shape[0] == self.num_envs: + gripper_open_tensor = gripper_open_from_action[active_env_ids].to(device).bool() + else: + gripper_open_value = gripper_open_from_action[0].item() if gripper_open_from_action.shape[0] > 0 else True + gripper_open_tensor = torch.full((len(active_env_ids),), gripper_open_value, dtype=torch.bool, device=device) + else: + # Scalar (bool or float): apply to all active environments + gripper_open_value = bool(gripper_open_from_action) if not isinstance(gripper_open_from_action, bool) else gripper_open_from_action + gripper_open_tensor = torch.full((len(active_env_ids),), gripper_open_value, dtype=torch.bool, device=device) + + # Execute action based on type + if action.action_type == 'qpos': + # Joint position control + joint_pos_des = action.qpos.to(self.robot.device) + + # Ensure correct shape: [B, n_joints] + if joint_pos_des.ndim == 1: + joint_pos_des = joint_pos_des.unsqueeze(0).repeat(self.num_envs, 1) + + # Use scalar gripper_open for apply_actions (it handles per-env internally) + if isinstance(gripper_open_from_action, torch.Tensor) and gripper_open_from_action.ndim > 0: + gripper_open_scalar = gripper_open_from_action[0].item() if gripper_open_from_action.shape[0] > 0 else True + else: + gripper_open_scalar = gripper_open_from_action.item() if 
isinstance(gripper_open_from_action, torch.Tensor) else gripper_open_from_action + + self.apply_actions(joint_pos_des, gripper_open=gripper_open_scalar) + + elif action.action_type == 'ee_direct': + # End-effector control using motion planning + ee_pose = action.ee_pose.to(self.robot.device) + + # Ensure correct shape: [B, 7] + if ee_pose.ndim == 1: + ee_pose = ee_pose.unsqueeze(0).repeat(self.num_envs, 1) + + # Split into position and quaternion + position = ee_pose[:, :3] + quaternion = ee_pose[:, 3:7] # [B, 4] (wxyz) + + # Handle gripper + gripper_open = action.gripper_open + if isinstance(gripper_open, torch.Tensor): + if gripper_open.ndim == 0: + gripper_open = gripper_open.item() + else: + gripper_open = gripper_open[0].item() if gripper_open.shape[0] > 0 else True + + # Use motion planning to move to target pose + self.move_to_motion_planning( + position=position, + quaternion=quaternion, + gripper_open=gripper_open, + record=record + ) + self.wait(gripper_open=gripper_open, steps=3) + + elif action.action_type == 'ee_l': + # End-effector control using motion planning + ee_pose = action.ee_pose.to(self.robot.device) + + # Ensure correct shape: [B, 7] + if ee_pose.ndim == 1: + ee_pose = ee_pose.unsqueeze(0).repeat(self.num_envs, 1) + + # Split into position and quaternion + position = ee_pose[:, :3] + self.scene.env_origins.to(self.robot.device)[active_env_ids] + quaternion = ee_pose[:, 3:7] # [B, 4] (wxyz) + + # Handle gripper + gripper_open = action.gripper_open + if isinstance(gripper_open, torch.Tensor): + if gripper_open.ndim == 0: + gripper_open = gripper_open.item() + else: + gripper_open = gripper_open[0].item() if gripper_open.shape[0] > 0 else True + + robot_root_pose_w = self.robot.data.root_state_w[:, :7] + ee_pos_l, ee_quat_l = subtract_frame_transforms(robot_root_pose_w[:, :3], robot_root_pose_w[:, 3:7], position, quaternion) + # Use motion planning to move to target pose + # self.move_to_motion_planning( + # position=ee_pos_l, + # quaternion=ee_quat_l, + # gripper_open=gripper_open, + # record=record + # ) + self.move_to_ik( + position=ee_pos_l, + quaternion=ee_quat_l, + steps=3, + gripper_open=gripper_open, + record=record + ) + #self.wait(gripper_open=gripper_open, steps=3) + + elif action.action_type == 'ee_cam': + # End-effector control in camera frame - convert to world frame first + ee_pose_cam = action.ee_pose.to(self.robot.device) + + # Ensure correct shape: [B, 7] + if ee_pose_cam.ndim == 1: + ee_pose_cam = ee_pose_cam.unsqueeze(0).repeat(self.num_envs, 1) + + # Get camera pose in world frame + cam_pos_w = self.camera.data.pos_w # [B, 3] + cam_quat_w = self.camera.data.quat_w_ros # [B, 4] (wxyz) + + cam_pos_w -= self.scene.env_origins.to(self.robot.device)[active_env_ids] + # Extract camera frame pose + ee_pos_cam = ee_pose_cam[:, :3] # [B, 3] + ee_quat_cam = ee_pose_cam[:, 3:7] # [B, 4] (wxyz) + + # Convert from camera frame to world frame + # Use combine_frame_transforms to batch transform both position and quaternion + # t01, q01: camera pose in world frame (frame 1 w.r.t. frame 0) + # t12, q12: ee pose in camera frame (frame 2 w.r.t. frame 1) + # Returns: ee pose in world frame (frame 2 w.r.t. 
frame 0) + ee_pos_w, ee_quat_w = combine_frame_transforms( + t01=cam_pos_w, # [B, 3] camera position in world + q01=cam_quat_w, # [B, 4] camera quaternion in world (wxyz) + t12=ee_pos_cam, # [B, 3] ee position in camera + q12=ee_quat_cam # [B, 4] ee quaternion in camera (wxyz) + ) # Returns: [B, 3], [B, 4] + ee_pos_w += self.scene.env_origins.to(self.robot.device)[active_env_ids] + + # Handle gripper + gripper_open = action.gripper_open + if isinstance(gripper_open, torch.Tensor): + if gripper_open.ndim == 0: + gripper_open = gripper_open.item() + else: + gripper_open = gripper_open[0].item() if gripper_open.shape[0] > 0 else True + + robot_root_pose_w = self.robot.data.root_state_w[:, :7] + ee_pos_l, ee_quat_l = subtract_frame_transforms(robot_root_pose_w[:, :3], robot_root_pose_w[:, 3:7], ee_pos_w, ee_quat_w) + self.move_to_motion_planning( + position=ee_pos_l, + quaternion=ee_quat_l, + gripper_open=gripper_open, + record=record + ) + else: + + raise ValueError(f"Unknown action type: {action.action_type}") + + # Update gripper_state_list from action (after action execution) + self.gripper_state_list[active_env_ids] = gripper_open_tensor + + # Update step count for active environments (task step counter) + # Each take_action() call corresponds to one task step + self.step_count[active_env_ids] += 1 + + # Update success state for active environments + self.is_success(env_ids=active_env_ids) + + # Check termination for active environments + termination_mask = self.check_termination(env_ids=active_env_ids) + self.episode_done[active_env_ids] = termination_mask + + # Get updated observation + obs = self.get_obs() + + # Update policy observation if needed + if self.policy is not None: + self.policy.update_obs(obs) + + return obs + + def check_termination(self, env_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Check if episode should terminate for specified environments. + This is a placeholder - user will implement their own termination logic. + + Args: + env_ids: Optional tensor of environment IDs to check. If None, checks all environments. + + Returns: + Boolean tensor [M] indicating which environments should terminate + """ + device = self.robot.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) + + M = len(env_ids_t) + termination = torch.zeros(M, dtype=torch.bool, device=device) + + # Check step limit + step_limit_reached = self.step_count[env_ids_t] >= self.eval_cfg.max_steps + termination |= step_limit_reached + + termination |= self._is_success[env_ids_t] + + return termination + + def is_success(self, env_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Check success based on eval_cfg.success_keys. + Combines different success metrics based on configuration. + + Args: + env_ids: Optional tensor of environment IDs to check. If None, checks all environments. 
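The 'ee_cam' branch above chains two standard frame operations: compose the camera-frame target with the camera's world pose via `combine_frame_transforms`, then re-express the world-frame result in the robot base frame via `subtract_frame_transforms`. A minimal sketch of that round trip, assuming Isaac Lab is importable; the poses are random placeholders and quaternions are wxyz:

```python
import torch
from isaaclab.utils.math import combine_frame_transforms, subtract_frame_transforms

B = 2
identity_q = torch.tensor([[1.0, 0.0, 0.0, 0.0]]).repeat(B, 1)  # wxyz identity
cam_pos_w, cam_quat_w = torch.rand(B, 3), identity_q            # world_T_cam
ee_pos_cam, ee_quat_cam = torch.rand(B, 3), identity_q          # cam_T_ee (the action)

# world_T_ee = world_T_cam * cam_T_ee
ee_pos_w, ee_quat_w = combine_frame_transforms(cam_pos_w, cam_quat_w, ee_pos_cam, ee_quat_cam)

# base_T_ee = inv(world_T_base) * world_T_ee
root_pos_w, root_quat_w = torch.rand(B, 3), identity_q          # world_T_base
ee_pos_b, ee_quat_b = subtract_frame_transforms(root_pos_w, root_quat_w, ee_pos_w, ee_quat_w)

# With identity rotations this reduces to vector addition/subtraction:
assert torch.allclose(ee_pos_w, cam_pos_w + ee_pos_cam)
assert torch.allclose(ee_pos_b, ee_pos_w - root_pos_w)
```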
+ + Returns: + Boolean tensor [M] or [B] indicating which environments have achieved success + """ + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) + + M = len(env_ids_t) + + # If no success keys specified, return False for all + if not self.eval_cfg.success_keys: + self._is_success[env_ids_t] = False + return self._is_success[env_ids_t] + + # Initialize success mask (all True, will be ANDed with each check) + success_mask = torch.ones(M, dtype=torch.bool, device=device) + + # Check each success key + for key in self.eval_cfg.success_keys: + if key == "grasping": + success_mask = success_mask & self.is_grasping_success(env_ids=env_ids_t) + elif key == "strict": + success_mask = success_mask & self.is_strict_success(env_ids=env_ids_t) + elif key == "metric": + success_mask = success_mask & self.is_metric_success(env_ids=env_ids_t) + else: + print(f"[WARN] Unknown success key: {key}, ignoring") + + # Update internal success state + self._is_success[env_ids_t] = success_mask + + return success_mask + + def is_grasping_success(self, env_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) + + # If already achieved for these environments, return True + already_achieved = self.grasping_success_achieved[env_ids_t] + if torch.all(already_achieved): + return already_achieved + + # Check gripper is closed using gripper_state_list (True=open, False=closed) + is_gripper_closed = ~self.gripper_state_list[env_ids_t] # [M] + + # Get current and initial object heights + current_object_pos = self.object_prim.data.root_com_pos_w[:, :3] # [B, 3] + + current_object_height = current_object_pos[env_ids_t, 2] # [M] - z coordinate + initial_height = self.initial_object_height[env_ids_t] # [M] + + # Check if object is lifted above threshold + height_lifted = current_object_height - initial_height # [M] + is_object_lifted = height_lifted >= self.eval_cfg.lift_height_threshold + + # Grasping success: gripper closed AND object lifted + current_success = is_gripper_closed & is_object_lifted # [M] + + # Update tracking: once achieved, it stays True + self.grasping_success_achieved[env_ids_t] = self.grasping_success_achieved[env_ids_t] | current_success + + return self.grasping_success_achieved[env_ids_t] + + + def _check_gripper_state_match(self, env_ids_t: torch.Tensor) -> torch.Tensor: + """Check if gripper state matches final_gripper_close requirement for given environments.""" + device = env_ids_t.device + gripper_match_list = [] + + for env_idx in env_ids_t.cpu().numpy(): + success_metric = self.success_metric_list[env_idx] + + if success_metric is None: + gripper_match_list.append(torch.tensor(False, device=device)) + continue + + # Get current gripper state (True=open, False=closed) + current_gripper_open = self.gripper_state_list[env_idx] + + # final_gripper_close=True means should be closed (gripper_open=False) + # final_gripper_close=False means should be open (gripper_open=True) + required_gripper_open = not success_metric.final_gripper_close + gripper_match = (current_gripper_open == required_gripper_open) + gripper_match_list.append(torch.tensor(gripper_match, device=device)) + + return torch.stack(gripper_match_list) # [M] + + def _compute_pose_distance_to_end_pose(self, env_ids_t: 
torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute pose distance between current object pose and end_pose from SuccessMetric. + + Returns: + trans_dist: Translation distance [M] + angle_dist: Rotation angle distance [M] + valid_mask: Boolean mask indicating which envs have valid end_pose [M] + """ + device = env_ids_t.device + + # Get current object pose in world frame and convert to local frame + obj_w = self.object_prim.data.root_state_w[:, :7] # [B, 7] + origins = self.scene.env_origins.to(device) # [B, 3] + obj_local = obj_w.clone() + obj_local[:, :3] = obj_w[:, :3] - origins # [B, 7] in local frame + + trans_dist_list = [] + angle_list = [] + valid_mask_list = [] + + for env_idx in env_ids_t.cpu().numpy(): + success_metric = self.success_metric_list[env_idx] + + if success_metric is None or success_metric.end_pose is None: + trans_dist_list.append(torch.tensor(float('inf'), device=device)) + angle_list.append(torch.tensor(float('inf'), device=device)) + valid_mask_list.append(False) + continue + + valid_mask_list.append(True) + + # Get current and target poses + obj_pose_local = obj_local[env_idx] # [7] + end_pose = np.array(success_metric.end_pose, dtype=np.float32) # [7] + + # Convert to 4x4 transformation matrices + obj_mat = pose_to_mat(obj_pose_local[:3].cpu().numpy(), obj_pose_local[3:7].cpu().numpy()) + end_mat = pose_to_mat(end_pose[:3], end_pose[3:7]) + + # Compute pose distance + obj_mat_t = torch.tensor(obj_mat, dtype=torch.float32, device=device) + end_mat_t = torch.tensor(end_mat, dtype=torch.float32, device=device) + trans_dist, angle = pose_distance(obj_mat_t.unsqueeze(0), end_mat_t.unsqueeze(0)) + + trans_dist_list.append(trans_dist.squeeze(0)) + angle_list.append(angle.squeeze(0)) + + trans_dist = torch.stack(trans_dist_list) # [M] + angle_dist = torch.stack(angle_list) # [M] + valid_mask = torch.tensor(valid_mask_list, dtype=torch.bool, device=device) # [M] + + return trans_dist, angle_dist, valid_mask + + def is_strict_success(self, env_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Check if object pose is within threshold distance of the end_pose from SuccessMetric. + Compares current object pose with the target end_pose using pose distance (translation + rotation). + + Args: + env_ids: Optional tensor of environment IDs to check. If None, checks all environments. + + Returns: + Boolean tensor [M] or [B] indicating which environments have achieved strict success + """ + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) + + # Compute pose distances + trans_dist, angle_dist, valid_mask = self._compute_pose_distance_to_end_pose(env_ids_t) + + # Check pose distance thresholds + trans_success = trans_dist <= self.eval_cfg.pose_dist_threshold + angle_success = angle_dist <= self.eval_cfg.angle_dist_threshold + + # Check gripper state matches requirement + gripper_state_success = self._check_gripper_state_match(env_ids_t) + + # All conditions must be satisfied + is_success = trans_success & angle_success & gripper_state_success & valid_mask + + return is_success + + def is_metric_success(self, env_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Check success based on SuccessMetric type for each environment. 
+ Uses different criteria based on success_metric_type: + - SIMPLE_LIFT: Check lift_height threshold + - TARGET_POINT: Check end_pose distance (same as is_strict_success) + - TARGET_PLANE: Check ground_value (object height above ground) + + Args: + env_ids: Optional tensor of environment IDs to check. If None, checks all environments. + + Returns: + Boolean tensor [M] or [B] indicating which environments have achieved metric success + """ + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) + + success_list = [] + + for env_idx in env_ids_t.cpu().numpy(): + success_metric = self.success_metric_list[env_idx] + + if success_metric is None: + success_list.append(torch.tensor(False, device=device)) + continue + + # Check gripper state first (required for all metric types) + current_gripper_open = self.gripper_state_list[env_idx] + required_gripper_open = not success_metric.final_gripper_close + gripper_match = (current_gripper_open == required_gripper_open) + + if not gripper_match: + success_list.append(torch.tensor(False, device=device)) + continue + + # Check based on metric type + if success_metric.success_metric_type == SuccessMetricType.SIMPLE_LIFT: + # Check lift height + current_object_pos = self.object_prim.data.root_com_pos_w[env_idx, :3] # [3] + current_height = current_object_pos[2] + initial_height = self.initial_object_height[env_idx] + height_lifted = current_height - initial_height + + lift_threshold = success_metric.lift_height if success_metric.lift_height is not None else self.eval_cfg.lift_height_threshold + is_success = height_lifted >= lift_threshold + + elif success_metric.success_metric_type == SuccessMetricType.TARGET_POINT: + # Check end_pose distance (same as is_strict_success) + env_idx_t = torch.tensor([env_idx], device=device, dtype=torch.long) + trans_dist, angle_dist, valid_mask = self._compute_pose_distance_to_end_pose(env_idx_t) + + if not valid_mask[0] or success_metric.end_pose is None: + is_success = False + else: + trans_success = trans_dist[0] <= self.eval_cfg.pose_dist_threshold + angle_success = angle_dist[0] <= self.eval_cfg.angle_dist_threshold + is_success = trans_success & angle_success + + elif success_metric.success_metric_type == SuccessMetricType.TARGET_PLANE: + # Check ground value (object height above ground plane) + current_object_pos = self.object_prim.data.root_com_pos_w[env_idx, :3] # [3] + current_height = current_object_pos[2] + + if success_metric.ground_value is not None: + is_success = current_height >= success_metric.ground_value + else: + # Fallback: check if lifted above initial height + initial_height = self.initial_object_height[env_idx] + height_lifted = current_height - initial_height + is_success = height_lifted >= self.eval_cfg.lift_height_threshold + else: + # Unknown metric type + is_success = False + + success_list.append(torch.tensor(is_success, device=device)) + + return torch.stack(success_list) # [M] + + + def set_policy(self, policy: BasePolicy) -> None: + """Set the policy for evaluation.""" + self.policy = policy + + def evaluate_episode(self, reset_env: bool = True, env_ids: Optional[List[int]] = None) -> Dict[str, Any]: + """ + Run a complete evaluation episode. + + Args: + reset_env: Whether to reset the environment at the start + env_ids: Optional list of environment IDs to evaluate. If None, evaluates all. 
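`_compute_pose_distance_to_end_pose` above relies on the project helpers `pose_to_mat` and `pose_distance`, whose definitions are not part of this diff. For reference, a generic standalone version of the usual translation-norm plus geodesic-rotation-angle distance over homogeneous transforms:

```python
import torch

def pose_distance(T_a: torch.Tensor, T_b: torch.Tensor):
    """T_a, T_b: [N, 4, 4] homogeneous transforms -> (trans_dist [N], angle [N] in radians)."""
    trans_dist = torch.linalg.norm(T_a[:, :3, 3] - T_b[:, :3, 3], dim=-1)
    R_rel = T_a[:, :3, :3].transpose(-1, -2) @ T_b[:, :3, :3]
    cos_angle = (R_rel.diagonal(dim1=-2, dim2=-1).sum(-1) - 1.0) / 2.0
    angle = torch.acos(cos_angle.clamp(-1.0, 1.0))
    return trans_dist, angle

# An identical pose has zero distance under both metrics.
T = torch.eye(4).unsqueeze(0)
t_dist, a_dist = pose_distance(T, T)
assert t_dist.item() == 0.0 and a_dist.item() == 0.0
```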
+ + Returns: + Dictionary with episode results: + - 'success': bool tensor [B] or [M] + - 'steps': int tensor [B] or [M] + - 'episode_done': bool tensor [B] or [M] + """ + if reset_env: + self.reset(env_ids=env_ids) + + if self.policy is None: + raise ValueError("Policy not set. Call set_policy() first.") + + # Reset policy + self.policy.reset() + + # Note: physics_step_counter removed - step counting is now handled by base class + + # Convert env_ids to tensor if provided + device = self.robot.device + if env_ids is not None: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long) + else: + env_ids_t = None + + # Run episode until all specified environments are done + while True: + # Check if all environments are done + if env_ids_t is not None: + all_done = torch.all(self.episode_done[env_ids_t]) + else: + all_done = torch.all(self.episode_done) + + if all_done: + break + + obs = self.get_obs() + # Preprocess observation (e.g., resize images to policy's required resolution) + obs = self.policy.preprocess_observation(obs) + actions = self.policy.get_action(obs) + + # Handle both single Action and action sequence (List[Action]) + # This matches RoboTwin's pattern where policies return action chunks + if isinstance(actions, list): + # Action sequence: execute each action sequentially + # Note: take_action() already updates policy observation internally + for action in actions: + obs = self.take_action(action, record=self.record) + # Check if all environments are done after each action + if env_ids_t is not None: + all_done = torch.all(self.episode_done[env_ids_t]) + else: + all_done = torch.all(self.episode_done) + if all_done: + break + else: + # Single Action: execute directly + obs = self.take_action(actions, record=self.record) + + # Return results for specified environments + if env_ids_t is not None: + return { + 'success': self._is_success[env_ids_t].cpu(), + 'steps': self.step_count[env_ids_t].cpu(), + 'episode_done': self.episode_done[env_ids_t].cpu(), + } + else: + return { + 'success': self._is_success.cpu(), + 'steps': self.step_count.cpu(), + 'episode_done': self.episode_done.cpu(), + } + + # ---------- Helpers ---------- + def _to_base(self, pos_w: np.ndarray | torch.Tensor, quat_w: np.ndarray | torch.Tensor): + """World → robot base frame for all envs.""" + root = self.robot.data.root_state_w[:, 0:7] # [B,7] + p_w, q_w = self._ensure_batch_pose(pos_w, quat_w) + pb, qb = subtract_frame_transforms( + root[:, 0:3], root[:, 3:7], p_w, q_w + ) + return pb, qb # [B,3], [B,4] + + diff --git a/r2sVLA/envs/test/test_cam_world_transfer.py b/r2sVLA/envs/test/test_cam_world_transfer.py new file mode 100644 index 0000000..b5a37bd --- /dev/null +++ b/r2sVLA/envs/test/test_cam_world_transfer.py @@ -0,0 +1,764 @@ +""" +Direct Replay Policy: Load actions directly from HDF5 files and replay in simulation. + +This policy loads a sequence of actions from an HDF5 file and replays them step by step. +It's useful for testing the simulation environment with recorded demonstrations. 
+""" +from __future__ import annotations + +import h5py +import numpy as np +import torch +from pathlib import Path +from typing import Dict, Any, List, Optional +import json +import yaml +import sys +import argparse + +# Add parent directories to path for imports +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent.parent)) +sys.path.append(str(file_path.parent.parent.parent)) + +# Initialize Isaac Lab AppLauncher FIRST (before importing modules that need carb) +from isaaclab.app import AppLauncher +app_launcher = AppLauncher(headless=True, enable_cameras=True) +simulation_app = app_launcher.app + +# Now we can import modules that depend on carb +from envs.cfgs.policy_interface import BasePolicy, Action + + +class DirectReplayPolicy(BasePolicy): + """ + Policy that directly replays actions from HDF5 file or demo folder. + + Supports both 'qpos' (joint position) and 'ee' (end-effector) action types. + + For 'qpos' action type: + - Loads joint positions and gripper commands + - Expected HDF5 structure: + /joint_action/joint_pos: (episode_len, 7) - 7D joint positions + /joint_action/gripper_cmd: (episode_len, 2) - gripper commands [0.00 (close) to 0.04 (open)] + - Demo folder: joint_pos.npy, gripper_cmd.npy, or actions.npy + + For 'ee' action type: + - Loads end-effector poses (position + quaternion) and gripper commands + - Expected format: (episode_len, 8) - [pos(3), quat(4), gripper(1)] + - Demo folder: actions.npy (format: [pos[0:3], quat[3:7], gripper[7]]) + - HDF5: /ee_pose (if available) + """ + + # Franka gripper constants (same as in ACT training) + FRANKA_GRIPPER_OPEN = 0.04 + FRANKA_GRIPPER_CLOSE = 0.00 + + def __init__( + self, + observation_keys: List[str], + action_type: str, + image_resolution: List[int], + hdf5_path: Optional[str] = None, + demo_folder: Optional[str] = None, + task_folder: Optional[str] = None, + h5py_path: Optional[str] = None, + episode_idx: int = 0, + env_idx: int = 0, + device: str = "cuda:0", + ): + """ + Initialize DirectReplayPolicy. 
+
+        Args:
+            observation_keys: List of observation keys (not used, but required by BasePolicy)
+            action_type: One of 'qpos', 'ee_direct', 'ee_cam', 'ee_l' - type of actions to replay
+            image_resolution: Image resolution (not used, but required by BasePolicy)
+            hdf5_path: Path to HDF5 file containing actions, or path to directory with episode_*.hdf5 files
+            demo_folder: Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path
+            task_folder: Path to task folder containing config.json (optional, for loading task config)
+            h5py_path: Path to h5py directory (optional, alternative to task_folder)
+            episode_idx: Episode index if hdf5_path is a directory or demo_folder (default: 0; -1 selects the last one)
+            env_idx: Environment index for demo_folder (default: 0)
+            device: Device to run on
+        """
+        super().__init__(observation_keys, action_type, image_resolution)
+
+        if action_type not in ['qpos', 'ee_direct', 'ee_cam', 'ee_l']:
+            raise ValueError(
+                f"DirectReplayPolicy supports 'qpos', 'ee_direct', 'ee_cam', or 'ee_l' action_type, got '{action_type}'"
+            )
+
+        self.device = device
+        self.current_step = 0
+
+        # Load task config if provided
+        self.task_cfg = None
+        if task_folder is not None:
+            config_path = Path(task_folder) / "config.json"
+            if config_path.exists():
+                with open(config_path, 'r') as f:
+                    self.task_cfg = json.load(f)
+                print(f"Loaded task config from {config_path}")
+        elif h5py_path is not None:
+            # Try to find config.json in h5py_path or its parent directory
+            config_path = Path(h5py_path) / "config.json"
+            if not config_path.exists():
+                config_path = Path(h5py_path).parent / "config.json"
+            if config_path.exists():
+                with open(config_path, 'r') as f:
+                    self.task_cfg = json.load(f)
+                print(f"Loaded task config from {config_path}")
+
+        # Load actions - prioritize demo_folder over hdf5_path
+        if demo_folder is not None:
+            # Load from demo folder
+            demo_folder_obj = Path(demo_folder)
+            if not demo_folder_obj.exists():
+                raise FileNotFoundError(f"Demo folder not found: {demo_folder}")
+
+            # Find demo_XXX folders
+            demo_dirs = sorted(demo_folder_obj.glob("demo_*"))
+            if len(demo_dirs) == 0:
+                raise ValueError(f"No demo_* folders found in {demo_folder}")
+            if episode_idx >= len(demo_dirs) and episode_idx != -1:
+                raise ValueError(f"Episode index {episode_idx} out of range (found {len(demo_dirs)} demos)")
+
+            demo_dir = demo_dirs[episode_idx]  # -1 selects the last demo
+            # Find env_XXX folders
+            env_dirs = sorted(demo_dir.glob("env_*"))
+            if len(env_dirs) == 0:
+                raise ValueError(f"No env_* folders found in {demo_dir}")
+            if env_idx >= len(env_dirs) and env_idx != -1:
+                raise ValueError(f"Environment index {env_idx} out of range (found {len(env_dirs)} envs)")
+
+            env_dir = env_dirs[env_idx]
+            print(f"Loading from demo folder: {demo_dir.name}/{env_dir.name}")
+
+            # Load actions from demo folder
+            if self.action_type == 'qpos':
+                self.actions = self._load_qpos_actions_from_demo_folder(str(env_dir))
+            elif self.action_type == 'ee_cam':
+                self.actions = self._load_ee_cam_actions_from_demo_folder(str(env_dir))
+            elif self.action_type == 'ee_l':
+                self.actions = self._load_ee_l_actions_from_demo_folder(str(env_dir))
+            else:  # 'ee_direct'
+                self.actions = self._load_sparse_ee_actions_from_demo_folder(str(env_dir))
+            print(f"Loaded {len(self.actions)} actions from {env_dir}")
+
+        elif hdf5_path is not None:
+            # Load actions from HDF5 file
+            hdf5_path_obj = Path(hdf5_path)
+            if hdf5_path_obj.is_dir():
+                # Directory: look for episode_*.hdf5 files
+                episode_files = sorted(hdf5_path_obj.glob("episode_*.hdf5"))
+                if len(episode_files) == 0:
+                    raise ValueError(f"No episode_*.hdf5 files found in {hdf5_path}")
+                if episode_idx >= len(episode_files):
+                    raise ValueError(f"Episode index {episode_idx} out of range (found {len(episode_files)} episodes)")
+                hdf5_file_path = episode_files[episode_idx]
+                print(f"Loading episode {episode_idx} from {hdf5_file_path}")
+            else:
+                # Single file
+                hdf5_file_path = hdf5_path_obj
+                if not hdf5_file_path.exists():
+                    raise FileNotFoundError(f"HDF5 file not found: {hdf5_file_path}")
+
+            # Load actions from HDF5
+            if self.action_type == 'qpos':
+                self.actions = self._load_actions_from_hdf5(str(hdf5_file_path))
+            else:
+                self.actions = self._load_ee_actions_from_hdf5(str(hdf5_file_path))
+            print(f"Loaded {len(self.actions)} actions from {hdf5_file_path}")
+        else:
+            raise ValueError("Either 'hdf5_path' or 'demo_folder' must be provided")
+
+    def _load_gripper_cmd(self, env_dir_path: Path) -> np.ndarray:
+        """Load gripper_cmd.npy and convert it to a boolean open/close array."""
+        gripper_cmd_path = env_dir_path / "gripper_cmd.npy"
+        if not gripper_cmd_path.exists():
+            raise FileNotFoundError(f"gripper_cmd.npy not found in {env_dir_path}")
+
+        gripper_cmd = np.load(str(gripper_cmd_path))  # (episode_len, 2) or (episode_len,)
+
+        # Convert gripper_cmd to boolean (open/close)
+        # gripper_cmd[:, 0] is in [0.00 (close), 0.04 (open)]
+        # Threshold: > 0.02 means open
+        if gripper_cmd.ndim == 2:
+            gripper_raw = gripper_cmd[:, 0]  # Use first gripper finger
+        else:
+            gripper_raw = gripper_cmd  # Already 1D
+        gripper_open = (gripper_raw > 0.02)  # Boolean array
+
+        return gripper_open
+
+    def _load_qpos_actions_from_demo_folder(self, env_dir: str) -> List[Action]:
+        """
+        Load joint position actions from demo folder.
+        Loads from: joint_pos_des.npy + gripper_cmd.npy
+
+        Args:
+            env_dir: Path to env_XXX directory containing .npy files
+
+        Returns:
+            List of Action objects with action_type='qpos'
+        """
+        env_dir_path = Path(env_dir)
+
+        # Load joint_pos_des.npy
+        joint_pos_des_path = env_dir_path / "joint_pos_des.npy"
+        if not joint_pos_des_path.exists():
+            raise FileNotFoundError(f"joint_pos_des.npy not found in {env_dir}")
+
+        joint_pos = np.load(str(joint_pos_des_path))  # May be (episode_len, 7) or (episode_len * 7,)
+        joint_pos = joint_pos.reshape(-1, 7)  # Ensure (episode_len, 7)
+
+        # Load gripper command
+        gripper_open = self._load_gripper_cmd(env_dir_path)
+
+        # Ensure same length
+        min_len = min(len(joint_pos), len(gripper_open))
+        joint_pos = joint_pos[:min_len]
+        gripper_open = gripper_open[:min_len]
+
+        # Convert to list of Action objects
+        actions = []
+        for i in range(len(joint_pos)):
+            qpos_tensor = torch.from_numpy(joint_pos[i]).float().to(self.device)  # [7]
+            qpos_batch = qpos_tensor.unsqueeze(0)  # [1, 7]
+
+            action = Action(
+                action_type='qpos',
+                qpos=qpos_batch,
+                gripper_open=bool(gripper_open[i])
+            )
+            actions.append(action)
+
+        return actions
+
+    def _load_sparse_ee_actions_from_demo_folder(self, env_dir: str) -> List[Action]:
+        """
+        Load sparse end-effector actions from demo folder.
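`_load_gripper_cmd` above maps raw Franka finger commands onto a boolean open mask using the midpoint of the 0.00–0.04 travel. The same conversion as a condensed standalone sketch (file I/O dropped; shapes follow the loader):

```python
import numpy as np

def gripper_cmd_to_open(gripper_cmd: np.ndarray, threshold: float = 0.02) -> np.ndarray:
    """(T, 2) or (T,) raw finger commands -> (T,) boolean open mask."""
    raw = gripper_cmd[:, 0] if gripper_cmd.ndim == 2 else gripper_cmd
    return raw > threshold

cmd = np.array([[0.04, 0.04], [0.00, 0.00]])
print(gripper_cmd_to_open(cmd))  # [ True False]
```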
+ Loads from: actions.npy (format: [pos[0:3], quat[3:7], gripper[7]]) + gripper_cmd.npy + + Args: + env_dir: Path to env_XXX directory containing .npy files + + Returns: + List of Action objects with action_type='ee' or 'ee_direct' + """ + env_dir_path = Path(env_dir) + + # Load actions.npy (sparse waypoints) + actions_path = env_dir_path / "actions.npy" + if not actions_path.exists(): + raise FileNotFoundError(f"actions.npy not found in {env_dir}") + + actions = np.load(str(actions_path)) # (episode_len, 8) + + # Extract components + positions = actions[:, 0:3] # (episode_len, 3) + quaternions = actions[:, 3:7] # (episode_len, 4) + + # Load gripper command from gripper_cmd.npy (preferred over actions[:, 7]) + gripper_open = self._load_gripper_cmd(env_dir_path) + + # Ensure same length + min_len = min(len(positions), len(gripper_open)) + positions = positions[:min_len] + quaternions = quaternions[:min_len] + gripper_open = gripper_open[:min_len] + + # Convert to list of Action objects + actions_list = [] + for i in range(len(positions)): + # Normalize quaternion if needed + quat = quaternions[i] + quat_norm = np.linalg.norm(quat) + if abs(quat_norm - 1.0) > 0.1: + quat = quat / (quat_norm + 1e-8) + + # Create ee_pose tensor [1, 7] = [pos(3), quat(4)] in wxyz format + ee_pose = torch.zeros((1, 7), dtype=torch.float32, device=self.device) + ee_pose[0, 0:3] = torch.from_numpy(positions[i]).float() + ee_pose[0, 3:7] = torch.from_numpy(quat).float() + + action = Action( + action_type='ee' if self.action_type == 'ee' else 'ee_direct', + ee_pose=ee_pose, + gripper_open=bool(gripper_open[i]) + ) + actions_list.append(action) + + return actions_list + + def _load_ee_cam_actions_from_demo_folder(self, env_dir: str) -> List[Action]: + """ + Load dense end-effector actions in camera frame from demo folder. + Loads from: ee_pose_cam.npy + gripper_cmd.npy + + Args: + env_dir: Path to env_XXX directory containing .npy files + + Returns: + List of Action objects with action_type='ee_cam' + """ + env_dir_path = Path(env_dir) + + # Load ee_pose_cam.npy (camera frame end-effector pose) + ee_pose_cam_path = env_dir_path / "ee_pose_cam.npy" + if not ee_pose_cam_path.exists(): + raise FileNotFoundError( + f"ee_pose_cam.npy not found in {env_dir}. " + f"For 'ee_cam' action_type, ee_pose_cam.npy with format [pos(3), quat(4)] in camera frame is required." + ) + + ee_pose_cam = np.load(str(ee_pose_cam_path)) # (episode_len, 7) + + # Load gripper command + gripper_open = self._load_gripper_cmd(env_dir_path) + + # Ensure same length + min_len = min(len(ee_pose_cam), len(gripper_open)) + ee_pose_cam = ee_pose_cam[:min_len] + gripper_open = gripper_open[:min_len] + + # Convert to list of Action objects + actions_list = [] + for i in range(len(ee_pose_cam)): + # ee_pose_cam is [7] = [pos(3), quat(4)] in camera frame, wxyz format + ee_pose_cam_tensor = torch.from_numpy(ee_pose_cam[i]).float().to(self.device) # [7] + ee_pose_cam_batch = ee_pose_cam_tensor.unsqueeze(0) # [1, 7] + + action = Action( + action_type='ee_cam', + ee_pose=ee_pose_cam_batch, + gripper_open=bool(gripper_open[i]) + ) + actions_list.append(action) + + return actions_list + + def _load_ee_l_actions_from_demo_folder(self, env_dir: str) -> List[Action]: + """ + Load dense end-effector actions in local frame from demo folder. 
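The sparse-EE loader above renormalizes a quaternion only when its norm drifts more than 0.1 from unity, which tolerates small recording noise while catching genuinely unnormalized entries. The same guard as a standalone helper (`safe_normalize_quat` is an illustrative name):

```python
import numpy as np

def safe_normalize_quat(quat: np.ndarray, tol: float = 0.1) -> np.ndarray:
    """quat: (4,) wxyz. Renormalize only if the norm is off by more than tol."""
    norm = np.linalg.norm(quat)
    if abs(norm - 1.0) > tol:
        return quat / (norm + 1e-8)
    return quat

print(safe_normalize_quat(np.array([2.0, 0.0, 0.0, 0.0])))  # ≈ [1. 0. 0. 0.]
```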
+ Loads from: ee_pose_l.npy + gripper_cmd.npy + + Args: + env_dir: Path to env_XXX directory containing .npy files + + Returns: + List of Action objects with action_type='ee_l' + """ + env_dir_path = Path(env_dir) + + # Load ee_pose_l.npy (local frame end-effector pose) + ee_pose_l_path = env_dir_path / "ee_pose_l.npy" + if not ee_pose_l_path.exists(): + raise FileNotFoundError( + f"ee_pose_l.npy not found in {env_dir}. " + f"For 'ee_l' action_type, ee_pose_l.npy with format [pos(3), quat(4)] in local frame is required." + ) + + ee_pose_l = np.load(str(ee_pose_l_path)) # (episode_len, 7) + + # Load gripper command + gripper_open = self._load_gripper_cmd(env_dir_path) + + # Ensure same length + min_len = min(len(ee_pose_l), len(gripper_open)) + ee_pose_l = ee_pose_l[:min_len] + gripper_open = gripper_open[:min_len] + + # Convert to list of Action objects + actions_list = [] + for i in range(len(ee_pose_l)): + # ee_pose_l is [7] = [pos(3), quat(4)] in local frame, wxyz format + ee_pose_l_tensor = torch.from_numpy(ee_pose_l[i]).float().to(self.device) # [7] + ee_pose_l_batch = ee_pose_l_tensor.unsqueeze(0) # [1, 7] + + action = Action( + action_type='ee_l', + ee_pose=ee_pose_l_batch, + gripper_open=bool(gripper_open[i]) + ) + actions_list.append(action) + + return actions_list + + def _load_ee_actions_from_hdf5(self, hdf5_path: str) -> List[Action]: + """ + Load end-effector actions from HDF5 file. + + Args: + hdf5_path: Path to HDF5 file + + Returns: + List of Action objects with action_type='ee' + """ + with h5py.File(hdf5_path, 'r') as f: + # Check for ee_pose in various possible locations + ee_pose = None + gripper_cmd = None + + # Try different possible paths + if '/ee_pose' in f: + ee_pose = f['/ee_pose'][:] # (episode_len, 7) + elif '/observation/ee_pose' in f: + ee_pose = f['/observation/ee_pose'][:] # (episode_len, 7) + elif '/action' in f: + # Check if action is 8D (pos + quat + gripper) + action = f['/action'][:] # (episode_len, 8) + if action.shape[1] == 8: + ee_pose = action[:, :7] # (episode_len, 7) + gripper_raw = action[:, 7] # (episode_len,) + gripper_cmd = (gripper_raw > 0.5) # Boolean array + + if ee_pose is None: + raise ValueError( + f"HDF5 file must contain '/ee_pose', '/observation/ee_pose', or '/action' (8D) key for 'ee' action_type" + ) + + # Get gripper command if not already extracted + if gripper_cmd is None: + if '/joint_action/gripper_cmd' in f: + gripper_cmd_array = f['/joint_action/gripper_cmd'][:] # (episode_len, 2) + gripper_raw = gripper_cmd_array[:, 0] + gripper_cmd = (gripper_raw > 0.02) # Boolean array + else: + # Default: assume gripper is open + gripper_cmd = np.ones(len(ee_pose), dtype=bool) + + # Ensure same length + min_len = min(len(ee_pose), len(gripper_cmd)) + ee_pose = ee_pose[:min_len] + gripper_cmd = gripper_cmd[:min_len] + + # Convert to list of Action objects + actions = [] + for i in range(len(ee_pose)): + # ee_pose is [7] = [pos(3), quat(4)] in wxyz format + ee_pose_tensor = torch.from_numpy(ee_pose[i]).float().to(self.device) # [7] + ee_pose_batch = ee_pose_tensor.unsqueeze(0) # [1, 7] + gripper_open_bool = bool(gripper_cmd[i]) + + action = Action( + action_type='ee', + ee_pose=ee_pose_batch, + gripper_open=gripper_open_bool + ) + actions.append(action) + + return actions + + def _load_actions_from_hdf5(self, hdf5_path: str) -> List[Action]: + """ + Load actions from HDF5 file. 
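`_load_ee_actions_from_hdf5` above probes several dataset layouts in a fixed order before giving up. The fallback logic, condensed into a standalone sketch (the path and helper name are placeholders; the keys follow the loader):

```python
import h5py
import numpy as np

def load_ee_pose(path: str) -> np.ndarray:
    with h5py.File(path, "r") as f:
        # Preferred layouts: a dedicated EE-pose dataset.
        for key in ("/ee_pose", "/observation/ee_pose"):
            if key in f:
                return f[key][:]                  # (T, 7)
        # Fallback: an 8D action dataset [pos(3), quat(4), gripper(1)].
        if "/action" in f and f["/action"].shape[1] == 8:
            return f["/action"][:, :7]            # strip the gripper column
    raise ValueError("no EE pose dataset found")
```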
+ + Args: + hdf5_path: Path to HDF5 file + + Returns: + List of Action objects + """ + with h5py.File(hdf5_path, 'r') as f: + # Check if raw format (has /joint_action) or processed format (has /action) + if '/joint_action' in f: + # Raw format: load joint_pos and gripper_cmd + joint_pos = f['/joint_action/joint_pos'][:] # (episode_len, 7) + gripper_cmd = f['/joint_action/gripper_cmd'][:] # (episode_len, 2) + + # Convert gripper_cmd to boolean (open/close) + # gripper_cmd[:, 0] is in [0.00 (close), 0.04 (open)] + # Threshold: > 0.02 means open + gripper_raw = gripper_cmd[:, 0] # Use first gripper + gripper_open = (gripper_raw > 0.02) # Boolean array + + elif '/action' in f: + # Processed format: action is already 8D [7 joints, 1 gripper] + action = f['/action'][:] # (episode_len, 8) + joint_pos = action[:, :7] # (episode_len, 7) + gripper_normalized = action[:, 7] # (episode_len,) + # Convert normalized gripper [0, 1] to boolean + gripper_open = (gripper_normalized > 0.5) # Boolean array + + else: + raise ValueError(f"HDF5 file must contain either '/joint_action' or '/action' key") + + # Convert to list of Action objects + actions = [] + for i in range(len(joint_pos)): + qpos_tensor = torch.from_numpy(joint_pos[i]).float().to(self.device) # [7] + qpos_batch = qpos_tensor.unsqueeze(0) # [1, 7] + gripper_open_bool = bool(gripper_open[i]) + + action = Action( + action_type='qpos', + qpos=qpos_batch, + gripper_open=gripper_open_bool + ) + actions.append(action) + + return actions + + def get_action(self, observation: Dict[str, Any]) -> Action: + """ + Get next action from loaded sequence. + + Args: + observation: Observation dict (ignored, actions are pre-loaded) + + Returns: + Next Action in the sequence + """ + if self.current_step >= len(self.actions): + # End of episode: return last action + return self.actions[-1] + + action = self.actions[self.current_step] + self.current_step += 1 + return action + + def reset(self) -> None: + """Reset policy state (reset step counter).""" + self.current_step = 0 + print(f"DirectReplayPolicy reset: {len(self.actions)} actions ready") + + +def create_direct_replay_policy( + hdf5_path: Optional[str] = None, + demo_folder: Optional[str] = None, + task_folder: Optional[str] = None, + h5py_path: Optional[str] = None, + episode_idx: int = 0, + env_idx: int = 0, + observation_keys: List[str] = ['rgb'], + action_type: str = 'qpos', # 'qpos' or 'ee' + image_resolution: List[int] = [224, 224], + device: str = "cuda:0", +) -> DirectReplayPolicy: + """ + Factory function to create DirectReplayPolicy. 
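`get_action` above is a simple cursor over the pre-loaded list that holds the final action once the recording is exhausted, so the wrapper can keep stepping until its own termination check fires. The same behavior as a tiny standalone class (`ReplayCursor` is illustrative):

```python
from typing import Any, List

class ReplayCursor:
    def __init__(self, actions: List[Any]):
        self.actions = actions
        self.step = 0

    def next(self) -> Any:
        # Hold the last action once the recording is exhausted.
        idx = min(self.step, len(self.actions) - 1)
        self.step += 1
        return self.actions[idx]

cursor = ReplayCursor(["a0", "a1"])
print([cursor.next() for _ in range(4)])  # ['a0', 'a1', 'a1', 'a1']
```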
+ + Args: + hdf5_path: Path to HDF5 file or directory with episode_*.hdf5 files + demo_folder: Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path + task_folder: Path to task folder containing config.json (optional) + h5py_path: Path to h5py directory (optional, alternative to task_folder) + episode_idx: Episode index if hdf5_path is a directory or demo_folder (default: 0) + env_idx: Environment index for demo_folder (default: 0) + observation_keys: List of observation keys + action_type: 'qpos' or 'ee' - type of actions to replay + image_resolution: Image resolution + device: Device to run on + + Returns: + DirectReplayPolicy instance + """ + return DirectReplayPolicy( + observation_keys=observation_keys, + action_type=action_type, + image_resolution=image_resolution, + hdf5_path=hdf5_path, + demo_folder=demo_folder, + task_folder=task_folder, + h5py_path=h5py_path, + episode_idx=episode_idx, + env_idx=env_idx, + device=device, + ) + + +# Test-related imports (only needed for test functions) +# Note: AppLauncher is already initialized above +from envs.cfgs.eval_cfg import EvaluationConfig +from envs.cfgs.task_cfg import load_task_cfg +from envs.sim_wrapper_isaac import PolicyEvaluationWrapper +from envs.make_env_isaac import make_env + + +def clean_task_cfg(task_cfg): + """Clean task config paths.""" + def rename_path(path: str, or2s_dir: Path) -> str: + return path.replace("/app", str(or2s_dir)) + + o2s_dir = Path("/home/peiqiduan/OpenReal2Sim-dev") + task_cfg.background_cfg.background_rgb_path = rename_path( + task_cfg.background_cfg.background_rgb_path, o2s_dir + ) + task_cfg.background_cfg.background_mesh_path = rename_path( + task_cfg.background_cfg.background_mesh_path, o2s_dir + ) + task_cfg.background_cfg.background_usd_path = rename_path( + task_cfg.background_cfg.background_usd_path, o2s_dir + ) + for obj in task_cfg.objects: + obj.mesh_path = rename_path(obj.mesh_path, o2s_dir) + obj.usd_path = rename_path(obj.usd_path, o2s_dir) + return task_cfg + + +def test_direct_replay( + hdf5_path: Optional[str] = None, + demo_folder: Optional[str] = None, + task_folder: Optional[str] = None, + h5py_path: Optional[str] = None, + episode_idx: int = 0, + env_idx: int = 0, + task_json_path: str = "/app/tasks/demo_video/task.json", + num_envs: int = 1, + device: str = "cuda:0", + action_type: str = "qpos", +): + """ + Test DirectReplayPolicy with PolicyEvaluationWrapper. 
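A hypothetical invocation of the factory above, replaying joint-space actions from a demo folder; the path is a placeholder and the observation dict is ignored during replay:

```python
policy = create_direct_replay_policy(
    demo_folder="outputs/demo_video/demos",  # expects demo_*/env_* layout
    action_type="qpos",
    episode_idx=0,
    env_idx=0,
    device="cuda:0",
)
policy.reset()
action = policy.get_action(observation={})  # pre-loaded; observation is unused
```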
+ + Args: + hdf5_path: Path to HDF5 file or directory with episode_*.hdf5 files + demo_folder: Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path + task_folder: Path to task folder containing config.json (optional) + h5py_path: Path to h5py directory (optional, alternative to task_folder) + episode_idx: Episode index if hdf5_path is a directory or demo_folder (default: 0) + env_idx: Environment index for demo_folder (default: 0) + task_json_path: Path to task.json file + num_envs: Number of parallel environments + device: Device to run on + """ + # Load task config + task_cfg = load_task_cfg(task_json_path) + # task_cfg = clean_task_cfg(task_cfg) + + # Use default evaluation config + eval_cfg = EvaluationConfig() + if action_type == "ee" or action_type == "ee_direct": + eval_cfg.max_steps = 13 + else: + eval_cfg.max_steps = 1000 # Allow enough steps for replay + eval_cfg.record_video = True + eval_cfg.use_randomization = False + + # Create environment + env, _ = make_env( + task_cfg, + eval_cfg=eval_cfg, + num_envs=num_envs, + device=device, + bg_simplify=False + ) + sim, scene = env.sim, env.scene + my_sim = PolicyEvaluationWrapper(sim, scene, task_cfg, eval_cfg, num_envs) + + # Create direct replay policy + replay_policy = DirectReplayPolicy( + observation_keys=['rgb'], + action_type=action_type, + image_resolution=[224, 224], + hdf5_path=hdf5_path, + demo_folder=demo_folder, + task_folder=task_folder, + h5py_path=h5py_path, + episode_idx=episode_idx, + env_idx=env_idx, + device=device + ) + + # Set policy and evaluate + my_sim.set_policy(replay_policy) + my_sim.out_dir = Path("/app/r2sVLA/results") + + print(f"Starting direct replay test with {len(replay_policy.actions)} actions") + result = my_sim.evaluate_episode() + print(f"Replay result: {result}") + my_sim.save_data() + + env.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test DirectReplayPolicy") + parser.add_argument( + "--hdf5_path", + type=str, + default=None, + help="Path to HDF5 file or directory with episode_*.hdf5 files" + ) + parser.add_argument( + "--demo_folder", + type=str, + default=None, + help="Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path" + ) + parser.add_argument( + "--task_folder", + type=str, + default=None, + help="Path to task folder containing config.json (optional)" + ) + parser.add_argument( + "--h5py_path", + type=str, + default=None, + help="Path to h5py directory (optional, alternative to task_folder)" + ) + parser.add_argument( + "--episode_idx", + type=int, + default=-1, + help="Episode index if hdf5_path is a directory or demo_folder (default: 0)" + ) + parser.add_argument( + "--env_idx", + type=int, + default=0, + help="Environment index for demo_folder (default: 0)" + ) + parser.add_argument( + "--task_json_path", + type=str, + default="/app/tasks/demo_video/task.json", + help="Path to task.json file (default: /app/tasks/demo_video/task.json)" + ) + parser.add_argument( + "--num_envs", + type=int, + default=1, + help="Number of parallel environments (default: 1)" + ) + parser.add_argument( + "--device", + type=str, + default="cuda:0", + help="Device to run on (default: cuda:0)" + ) + parser.add_argument( + "--action_type", + type=str, + default="qpos", + choices=["qpos", "ee_direct", "ee_cam", "ee_l"], + help="Action type: 'qpos' for joint position control, 'ee' for end-effector control (default: qpos)" + ) + + args = parser.parse_args() + + # Validate that at least one of hdf5_path or demo_folder 
is provided + if args.hdf5_path is None and args.demo_folder is None: + parser.error("Either --hdf5_path or --demo_folder must be provided") + + try: + test_direct_replay( + hdf5_path=args.hdf5_path, + demo_folder=args.demo_folder, + task_folder=args.task_folder, + h5py_path=args.h5py_path, + episode_idx=args.episode_idx, + env_idx=args.env_idx, + task_json_path=args.task_json_path, + num_envs=args.num_envs, + device=args.device, + action_type=args.action_type + ) + except Exception as e: + print(f"[ERR] Error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + finally: + simulation_app.close() diff --git a/r2sVLA/envs/test/test_direct_replay.py b/r2sVLA/envs/test/test_direct_replay.py new file mode 100644 index 0000000..d228ab8 --- /dev/null +++ b/r2sVLA/envs/test/test_direct_replay.py @@ -0,0 +1,764 @@ +""" +Direct Replay Policy: Load actions directly from HDF5 files and replay in simulation. + +This policy loads a sequence of actions from an HDF5 file and replays them step by step. +It's useful for testing the simulation environment with recorded demonstrations. +""" +from __future__ import annotations + +import h5py +import numpy as np +import torch +from pathlib import Path +from typing import Dict, Any, List, Optional +import json +import yaml +import sys +import argparse + +# Add parent directories to path for imports +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent.parent)) +sys.path.append(str(file_path.parent.parent.parent)) + +# Initialize Isaac Lab AppLauncher FIRST (before importing modules that need carb) +from isaaclab.app import AppLauncher +app_launcher = AppLauncher(headless=True, enable_cameras=True) +simulation_app = app_launcher.app + +# Now we can import modules that depend on carb +from envs.cfgs.policy_interface import BasePolicy, Action + + +class DirectReplayPolicy(BasePolicy): + """ + Policy that directly replays actions from HDF5 file or demo folder. + + Supports both 'qpos' (joint position) and 'ee' (end-effector) action types. + + For 'qpos' action type: + - Loads joint positions and gripper commands + - Expected HDF5 structure: + /joint_action/joint_pos: (episode_len, 7) - 7D joint positions + /joint_action/gripper_cmd: (episode_len, 2) - gripper commands [0.00 (close) to 0.04 (open)] + - Demo folder: joint_pos.npy, gripper_cmd.npy, or actions.npy + + For 'ee' action type: + - Loads end-effector poses (position + quaternion) and gripper commands + - Expected format: (episode_len, 8) - [pos(3), quat(4), gripper(1)] + - Demo folder: actions.npy (format: [pos[0:3], quat[3:7], gripper[7]]) + - HDF5: /ee_pose (if available) + """ + + # Franka gripper constants (same as in ACT training) + FRANKA_GRIPPER_OPEN = 0.04 + FRANKA_GRIPPER_CLOSE = 0.00 + + def __init__( + self, + observation_keys: List[str], + action_type: str, + image_resolution: List[int], + hdf5_path: Optional[str] = None, + demo_folder: Optional[str] = None, + task_folder: Optional[str] = None, + h5py_path: Optional[str] = None, + episode_idx: int = 0, + env_idx: int = 0, + device: str = "cuda:0", + ): + """ + Initialize DirectReplayPolicy. 
+
+        Args:
+            observation_keys: List of observation keys (not used, but required by BasePolicy)
+            action_type: One of 'qpos', 'ee_direct', 'ee_cam', 'ee_l' - type of actions to replay
+            image_resolution: Image resolution (not used, but required by BasePolicy)
+            hdf5_path: Path to HDF5 file containing actions, or path to directory with episode_*.hdf5 files
+            demo_folder: Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path
+            task_folder: Path to task folder containing config.json (optional, for loading task config)
+            h5py_path: Path to h5py directory (optional, alternative to task_folder)
+            episode_idx: Episode index if hdf5_path is a directory or demo_folder (default: 0; -1 selects the last one)
+            env_idx: Environment index for demo_folder (default: 0)
+            device: Device to run on
+        """
+        super().__init__(observation_keys, action_type, image_resolution)
+
+        if action_type not in ['qpos', 'ee_direct', 'ee_cam', 'ee_l']:
+            raise ValueError(
+                f"DirectReplayPolicy supports 'qpos', 'ee_direct', 'ee_cam', or 'ee_l' action_type, got '{action_type}'"
+            )
+
+        self.device = device
+        self.current_step = 0
+
+        # Load task config if provided
+        self.task_cfg = None
+        if task_folder is not None:
+            config_path = Path(task_folder) / "config.json"
+            if config_path.exists():
+                with open(config_path, 'r') as f:
+                    self.task_cfg = json.load(f)
+                print(f"Loaded task config from {config_path}")
+        elif h5py_path is not None:
+            # Try to find config.json in h5py_path or its parent directory
+            config_path = Path(h5py_path) / "config.json"
+            if not config_path.exists():
+                config_path = Path(h5py_path).parent / "config.json"
+            if config_path.exists():
+                with open(config_path, 'r') as f:
+                    self.task_cfg = json.load(f)
+                print(f"Loaded task config from {config_path}")
+
+        # Load actions - prioritize demo_folder over hdf5_path
+        if demo_folder is not None:
+            # Load from demo folder
+            demo_folder_obj = Path(demo_folder)
+            if not demo_folder_obj.exists():
+                raise FileNotFoundError(f"Demo folder not found: {demo_folder}")
+
+            # Find demo_XXX folders
+            demo_dirs = sorted(demo_folder_obj.glob("demo_*"))
+            if len(demo_dirs) == 0:
+                raise ValueError(f"No demo_* folders found in {demo_folder}")
+            if episode_idx >= len(demo_dirs) and episode_idx != -1:
+                raise ValueError(f"Episode index {episode_idx} out of range (found {len(demo_dirs)} demos)")
+
+            demo_dir = demo_dirs[episode_idx]  # -1 selects the last demo
+            # Find env_XXX folders
+            env_dirs = sorted(demo_dir.glob("env_*"))
+            if len(env_dirs) == 0:
+                raise ValueError(f"No env_* folders found in {demo_dir}")
+            if env_idx >= len(env_dirs) and env_idx != -1:
+                raise ValueError(f"Environment index {env_idx} out of range (found {len(env_dirs)} envs)")
+
+            env_dir = env_dirs[env_idx]
+            print(f"Loading from demo folder: {demo_dir.name}/{env_dir.name}")
+
+            # Load actions from demo folder
+            if self.action_type == 'qpos':
+                self.actions = self._load_qpos_actions_from_demo_folder(str(env_dir))
+            elif self.action_type == 'ee_cam':
+                self.actions = self._load_ee_cam_actions_from_demo_folder(str(env_dir))
+            elif self.action_type == 'ee_l':
+                self.actions = self._load_ee_l_actions_from_demo_folder(str(env_dir))
+            else:  # 'ee_direct'
+                self.actions = self._load_sparse_ee_actions_from_demo_folder(str(env_dir))
+            print(f"Loaded {len(self.actions)} actions from {env_dir}")
+
+        elif hdf5_path is not None:
+            # Load actions from HDF5 file
+            hdf5_path_obj = Path(hdf5_path)
+            if hdf5_path_obj.is_dir():
+                # Directory: look for episode_*.hdf5 files
+                episode_files = sorted(hdf5_path_obj.glob("episode_*.hdf5"))
+                if len(episode_files) == 0:
+                    raise ValueError(f"No episode_*.hdf5 files found in {hdf5_path}")
+                if episode_idx >= len(episode_files):
+                    raise ValueError(f"Episode index {episode_idx} out of range (found {len(episode_files)} episodes)")
+                hdf5_file_path = episode_files[episode_idx]
+                print(f"Loading episode {episode_idx} from {hdf5_file_path}")
+            else:
+                # Single file
+                hdf5_file_path = hdf5_path_obj
+                if not hdf5_file_path.exists():
+                    raise FileNotFoundError(f"HDF5 file not found: {hdf5_file_path}")
+
+            # Load actions from HDF5
+            if self.action_type == 'qpos':
+                self.actions = self._load_actions_from_hdf5(str(hdf5_file_path))
+            else:
+                self.actions = self._load_ee_actions_from_hdf5(str(hdf5_file_path))
+            print(f"Loaded {len(self.actions)} actions from {hdf5_file_path}")
+        else:
+            raise ValueError("Either 'hdf5_path' or 'demo_folder' must be provided")
+
+    def _load_gripper_cmd(self, env_dir_path: Path) -> np.ndarray:
+        """Load gripper_cmd.npy and convert it to a boolean open/close array."""
+        gripper_cmd_path = env_dir_path / "gripper_cmd.npy"
+        if not gripper_cmd_path.exists():
+            raise FileNotFoundError(f"gripper_cmd.npy not found in {env_dir_path}")
+
+        gripper_cmd = np.load(str(gripper_cmd_path))  # (episode_len, 2) or (episode_len,)
+
+        # Convert gripper_cmd to boolean (open/close)
+        # gripper_cmd[:, 0] is in [0.00 (close), 0.04 (open)]
+        # Threshold: > 0.02 means open
+        if gripper_cmd.ndim == 2:
+            gripper_raw = gripper_cmd[:, 0]  # Use first gripper finger
+        else:
+            gripper_raw = gripper_cmd  # Already 1D
+        gripper_open = (gripper_raw > 0.02)  # Boolean array
+
+        return gripper_open
+
+    def _load_qpos_actions_from_demo_folder(self, env_dir: str) -> List[Action]:
+        """
+        Load joint position actions from demo folder.
+        Loads from: joint_pos_des.npy + gripper_cmd.npy
+
+        Args:
+            env_dir: Path to env_XXX directory containing .npy files
+
+        Returns:
+            List of Action objects with action_type='qpos'
+        """
+        env_dir_path = Path(env_dir)
+
+        # Load joint_pos_des.npy
+        joint_pos_des_path = env_dir_path / "joint_pos_des.npy"
+        if not joint_pos_des_path.exists():
+            raise FileNotFoundError(f"joint_pos_des.npy not found in {env_dir}")
+
+        joint_pos = np.load(str(joint_pos_des_path))  # May be (episode_len, 7) or (episode_len * 7,)
+        joint_pos = joint_pos.reshape(-1, 7)  # Ensure (episode_len, 7)
+
+        # Load gripper command
+        gripper_open = self._load_gripper_cmd(env_dir_path)
+
+        # Ensure same length
+        min_len = min(len(joint_pos), len(gripper_open))
+        joint_pos = joint_pos[:min_len]
+        gripper_open = gripper_open[:min_len]
+
+        # Convert to list of Action objects
+        actions = []
+        for i in range(len(joint_pos)):
+            qpos_tensor = torch.from_numpy(joint_pos[i]).float().to(self.device)  # [7]
+            qpos_batch = qpos_tensor.unsqueeze(0)  # [1, 7]
+
+            action = Action(
+                action_type='qpos',
+                qpos=qpos_batch,
+                gripper_open=bool(gripper_open[i])
+            )
+            actions.append(action)
+
+        return actions
+
+    def _load_sparse_ee_actions_from_demo_folder(self, env_dir: str) -> List[Action]:
+        """
+        Load sparse end-effector actions from demo folder.
+        Loads from: actions.npy (format: [pos[0:3], quat[3:7], gripper[7]]) + gripper_cmd.npy
+
+        Args:
+            env_dir: Path to env_XXX directory containing .npy files
+
+        Returns:
+            List of Action objects with action_type='ee' or 'ee_direct'
+        """
+        env_dir_path = Path(env_dir)
+
+        # Load actions.npy (sparse waypoints)
+        actions_path = env_dir_path / "actions.npy"
+        if not actions_path.exists():
+            raise FileNotFoundError(f"actions.npy not found in {env_dir}")
+
+        actions = np.load(str(actions_path))  # (episode_len, 8)
+
+        # Extract components
+        positions = actions[:, 0:3]  # (episode_len, 3)
+        quaternions = actions[:, 3:7]  # (episode_len, 4)
+
+        # Derive gripper open/close from the 8th action column (values < 0.5 are treated as open)
+        gripper_open = actions[:, 7] < 0.5
+
+        # Ensure same length
+        min_len = min(len(positions), len(gripper_open))
+        positions = positions[:min_len]
+        quaternions = quaternions[:min_len]
+        gripper_open = gripper_open[:min_len]
+
+        # Convert to list of Action objects
+        actions_list = []
+        for i in range(len(positions)):
+            # Normalize quaternion if needed
+            quat = quaternions[i]
+            quat_norm = np.linalg.norm(quat)
+            if abs(quat_norm - 1.0) > 0.1:
+                quat = quat / (quat_norm + 1e-8)
+
+            # Create ee_pose tensor [1, 7] = [pos(3), quat(4)] in wxyz format
+            ee_pose = torch.zeros((1, 7), dtype=torch.float32, device=self.device)
+            ee_pose[0, 0:3] = torch.from_numpy(positions[i]).float()
+            ee_pose[0, 3:7] = torch.from_numpy(quat).float()
+
+            action = Action(
+                action_type='ee' if self.action_type == 'ee' else 'ee_direct',  # always 'ee_direct' given the accepted action types
+                ee_pose=ee_pose,
+                gripper_open=bool(gripper_open[i])
+            )
+            actions_list.append(action)
+
+        return actions_list
+
+    def _load_ee_cam_actions_from_demo_folder(self, env_dir: str) -> List[Action]:
+        """
+        Load dense end-effector actions in camera frame from demo folder.
+        Loads from: ee_pose_cam.npy + gripper_cmd.npy
+
+        Args:
+            env_dir: Path to env_XXX directory containing .npy files
+
+        Returns:
+            List of Action objects with action_type='ee_cam'
+        """
+        env_dir_path = Path(env_dir)
+
+        # Load ee_pose_cam.npy (camera frame end-effector pose)
+        ee_pose_cam_path = env_dir_path / "ee_pose_cam.npy"
+        if not ee_pose_cam_path.exists():
+            raise FileNotFoundError(
+                f"ee_pose_cam.npy not found in {env_dir}. "
+                f"For 'ee_cam' action_type, ee_pose_cam.npy with format [pos(3), quat(4)] in camera frame is required."
+            )
+
+        ee_pose_cam = np.load(str(ee_pose_cam_path))  # (episode_len, 7)
+
+        # Load gripper command
+        gripper_open = self._load_gripper_cmd(env_dir_path)
+
+        # Ensure same length
+        min_len = min(len(ee_pose_cam), len(gripper_open))
+        ee_pose_cam = ee_pose_cam[:min_len]
+        gripper_open = gripper_open[:min_len]
+
+        # Convert to list of Action objects
+        actions_list = []
+        for i in range(len(ee_pose_cam)):
+            # ee_pose_cam is [7] = [pos(3), quat(4)] in camera frame, wxyz format
+            ee_pose_cam_tensor = torch.from_numpy(ee_pose_cam[i]).float().to(self.device)  # [7]
+            ee_pose_cam_batch = ee_pose_cam_tensor.unsqueeze(0)  # [1, 7]
+
+            action = Action(
+                action_type='ee_cam',
+                ee_pose=ee_pose_cam_batch,
+                gripper_open=bool(gripper_open[i])
+            )
+            actions_list.append(action)
+
+        return actions_list
+
+    def _load_ee_l_actions_from_demo_folder(self, env_dir: str) -> List[Action]:
+        """
+        Load dense end-effector actions in local frame from demo folder.
+ Loads from: ee_pose_l.npy + gripper_cmd.npy + + Args: + env_dir: Path to env_XXX directory containing .npy files + + Returns: + List of Action objects with action_type='ee_l' + """ + env_dir_path = Path(env_dir) + + # Load ee_pose_l.npy (local frame end-effector pose) + ee_pose_l_path = env_dir_path / "ee_pose_l.npy" + if not ee_pose_l_path.exists(): + raise FileNotFoundError( + f"ee_pose_l.npy not found in {env_dir}. " + f"For 'ee_l' action_type, ee_pose_l.npy with format [pos(3), quat(4)] in local frame is required." + ) + + ee_pose_l = np.load(str(ee_pose_l_path)) # (episode_len, 7) + + # Load gripper command + gripper_open = self._load_gripper_cmd(env_dir_path) + + # Ensure same length + min_len = min(len(ee_pose_l), len(gripper_open)) + ee_pose_l = ee_pose_l[:min_len] + gripper_open = gripper_open[:min_len] + + # Convert to list of Action objects + actions_list = [] + for i in range(len(ee_pose_l)): + # ee_pose_l is [7] = [pos(3), quat(4)] in local frame, wxyz format + ee_pose_l_tensor = torch.from_numpy(ee_pose_l[i]).float().to(self.device) # [7] + ee_pose_l_batch = ee_pose_l_tensor.unsqueeze(0) # [1, 7] + + action = Action( + action_type='ee_l', + ee_pose=ee_pose_l_batch, + gripper_open=bool(gripper_open[i]) + ) + actions_list.append(action) + + return actions_list + + def _load_ee_actions_from_hdf5(self, hdf5_path: str) -> List[Action]: + """ + Load end-effector actions from HDF5 file. + + Args: + hdf5_path: Path to HDF5 file + + Returns: + List of Action objects with action_type='ee' + """ + with h5py.File(hdf5_path, 'r') as f: + # Check for ee_pose in various possible locations + ee_pose = None + gripper_cmd = None + + # Try different possible paths + if '/ee_pose' in f: + ee_pose = f['/ee_pose'][:] # (episode_len, 7) + elif '/observation/ee_pose' in f: + ee_pose = f['/observation/ee_pose'][:] # (episode_len, 7) + elif '/action' in f: + # Check if action is 8D (pos + quat + gripper) + action = f['/action'][:] # (episode_len, 8) + if action.shape[1] == 8: + ee_pose = action[:, :7] # (episode_len, 7) + gripper_raw = action[:, 7] # (episode_len,) + gripper_cmd = (gripper_raw > 0.5) # Boolean array + + if ee_pose is None: + raise ValueError( + f"HDF5 file must contain '/ee_pose', '/observation/ee_pose', or '/action' (8D) key for 'ee' action_type" + ) + + # Get gripper command if not already extracted + if gripper_cmd is None: + if '/joint_action/gripper_cmd' in f: + gripper_cmd_array = f['/joint_action/gripper_cmd'][:] # (episode_len, 2) + gripper_raw = gripper_cmd_array[:, 0] + gripper_cmd = (gripper_raw > 0.02) # Boolean array + else: + # Default: assume gripper is open + gripper_cmd = np.ones(len(ee_pose), dtype=bool) + + # Ensure same length + min_len = min(len(ee_pose), len(gripper_cmd)) + ee_pose = ee_pose[:min_len] + gripper_cmd = gripper_cmd[:min_len] + + # Convert to list of Action objects + actions = [] + for i in range(len(ee_pose)): + # ee_pose is [7] = [pos(3), quat(4)] in wxyz format + ee_pose_tensor = torch.from_numpy(ee_pose[i]).float().to(self.device) # [7] + ee_pose_batch = ee_pose_tensor.unsqueeze(0) # [1, 7] + gripper_open_bool = bool(gripper_cmd[i]) + + action = Action( + action_type='ee', + ee_pose=ee_pose_batch, + gripper_open=gripper_open_bool + ) + actions.append(action) + + return actions + + def _load_actions_from_hdf5(self, hdf5_path: str) -> List[Action]: + """ + Load actions from HDF5 file. 
+ + Args: + hdf5_path: Path to HDF5 file + + Returns: + List of Action objects + """ + with h5py.File(hdf5_path, 'r') as f: + # Check if raw format (has /joint_action) or processed format (has /action) + if '/joint_action' in f: + # Raw format: load joint_pos and gripper_cmd + joint_pos = f['/joint_action/joint_pos'][:] # (episode_len, 7) + gripper_cmd = f['/joint_action/gripper_cmd'][:] # (episode_len, 2) + + # Convert gripper_cmd to boolean (open/close) + # gripper_cmd[:, 0] is in [0.00 (close), 0.04 (open)] + # Threshold: > 0.02 means open + gripper_raw = gripper_cmd[:, 0] # Use first gripper + gripper_open = (gripper_raw > 0.02) # Boolean array + + elif '/action' in f: + # Processed format: action is already 8D [7 joints, 1 gripper] + action = f['/action'][:] # (episode_len, 8) + joint_pos = action[:, :7] # (episode_len, 7) + gripper_normalized = action[:, 7] # (episode_len,) + # Convert normalized gripper [0, 1] to boolean + gripper_open = (gripper_normalized > 0.5) # Boolean array + + else: + raise ValueError(f"HDF5 file must contain either '/joint_action' or '/action' key") + + # Convert to list of Action objects + actions = [] + for i in range(len(joint_pos)): + qpos_tensor = torch.from_numpy(joint_pos[i]).float().to(self.device) # [7] + qpos_batch = qpos_tensor.unsqueeze(0) # [1, 7] + gripper_open_bool = bool(gripper_open[i]) + + action = Action( + action_type='qpos', + qpos=qpos_batch, + gripper_open=gripper_open_bool + ) + actions.append(action) + + return actions + + def get_action(self, observation: Dict[str, Any]) -> Action: + """ + Get next action from loaded sequence. + + Args: + observation: Observation dict (ignored, actions are pre-loaded) + + Returns: + Next Action in the sequence + """ + if self.current_step >= len(self.actions): + # End of episode: return last action + return self.actions[-1] + + action = self.actions[self.current_step] + self.current_step += 1 + return action + + def reset(self) -> None: + """Reset policy state (reset step counter).""" + self.current_step = 0 + print(f"DirectReplayPolicy reset: {len(self.actions)} actions ready") + + +def create_direct_replay_policy( + hdf5_path: Optional[str] = None, + demo_folder: Optional[str] = None, + task_folder: Optional[str] = None, + h5py_path: Optional[str] = None, + episode_idx: int = 0, + env_idx: int = 0, + observation_keys: List[str] = ['rgb'], + action_type: str = 'qpos', # 'qpos' or 'ee' + image_resolution: List[int] = [224, 224], + device: str = "cuda:0", +) -> DirectReplayPolicy: + """ + Factory function to create DirectReplayPolicy. 
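+
+    Example (paths and indices are illustrative):
+
+        policy = create_direct_replay_policy(
+            demo_folder="outputs/demo_video/demos",
+            episode_idx=0,
+            action_type="qpos",
+        )
+        policy.reset()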
+
+    Args:
+        hdf5_path: Path to HDF5 file or directory with episode_*.hdf5 files
+        demo_folder: Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path
+        task_folder: Path to task folder containing config.json (optional)
+        h5py_path: Path to h5py directory (optional, alternative to task_folder)
+        episode_idx: Episode index if hdf5_path is a directory or demo_folder (default: 0)
+        env_idx: Environment index for demo_folder (default: 0)
+        observation_keys: List of observation keys
+        action_type: Type of actions to replay ('qpos', 'ee', 'ee_direct', 'ee_cam', or 'ee_l')
+        image_resolution: Image resolution
+        device: Device to run on
+
+    Returns:
+        DirectReplayPolicy instance
+    """
+    return DirectReplayPolicy(
+        observation_keys=observation_keys,
+        action_type=action_type,
+        image_resolution=image_resolution,
+        hdf5_path=hdf5_path,
+        demo_folder=demo_folder,
+        task_folder=task_folder,
+        h5py_path=h5py_path,
+        episode_idx=episode_idx,
+        env_idx=env_idx,
+        device=device,
+    )
+
+
+# Test-related imports (only needed for test functions)
+# Note: AppLauncher is already initialized above
+from envs.cfgs.eval_cfg import EvaluationConfig
+from envs.cfgs.task_cfg import load_task_cfg
+from envs.sim_wrapper_isaac import PolicyEvaluationWrapper
+from envs.make_env_isaac import make_env
+
+
+def clean_task_cfg(task_cfg):
+    """Clean task config paths."""
+    def rename_path(path: str, o2s_dir: Path) -> str:
+        return path.replace("/app", str(o2s_dir))
+
+    o2s_dir = Path("/home/peiqiduan/OpenReal2Sim-dev")
+    task_cfg.background_cfg.background_rgb_path = rename_path(
+        task_cfg.background_cfg.background_rgb_path, o2s_dir
+    )
+    task_cfg.background_cfg.background_mesh_path = rename_path(
+        task_cfg.background_cfg.background_mesh_path, o2s_dir
+    )
+    task_cfg.background_cfg.background_usd_path = rename_path(
+        task_cfg.background_cfg.background_usd_path, o2s_dir
+    )
+    for obj in task_cfg.objects:
+        obj.mesh_path = rename_path(obj.mesh_path, o2s_dir)
+        obj.usd_path = rename_path(obj.usd_path, o2s_dir)
+    return task_cfg
+
+
+def test_direct_replay(
+    hdf5_path: Optional[str] = None,
+    demo_folder: Optional[str] = None,
+    task_folder: Optional[str] = None,
+    h5py_path: Optional[str] = None,
+    episode_idx: int = 0,
+    env_idx: int = 0,
+    task_json_path: str = "/app/tasks/demo_video/task.json",
+    num_envs: int = 1,
+    device: str = "cuda:0",
+    action_type: str = "qpos",
+):
+    """
+    Test DirectReplayPolicy with PolicyEvaluationWrapper.
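+
+    Example (paths are illustrative):
+
+        test_direct_replay(
+            demo_folder="outputs/demo_video/demos",
+            task_json_path="/app/tasks/demo_video/task.json",
+            action_type="ee_l",
+        )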
+
+    Args:
+        hdf5_path: Path to HDF5 file or directory with episode_*.hdf5 files
+        demo_folder: Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path
+        task_folder: Path to task folder containing config.json (optional)
+        h5py_path: Path to h5py directory (optional, alternative to task_folder)
+        episode_idx: Episode index if hdf5_path is a directory or demo_folder (default: 0)
+        env_idx: Environment index for demo_folder (default: 0)
+        task_json_path: Path to task.json file
+        num_envs: Number of parallel environments
+        device: Device to run on
+        action_type: Type of actions to replay ('qpos', 'ee', 'ee_direct', 'ee_cam', or 'ee_l')
+    """
+    # Load task config
+    task_cfg = load_task_cfg(task_json_path)
+    # task_cfg = clean_task_cfg(task_cfg)
+
+    # Use default evaluation config
+    eval_cfg = EvaluationConfig()
+    if action_type in ("ee", "ee_direct"):
+        eval_cfg.max_steps = 13  # sparse waypoints from actions.npy need only a few steps
+    else:
+        eval_cfg.max_steps = 1000  # Allow enough steps for dense replay
+    eval_cfg.record_video = True
+    eval_cfg.use_randomization = False
+
+    # Create environment
+    env, _ = make_env(
+        task_cfg,
+        eval_cfg=eval_cfg,
+        num_envs=num_envs,
+        device=device,
+        bg_simplify=False
+    )
+    sim, scene = env.sim, env.scene
+    my_sim = PolicyEvaluationWrapper(sim, scene, task_cfg, eval_cfg, num_envs)
+
+    # Create direct replay policy
+    replay_policy = DirectReplayPolicy(
+        observation_keys=['rgb'],
+        action_type=action_type,
+        image_resolution=[224, 224],
+        hdf5_path=hdf5_path,
+        demo_folder=demo_folder,
+        task_folder=task_folder,
+        h5py_path=h5py_path,
+        episode_idx=episode_idx,
+        env_idx=env_idx,
+        device=device
+    )
+
+    # Set policy and evaluate
+    my_sim.set_policy(replay_policy)
+    my_sim.out_dir = Path("/app/r2sVLA/results")
+
+    print(f"Starting direct replay test with {len(replay_policy.actions)} actions")
+    result = my_sim.evaluate_episode()
+    print(f"Replay result: {result}")
+    my_sim.save_data()
+
+    env.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Test DirectReplayPolicy")
+    parser.add_argument(
+        "--hdf5_path",
+        type=str,
+        default=None,
+        help="Path to HDF5 file or directory with episode_*.hdf5 files"
+    )
+    parser.add_argument(
+        "--demo_folder",
+        type=str,
+        default=None,
+        help="Path to demo folder (e.g., outputs/demo_video/demos) - alternative to hdf5_path"
+    )
+    parser.add_argument(
+        "--task_folder",
+        type=str,
+        default=None,
+        help="Path to task folder containing config.json (optional)"
+    )
+    parser.add_argument(
+        "--h5py_path",
+        type=str,
+        default=None,
+        help="Path to h5py directory (optional, alternative to task_folder)"
+    )
+    parser.add_argument(
+        "--episode_idx",
+        type=int,
+        default=-1,
+        help="Episode index if hdf5_path is a directory or demo_folder (default: -1)"
+    )
+    parser.add_argument(
+        "--env_idx",
+        type=int,
+        default=0,
+        help="Environment index for demo_folder (default: 0)"
+    )
+    parser.add_argument(
+        "--task_json_path",
+        type=str,
+        default="/app/tasks/demo_video/task.json",
+        help="Path to task.json file (default: /app/tasks/demo_video/task.json)"
+    )
+    parser.add_argument(
+        "--num_envs",
+        type=int,
+        default=1,
+        help="Number of parallel environments (default: 1)"
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda:0",
+        help="Device to run on (default: cuda:0)"
+    )
+    parser.add_argument(
+        "--action_type",
+        type=str,
+        default="qpos",
+        choices=["qpos", "ee_direct", "ee_cam", "ee_l"],
+        help="Action type: 'qpos' for joint position control; 'ee_direct', 'ee_cam', or 'ee_l' for end-effector control (default: qpos)"
+    )
+
+    args = parser.parse_args()
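+
+    # Example invocation (script and demo paths are illustrative):
+    #   python test_direct_replay.py \
+    #       --demo_folder outputs/demo_video/demos \
+    #       --task_json_path /app/tasks/demo_video/task.json \
+    #       --action_type ee_l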
+
+    # Validate that at least one of hdf5_path or demo_folder is provided
+    if args.hdf5_path is None and args.demo_folder is None:
+        parser.error("Either --hdf5_path or --demo_folder must be provided")
+
+    try:
+        test_direct_replay(
+            hdf5_path=args.hdf5_path,
+            demo_folder=args.demo_folder,
+            task_folder=args.task_folder,
+            h5py_path=args.h5py_path,
+            episode_idx=args.episode_idx,
+            env_idx=args.env_idx,
+            task_json_path=args.task_json_path,
+            num_envs=args.num_envs,
+            device=args.device,
+            action_type=args.action_type
+        )
+    except Exception as e:
+        print(f"[ERR] Error: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+    finally:
+        simulation_app.close()
diff --git a/r2sVLA/envs/test/test_sim_wrapper_isaac.py b/r2sVLA/envs/test/test_sim_wrapper_isaac.py
new file mode 100644
index 0000000..b6d4a44
--- /dev/null
+++ b/r2sVLA/envs/test/test_sim_wrapper_isaac.py
@@ -0,0 +1,222 @@
+"""
+Test script for PolicyEvaluationWrapper with RandomPolicy.
+
+This test script demonstrates how to use the PolicyEvaluationWrapper
+with a RandomPolicy for testing the simulation environment.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+import sys
+from typing import List, Optional, Dict, Any
+import torch
+
+# Initialize Isaac Lab AppLauncher FIRST, before any other Isaac Lab imports
+# This is required to set up the carb module and other Isaac Sim dependencies
+from isaaclab.app import AppLauncher
+
+# Launch the simulation app (headless, with cameras enabled)
+app_launcher = AppLauncher(headless=True, enable_cameras=True)
+simulation_app = app_launcher.app
+
+# Add parent directories to path
+file_path = Path(__file__).resolve()
+sys.path.append(str(file_path.parent.parent))
+sys.path.append(str(file_path.parent.parent.parent))
+
+# Now we can safely import modules that depend on Isaac Lab
+from envs.cfgs.eval_cfg import EvaluationConfig
+from envs.cfgs.policy_interface import BasePolicy, Action
+from envs.cfgs.task_cfg import load_task_cfg
+from envs.sim_wrapper_isaac import PolicyEvaluationWrapper
+from envs.make_env_isaac import make_env
+
+
+class RandomPolicy(BasePolicy):
+    """
+    Random policy that generates random actions.
+    Useful for testing and baseline comparisons.
+
+    For 'qpos' action_type: generates random joint positions within safe limits
+    For 'ee' action_type: generates random end-effector poses within workspace
+    """
+
+    def __init__(
+        self,
+        observation_keys: List[str],
+        action_type: str,
+        image_resolution: List[int],
+        num_joints: int = 7,  # Default for Franka (7 arm joints)
+        qpos_low: Optional[torch.Tensor] = None,
+        qpos_high: Optional[torch.Tensor] = None,
+        ee_pos_low: Optional[torch.Tensor] = None,
+        ee_pos_high: Optional[torch.Tensor] = None,
+        device: str = "cuda:0",
+    ):
+        """
+        Initialize random policy.
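+
+        Example (a minimal construction; defaults target a 7-DoF Franka):
+
+            policy = RandomPolicy(
+                observation_keys=['rgb'],
+                action_type='qpos',
+                image_resolution=[224, 224],
+            )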
+
+        Args:
+            observation_keys: List of observation keys (not used for random policy)
+            action_type: 'qpos' or 'ee'
+            image_resolution: Image resolution (not used for random policy)
+            num_joints: Number of joints (for qpos action_type)
+            qpos_low: Lower bounds for joint positions [num_joints]
+            qpos_high: Upper bounds for joint positions [num_joints]
+            ee_pos_low: Lower bounds for end-effector position [3] (defaults to a rough table-top workspace)
+            ee_pos_high: Upper bounds for end-effector position [3] (defaults to a rough table-top workspace)
+            device: Device to run on
+        """
+        super().__init__(observation_keys, action_type, image_resolution)
+        self.num_joints = num_joints
+        self.device = device
+
+        # Set default joint limits (Franka Panda safe ranges)
+        if qpos_low is None:
+            # Franka Panda joint limits (radians)
+            self.qpos_low = torch.tensor([
+                -2.8973, -1.7628, -2.8973, -3.0718, -2.8973, -0.0175, -2.8973
+            ], device=device, dtype=torch.float32)
+        else:
+            self.qpos_low = qpos_low.to(device) if torch.is_tensor(qpos_low) else torch.tensor(qpos_low, device=device, dtype=torch.float32)
+
+        if qpos_high is None:
+            self.qpos_high = torch.tensor([
+                2.8973, 1.7628, 2.8973, -0.0698, 2.8973, 3.7525, 2.8973
+            ], device=device, dtype=torch.float32)
+        else:
+            self.qpos_high = qpos_high.to(device) if torch.is_tensor(qpos_high) else torch.tensor(qpos_high, device=device, dtype=torch.float32)
+
+        # Set default end-effector workspace bounds (used by the 'ee' branch of
+        # get_action below, which previously read attributes that were never set).
+        # These defaults are an assumed table-top Franka workspace in the robot
+        # base frame; tune them for your setup.
+        if ee_pos_low is None:
+            self.ee_pos_low = torch.tensor([0.3, -0.3, 0.05], device=device, dtype=torch.float32)
+        else:
+            self.ee_pos_low = ee_pos_low.to(device) if torch.is_tensor(ee_pos_low) else torch.tensor(ee_pos_low, device=device, dtype=torch.float32)
+
+        if ee_pos_high is None:
+            self.ee_pos_high = torch.tensor([0.7, 0.3, 0.5], device=device, dtype=torch.float32)
+        else:
+            self.ee_pos_high = ee_pos_high.to(device) if torch.is_tensor(ee_pos_high) else torch.tensor(ee_pos_high, device=device, dtype=torch.float32)
+
+    def get_action(self, observation: Dict[str, Any]) -> Action:
+        """
+        Generate random action.
+
+        Args:
+            observation: Observation dict (ignored for random policy)
+
+        Returns:
+            Action with random values
+        """
+        # Random gripper state (50% chance open/closed)
+        gripper_open = torch.rand(1, device=self.device).item() > 0.5
+
+        if self.action_type == 'qpos':
+            # Generate random joint positions within limits
+            qpos = torch.rand(self.num_joints, device=self.device) * (self.qpos_high - self.qpos_low) + self.qpos_low
+            # Add batch dimension [1, num_joints]
+            qpos = qpos.unsqueeze(0)
+            return Action(action_type='qpos', qpos=qpos, gripper_open=gripper_open)
+
+        elif self.action_type == 'ee':
+            # Generate random end-effector position within workspace
+            ee_pos = torch.rand(3, device=self.device) * (self.ee_pos_high - self.ee_pos_low) + self.ee_pos_low
+
+            # Generate random quaternion (wxyz format)
+            # Normalizing a 4D Gaussian samples uniformly on the unit sphere
+            quat = torch.randn(4, device=self.device)
+            quat = quat / torch.norm(quat)  # Normalize to unit quaternion
+            # Ensure w is positive (standard convention)
+            if quat[0] < 0:
+                quat = -quat
+
+            # Combine position and quaternion [7]
+            ee_pose = torch.cat([ee_pos, quat])
+            # Add batch dimension [1, 7]
+            ee_pose = ee_pose.unsqueeze(0)
+
+            return Action(action_type='ee', ee_pose=ee_pose, gripper_open=gripper_open)
+
+        else:
+            raise ValueError(f"Invalid action_type: {self.action_type}")
+
+    def reset(self) -> None:
+        """Reset policy state (no state to reset for random policy)."""
+        pass
+
+
+def clean_task_cfg(task_cfg):
+    """Clean task config paths."""
+    def rename_path(path: str, o2s_dir: Path) -> str:
+        return path.replace("/app", str(o2s_dir))
+
+    o2s_dir = Path("/home/peiqiduan/OpenReal2Sim-dev")
+    task_cfg.background_cfg.background_rgb_path = rename_path(
+        task_cfg.background_cfg.background_rgb_path, o2s_dir
+    )
+    task_cfg.background_cfg.background_mesh_path = rename_path(
+        task_cfg.background_cfg.background_mesh_path, o2s_dir
+    )
+    task_cfg.background_cfg.background_usd_path = rename_path(
+        task_cfg.background_cfg.background_usd_path, o2s_dir
+    )
+    for obj in task_cfg.objects:
+        obj.mesh_path = rename_path(obj.mesh_path, o2s_dir)
+        obj.usd_path =
rename_path(obj.usd_path, o2s_dir) + return task_cfg + + +def test_sim_wrapper_isaac(): + """Test PolicyEvaluationWrapper with RandomPolicy.""" + # Load task config + task_cfg = load_task_cfg("/app/tasks/demo_video/task.json") + # task_cfg = clean_task_cfg(task_cfg) + + # Use default evaluation config + eval_cfg = EvaluationConfig() + eval_cfg.max_steps = 100 # Limit steps for testing + eval_cfg.record_video = True + + num_envs = 1 + + env, _ = make_env( + task_cfg, + eval_cfg=eval_cfg, + num_envs=num_envs, + device="cuda:0", + bg_simplify=False + ) + sim, scene = env.sim, env.scene + my_sim = PolicyEvaluationWrapper(sim, scene, task_cfg, eval_cfg, num_envs) + + # Create random policy + random_policy = RandomPolicy( + observation_keys=['rgb'], + action_type='qpos', # or 'ee' + image_resolution=[224, 224], + num_joints=7, # Franka has 7 arm joints + device="cuda:0" + ) + + # Set policy and evaluate + my_sim.set_policy(random_policy) + my_sim.out_dir = Path("/app/r2sVLA/results") + res = [] + for i in range(eval_cfg.num_trials): + result = my_sim.evaluate_episode() + print(f"Trial {i+1} result: {result}") + my_sim.save_data() + res.append(result) + + success_num = 0 + for r in res: + success_num += r['success'].sum().item() + success_rate = success_num / (eval_cfg.num_trials * num_envs) + print(f"Success rate: {success_rate}") + + env.close() + + +if __name__ == "__main__": + try: + test_sim_wrapper_isaac() + except Exception as e: + print(f"[ERR] Error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + finally: + simulation_app.close() + diff --git a/r2sVLA/results/demo_000.mp4 b/r2sVLA/results/demo_000.mp4 new file mode 100644 index 0000000..6edb14e Binary files /dev/null and b/r2sVLA/results/demo_000.mp4 differ diff --git a/r2sVLA/results/demo_001.mp4 b/r2sVLA/results/demo_001.mp4 new file mode 100644 index 0000000..11629f5 Binary files /dev/null and b/r2sVLA/results/demo_001.mp4 differ diff --git a/r2sVLA/results/demo_002.mp4 b/r2sVLA/results/demo_002.mp4 new file mode 100644 index 0000000..25f5ad8 Binary files /dev/null and b/r2sVLA/results/demo_002.mp4 differ diff --git a/r2sVLA/results/demo_003.mp4 b/r2sVLA/results/demo_003.mp4 new file mode 100644 index 0000000..604cc7e Binary files /dev/null and b/r2sVLA/results/demo_003.mp4 differ diff --git a/r2sVLA/results/demo_004.mp4 b/r2sVLA/results/demo_004.mp4 new file mode 100644 index 0000000..f82ee1b Binary files /dev/null and b/r2sVLA/results/demo_004.mp4 differ diff --git a/r2sVLA/results/demo_005.mp4 b/r2sVLA/results/demo_005.mp4 new file mode 100644 index 0000000..dc9f4ba Binary files /dev/null and b/r2sVLA/results/demo_005.mp4 differ diff --git a/r2sVLA/results/demo_006.mp4 b/r2sVLA/results/demo_006.mp4 new file mode 100644 index 0000000..3b6f521 Binary files /dev/null and b/r2sVLA/results/demo_006.mp4 differ diff --git a/r2sVLA/results/demo_007.mp4 b/r2sVLA/results/demo_007.mp4 new file mode 100644 index 0000000..675cf71 Binary files /dev/null and b/r2sVLA/results/demo_007.mp4 differ diff --git a/r2sVLA/results/demo_008.mp4 b/r2sVLA/results/demo_008.mp4 new file mode 100644 index 0000000..2959fe8 Binary files /dev/null and b/r2sVLA/results/demo_008.mp4 differ diff --git a/r2sVLA/results/demo_009.mp4 b/r2sVLA/results/demo_009.mp4 new file mode 100644 index 0000000..71cf016 Binary files /dev/null and b/r2sVLA/results/demo_009.mp4 differ diff --git a/r2sVLA/results/demo_010.mp4 b/r2sVLA/results/demo_010.mp4 new file mode 100644 index 0000000..c821055 Binary files /dev/null and b/r2sVLA/results/demo_010.mp4 differ diff 
--git a/r2sVLA/results/demo_011.mp4 b/r2sVLA/results/demo_011.mp4 new file mode 100644 index 0000000..8206cba Binary files /dev/null and b/r2sVLA/results/demo_011.mp4 differ diff --git a/r2sVLA/results/demo_012.mp4 b/r2sVLA/results/demo_012.mp4 new file mode 100644 index 0000000..a421d08 Binary files /dev/null and b/r2sVLA/results/demo_012.mp4 differ diff --git a/r2sVLA/results/demo_013.mp4 b/r2sVLA/results/demo_013.mp4 new file mode 100644 index 0000000..fda1ed9 Binary files /dev/null and b/r2sVLA/results/demo_013.mp4 differ diff --git a/r2sVLA/results/demo_014.mp4 b/r2sVLA/results/demo_014.mp4 new file mode 100644 index 0000000..6a3b3e3 Binary files /dev/null and b/r2sVLA/results/demo_014.mp4 differ diff --git a/r2sVLA/results/demo_015.mp4 b/r2sVLA/results/demo_015.mp4 new file mode 100644 index 0000000..babb9aa Binary files /dev/null and b/r2sVLA/results/demo_015.mp4 differ diff --git a/r2sVLA/results/demo_016.mp4 b/r2sVLA/results/demo_016.mp4 new file mode 100644 index 0000000..ed2b482 Binary files /dev/null and b/r2sVLA/results/demo_016.mp4 differ diff --git a/r2sVLA/results/demo_017.mp4 b/r2sVLA/results/demo_017.mp4 new file mode 100644 index 0000000..8cc50fc Binary files /dev/null and b/r2sVLA/results/demo_017.mp4 differ diff --git a/r2sVLA/results/demo_018.mp4 b/r2sVLA/results/demo_018.mp4 new file mode 100644 index 0000000..a3c14d3 Binary files /dev/null and b/r2sVLA/results/demo_018.mp4 differ diff --git a/r2sVLA/results/demo_019.mp4 b/r2sVLA/results/demo_019.mp4 new file mode 100644 index 0000000..4127424 Binary files /dev/null and b/r2sVLA/results/demo_019.mp4 differ diff --git a/r2sVLA/results/demo_020.mp4 b/r2sVLA/results/demo_020.mp4 new file mode 100644 index 0000000..e4d5a7a Binary files /dev/null and b/r2sVLA/results/demo_020.mp4 differ diff --git a/r2sVLA/results/demo_021.mp4 b/r2sVLA/results/demo_021.mp4 new file mode 100644 index 0000000..84b8c18 Binary files /dev/null and b/r2sVLA/results/demo_021.mp4 differ diff --git a/r2sVLA/results/demo_022.mp4 b/r2sVLA/results/demo_022.mp4 new file mode 100644 index 0000000..26d2051 Binary files /dev/null and b/r2sVLA/results/demo_022.mp4 differ diff --git a/r2sVLA/results/demo_023.mp4 b/r2sVLA/results/demo_023.mp4 new file mode 100644 index 0000000..9f11f33 Binary files /dev/null and b/r2sVLA/results/demo_023.mp4 differ diff --git a/r2sVLA/results/demo_024.mp4 b/r2sVLA/results/demo_024.mp4 new file mode 100644 index 0000000..e7afa41 Binary files /dev/null and b/r2sVLA/results/demo_024.mp4 differ diff --git a/r2sVLA/results/demo_025.mp4 b/r2sVLA/results/demo_025.mp4 new file mode 100644 index 0000000..607df99 Binary files /dev/null and b/r2sVLA/results/demo_025.mp4 differ diff --git a/r2sVLA/results/demo_026.mp4 b/r2sVLA/results/demo_026.mp4 new file mode 100644 index 0000000..bb0b9a8 Binary files /dev/null and b/r2sVLA/results/demo_026.mp4 differ diff --git a/r2sVLA/results/demo_027.mp4 b/r2sVLA/results/demo_027.mp4 new file mode 100644 index 0000000..f282531 Binary files /dev/null and b/r2sVLA/results/demo_027.mp4 differ diff --git a/r2sVLA/results/demo_028.mp4 b/r2sVLA/results/demo_028.mp4 new file mode 100644 index 0000000..7a5b36b Binary files /dev/null and b/r2sVLA/results/demo_028.mp4 differ diff --git a/r2sVLA/results/demo_029.mp4 b/r2sVLA/results/demo_029.mp4 new file mode 100644 index 0000000..67d3ef1 Binary files /dev/null and b/r2sVLA/results/demo_029.mp4 differ diff --git a/r2sVLA/results/demo_030.mp4 b/r2sVLA/results/demo_030.mp4 new file mode 100644 index 0000000..d52c25b Binary files /dev/null and 
b/r2sVLA/results/demo_030.mp4 differ diff --git a/r2sVLA/results/demo_031.mp4 b/r2sVLA/results/demo_031.mp4 new file mode 100644 index 0000000..7732009 Binary files /dev/null and b/r2sVLA/results/demo_031.mp4 differ diff --git a/r2sVLA/results/demo_032.mp4 b/r2sVLA/results/demo_032.mp4 new file mode 100644 index 0000000..5974f3d Binary files /dev/null and b/r2sVLA/results/demo_032.mp4 differ diff --git a/r2sVLA/results/demo_033.mp4 b/r2sVLA/results/demo_033.mp4 new file mode 100644 index 0000000..621857e Binary files /dev/null and b/r2sVLA/results/demo_033.mp4 differ diff --git a/r2sVLA/scripts/eval_policy.py b/r2sVLA/scripts/eval_policy.py new file mode 100644 index 0000000..6aa9a41 --- /dev/null +++ b/r2sVLA/scripts/eval_policy.py @@ -0,0 +1,286 @@ +""" +Simplified policy evaluation script for r2sVLA. +Evaluates ACT policy in Isaac Lab simulation environment. +""" +from __future__ import annotations + +import os +import sys +import argparse +import yaml +from pathlib import Path +from typing import Dict, Any, Optional +import torch +import numpy as np +from datetime import datetime + +# Add paths +sys.path.append(str(Path(__file__).parent.parent)) + +from envs.make_env_isaac import make_env +from envs.sim_wrapper_isaac import PolicyEvaluationWrapper +from envs.cfgs.eval_cfg import EvaluationConfig +from envs.cfgs.task_cfg import TaskCfg, load_task_cfg +from algos.act.act_policy_wrapper import ACTPolicyWrapper + + +def load_policy_config(config_path: str) -> Dict[str, Any]: + """Load policy configuration from YAML file.""" + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + return config + + +def create_act_policy( + policy_config: Dict[str, Any], + ckpt_path: str, + observation_keys: list, + action_type: str, + image_resolution: list, +) -> ACTPolicyWrapper: + """Create and load ACT policy.""" + + # Extract policy-specific config + policy_class = policy_config.get('policy_class', 'ACT') + temporal_agg = policy_config.get('temporal_agg', False) + temporal_agg_num_queries = policy_config.get('temporal_agg_num_queries', 16) + + # Build policy config dict for ACT + act_policy_config = { + 'lr': policy_config.get('lr', 1e-4), + 'num_queries': policy_config.get('chunk_size', 100), + 'kl_weight': policy_config.get('kl_weight', 10.0), + 'hidden_dim': policy_config.get('hidden_dim', 512), + 'dim_feedforward': policy_config.get('dim_feedforward', 3200), + 'lr_backbone': policy_config.get('lr_backbone', 1e-5), + 'backbone': policy_config.get('backbone', 'resnet18'), + 'enc_layers': policy_config.get('enc_layers', 4), + 'dec_layers': policy_config.get('dec_layers', 7), + 'nheads': policy_config.get('nheads', 8), + 'camera_names': policy_config.get('camera_names', ['cam_high']), + 'task_name': policy_config.get('task_name', 'default'), + 'state_dim': policy_config.get('state_dim', 8), # 7 joints + 1 gripper for Franka + } + + # Create policy wrapper + policy = ACTPolicyWrapper( + observation_keys=observation_keys, + action_type=action_type, + image_resolution=image_resolution, + policy_class=policy_class, + ckpt_path=ckpt_path, + policy_config=act_policy_config, + temporal_agg=temporal_agg, + temporal_agg_num_queries=temporal_agg_num_queries, + ) + + return policy + + +def eval_policy( + env: PolicyEvaluationWrapper, + policy: ACTPolicyWrapper, + num_episodes: int = 100, + seed: int = 0, +) -> Dict[str, Any]: + """ + Evaluate policy for multiple episodes. 
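+
+    Example (assumes `env` and `policy` are already constructed):
+
+        results = eval_policy(env, policy, num_episodes=10, seed=0)
+        print(f"{results['success_rate']:.1f}%")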
+ + Args: + env: PolicyEvaluationWrapper environment + policy: ACT policy wrapper + num_episodes: Number of episodes to evaluate + seed: Random seed + + Returns: + Dictionary with evaluation results + """ + np.random.seed(seed) + torch.manual_seed(seed) + + # Set policy in environment + env.set_policy(policy) + + success_count = 0 + total_steps = 0 + episode_results = [] + + print(f"\n{'='*60}") + print(f"Starting evaluation: {num_episodes} episodes") + print(f"{'='*60}\n") + + for episode_idx in range(num_episodes): + # Reset environment + env.reset(env_ids=[0]) # Single environment evaluation + + # Run episode + result = env.evaluate_episode(reset_env=False, env_ids=[0]) + + success = result['success'][0].item() + steps = result['steps'][0].item() + + success_count += int(success) + total_steps += steps + + episode_results.append({ + 'episode': episode_idx, + 'success': success, + 'steps': steps, + }) + + # Print progress + success_rate = success_count / (episode_idx + 1) * 100 + print( + f"Episode {episode_idx+1}/{num_episodes} | " + f"Success: {'✓' if success else '✗'} | " + f"Steps: {steps} | " + f"Success Rate: {success_rate:.1f}%" + ) + + # Summary + final_success_rate = success_count / num_episodes * 100 + avg_steps = total_steps / num_episodes + + print(f"\n{'='*60}") + print(f"Evaluation Summary") + print(f"{'='*60}") + print(f"Total Episodes: {num_episodes}") + print(f"Success Count: {success_count}") + print(f"Success Rate: {final_success_rate:.1f}%") + print(f"Average Steps: {avg_steps:.1f}") + print(f"{'='*60}\n") + + return { + 'num_episodes': num_episodes, + 'success_count': success_count, + 'success_rate': final_success_rate, + 'avg_steps': avg_steps, + 'episode_results': episode_results, + } + + +def main(): + parser = argparse.ArgumentParser(description='Evaluate ACT policy') + parser.add_argument('--config', type=str, default=None, + help='Path to evaluation config YAML file (optional if ckpt_dir has eval_config.yaml)') + parser.add_argument('--ckpt_path', type=str, required=True, + help='Path to policy checkpoint file') + parser.add_argument('--ckpt_dir', type=str, default=None, + help='Path to checkpoint directory (for auto-loading config, optional if --config provided)') + parser.add_argument('--num_episodes', type=int, default=100, + help='Number of episodes to evaluate') + parser.add_argument('--seed', type=int, default=0, + help='Random seed') + parser.add_argument('--save_dir', type=str, default=None, + help='Directory to save results') + + args = parser.parse_args() + + # Auto-detect config from checkpoint directory if not provided + if args.config is None: + if args.ckpt_dir is None: + # Try to infer ckpt_dir from ckpt_path + ckpt_path = Path(args.ckpt_path) + if ckpt_path.is_file(): + args.ckpt_dir = ckpt_path.parent + else: + raise ValueError("Either --config or --ckpt_dir must be provided, or ckpt_path must be in a directory with eval_config.yaml") + + # Look for eval_config.yaml in checkpoint directory + eval_config_path = Path(args.ckpt_dir) / 'eval_config.yaml' + if eval_config_path.exists(): + print(f"Auto-loading config from {eval_config_path}") + args.config = str(eval_config_path) + else: + raise ValueError(f"Config file not found. 
Please provide --config or ensure {eval_config_path} exists")
+
+    # Load config
+    config = load_policy_config(args.config)
+
+    # Extract configuration
+    task_cfg_path = config.get('task_cfg_path', None)
+    task_cfg_dict = config.get('task_cfg', {})
+    eval_cfg_dict = config.get('eval_cfg', {})
+    policy_cfg_dict = config.get('policy_cfg', {})
+
+    # Create TaskCfg - load from JSON file if path provided, otherwise use dict
+    if task_cfg_path:
+        task_cfg = load_task_cfg(Path(task_cfg_path))
+    elif task_cfg_dict:
+        # For now, require task_cfg_path. Can add from_dict later if needed
+        raise ValueError("task_cfg_path is required in config. Please provide path to task.json file.")
+    else:
+        raise ValueError(
+            "task_cfg_path is required in config. "
+            "If using auto-loaded config from checkpoint, you still need to provide task_cfg_path. "
+            "You can either:\n"
+            "  1. Create a full config file with task_cfg_path and eval_cfg\n"
+            "  2. Or use --config with a complete config file that includes task_cfg_path"
+        )
+
+    # Create EvaluationConfig (use defaults if not provided)
+    if eval_cfg_dict:
+        eval_cfg = EvaluationConfig.from_dict(eval_cfg_dict)
+    else:
+        # Use default evaluation config if not provided
+        print("Warning: eval_cfg not found in config, using defaults")
+        eval_cfg = EvaluationConfig()
+
+    # Set up save directory
+    if args.save_dir is None:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        args.save_dir = f"eval_results/{task_cfg.task_name}/{timestamp}"
+
+    save_dir = Path(args.save_dir)
+    save_dir.mkdir(parents=True, exist_ok=True)
+    eval_cfg.video_save_dir = save_dir if eval_cfg.record_video else None
+
+    # Create environment
+    print("Creating environment...")
+    num_envs = 1  # Single environment for evaluation
+    # make_env returns (env, extras) elsewhere in this repo, so unpack it the
+    # same way here; the device matches the tests' default.
+    base_env, _ = make_env(
+        task_cfg,
+        eval_cfg=eval_cfg,
+        num_envs=num_envs,
+        device="cuda:0",
+    )
+    sim, scene = base_env.sim, base_env.scene
+    env = PolicyEvaluationWrapper(
+        sim=sim,
+        scene=scene,
+        task_cfg=task_cfg,
+        eval_cfg=eval_cfg,
+        num_envs=num_envs,
+    )
+    print("Environment created successfully")
+
+    # Create policy
+    print("Loading policy...")
+    observation_keys = policy_cfg_dict.get('observation_keys', ['rgb'])
+    action_type = policy_cfg_dict.get('action_type', 'qpos')
+    image_resolution = policy_cfg_dict.get('image_resolution', [224, 224])
+
+    policy = create_act_policy(
+        policy_config=policy_cfg_dict,
+        ckpt_path=args.ckpt_path,
+        observation_keys=observation_keys,
+        action_type=action_type,
+        image_resolution=image_resolution,
+    )
+    print("Policy loaded successfully")
+
+    # Evaluate
+    results = eval_policy(
+        env=env,
+        policy=policy,
+        num_episodes=args.num_episodes,
+        seed=args.seed,
+    )
+
+    # Save results
+    results_path = save_dir / "results.yaml"
+    with open(results_path, 'w') as f:
+        yaml.dump(results, f, default_flow_style=False)
+
+    print(f"Results saved to {results_path}")
+
+    base_env.close()
+
+    return results
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/tasks/demo_genvideo/.asset_hash b/tasks/demo_genvideo/.asset_hash
new file mode 100755
index 0000000..9877ddb
--- /dev/null
+++ b/tasks/demo_genvideo/.asset_hash
@@ -0,0 +1 @@
+c2cfba1ab80d74e8f8efa33a816e1361
\ No newline at end of file
diff --git a/tasks/demo_genvideo/2_spoon_optimized.usd b/tasks/demo_genvideo/2_spoon_optimized.usd
new file mode 100644
index 0000000..f85bffa
Binary files /dev/null and b/tasks/demo_genvideo/2_spoon_optimized.usd differ
diff --git a/tasks/demo_genvideo/background.glb b/tasks/demo_genvideo/background.glb
new file mode 100755
index 0000000..6d7da31
Binary files /dev/null and b/tasks/demo_genvideo/background.glb differ
diff --git
a/tasks/demo_genvideo/background.usd b/tasks/demo_genvideo/background.usd new file mode 100644 index 0000000..a2a8508 Binary files /dev/null and b/tasks/demo_genvideo/background.usd differ diff --git a/tasks/demo_genvideo/bg_rgb.jpg b/tasks/demo_genvideo/bg_rgb.jpg new file mode 100755 index 0000000..92b59ff Binary files /dev/null and b/tasks/demo_genvideo/bg_rgb.jpg differ diff --git a/tasks/demo_genvideo/config.yaml b/tasks/demo_genvideo/config.yaml new file mode 100755 index 0000000..a4dd3d8 --- /dev/null +++ b/tasks/demo_genvideo/config.yaml @@ -0,0 +1,47 @@ +asset_path: /app/outputs/demo_genvideo/simulation/2_spoon_optimized.glb +usd_dir: /app/outputs/demo_genvideo/simulation +usd_file_name: 2_spoon_optimized.usd +force_usd_conversion: true +make_instanceable: false +mass_props: + mass: 1.0 + density: null +rigid_props: + rigid_body_enabled: null + kinematic_enabled: false + disable_gravity: false + linear_damping: null + angular_damping: null + max_linear_velocity: null + max_angular_velocity: null + max_depenetration_velocity: null + max_contact_impulse: null + enable_gyroscopic_forces: null + retain_accelerations: null + solver_position_iteration_count: null + solver_velocity_iteration_count: null + sleep_threshold: null + stabilization_threshold: null +collision_props: + collision_enabled: true + contact_offset: 0.006 + rest_offset: -0.002 + torsional_patch_radius: 0.05 + min_torsional_patch_radius: 0.005 +collision_approximation: convexDecomposition +translation: !!python/tuple +- 0.0 +- 0.0 +- 0.0 +rotation: !!python/tuple +- 1.0 +- 0.0 +- 0.0 +- 0.0 +scale: !!python/tuple +- 1.0 +- 1.0 +- 1.0 +## +# Generated by MeshConverter on 2025-12-02 at 19:43:01. +## diff --git a/tasks/demo_genvideo/object_2.glb b/tasks/demo_genvideo/object_2.glb new file mode 100755 index 0000000..7915535 Binary files /dev/null and b/tasks/demo_genvideo/object_2.glb differ diff --git a/tasks/demo_genvideo/task.json b/tasks/demo_genvideo/task.json new file mode 100644 index 0000000..0de5129 --- /dev/null +++ b/tasks/demo_genvideo/task.json @@ -0,0 +1 @@ +{"task_key": "demo_genvideo", "task_id": 2, "task_desc": ["Pick up the spoon and place it on the ground.", "Pick up the spoon."], "task_type": "SIMPLE_PICK", "background_cfg": {"background_rgb_path": "/app/tasks/demo_genvideo/bg_rgb.jpg", "background_mesh_path": "/app/tasks/demo_genvideo/background.glb", "background_usd_path": "/app/tasks/demo_genvideo/background.usd", "background_point": [-0.0050928425043821335, 0.000900214712601155, 0.5791710615158081]}, "camera_info": {"width": 848.0, "height": 480.0, "fx": 611.22998046875, "fy": 610.7579956054688, "cx": 429.53900146484375, "cy": 241.20399475097656, "camera_opencv_to_world": [[-0.22265699270439665, -0.9743771788702543, 0.03182729797954699, -0.023066046624221542], [-0.9743771788702543, 0.22348549726677502, 0.025364262423084753, -0.013668135446919388], [-0.03182729797954699, -0.025364262423084753, -0.9991714954376221, 0.6130032684625838], [0.0, 0.0, 0.0, 1.0]], "camera_position": [-0.023066046624221542, -0.013668135446919388, 0.6130032684625838], "camera_heading_wxyz": [0.020353188477213263, -0.6231029219692462, 0.7818749876752668, -0.0]}, "manipulated_oid": "2", "start_related": [], "end_related": [], "objects": [{"object_id": "2", "object_name": "spoon", "mesh_path": "/app/tasks/demo_genvideo/object_2.glb", "usd_path": "/app/tasks/demo_genvideo/2_spoon_optimized.usd"}], "reference_trajectory": null, "generated_trajectories": null} \ No newline at end of file diff --git a/tasks/demo_video/.asset_hash 
b/tasks/demo_video/.asset_hash new file mode 100755 index 0000000..b33f83a --- /dev/null +++ b/tasks/demo_video/.asset_hash @@ -0,0 +1 @@ +9b78ef35230774e636e844af3c970222 \ No newline at end of file diff --git a/tasks/demo_video/1_pen_optimized.usd b/tasks/demo_video/1_pen_optimized.usd new file mode 100644 index 0000000..ab0bee2 Binary files /dev/null and b/tasks/demo_video/1_pen_optimized.usd differ diff --git a/tasks/demo_video/background.glb b/tasks/demo_video/background.glb new file mode 100755 index 0000000..a78bd22 Binary files /dev/null and b/tasks/demo_video/background.glb differ diff --git a/tasks/demo_video/background.usd b/tasks/demo_video/background.usd new file mode 100644 index 0000000..88da644 Binary files /dev/null and b/tasks/demo_video/background.usd differ diff --git a/tasks/demo_video/bg_rgb.jpg b/tasks/demo_video/bg_rgb.jpg new file mode 100755 index 0000000..d686762 Binary files /dev/null and b/tasks/demo_video/bg_rgb.jpg differ diff --git a/tasks/demo_video/config.yaml b/tasks/demo_video/config.yaml new file mode 100755 index 0000000..fbc6e2a --- /dev/null +++ b/tasks/demo_video/config.yaml @@ -0,0 +1,47 @@ +asset_path: /app/outputs/demo_video/simulation/1_pen_optimized.glb +usd_dir: /app/outputs/demo_video/simulation +usd_file_name: 1_pen_optimized.usd +force_usd_conversion: true +make_instanceable: false +mass_props: + mass: 1.0 + density: null +rigid_props: + rigid_body_enabled: null + kinematic_enabled: false + disable_gravity: false + linear_damping: null + angular_damping: null + max_linear_velocity: null + max_angular_velocity: null + max_depenetration_velocity: null + max_contact_impulse: null + enable_gyroscopic_forces: null + retain_accelerations: null + solver_position_iteration_count: null + solver_velocity_iteration_count: null + sleep_threshold: null + stabilization_threshold: null +collision_props: + collision_enabled: true + contact_offset: 0.006 + rest_offset: -0.002 + torsional_patch_radius: 0.05 + min_torsional_patch_radius: 0.005 +collision_approximation: convexDecomposition +translation: !!python/tuple +- 0.0 +- 0.0 +- 0.0 +rotation: !!python/tuple +- 1.0 +- 0.0 +- 0.0 +- 0.0 +scale: !!python/tuple +- 1.0 +- 1.0 +- 1.0 +## +# Generated by MeshConverter on 2025-12-02 at 19:51:15. 
+## diff --git a/tasks/demo_video/object_1.glb b/tasks/demo_video/object_1.glb new file mode 100755 index 0000000..5265fca Binary files /dev/null and b/tasks/demo_video/object_1.glb differ diff --git a/tasks/demo_video/task.json b/tasks/demo_video/task.json new file mode 100644 index 0000000..28b7b0d --- /dev/null +++ b/tasks/demo_video/task.json @@ -0,0 +1 @@ +{"task_key": "demo_video", "task_id": 2, "task_desc": ["Pick up the pen and place it on the ground."], "task_type": "SIMPLE_PICK_PLACE", "background_cfg": {"background_rgb_path": "/app/tasks/demo_video/bg_rgb.jpg", "background_mesh_path": "/app/tasks/demo_video/background.glb", "background_usd_path": "/app/tasks/demo_video/background.usd", "background_point": [0.007876700721681118, 0.007433394435793161, 0.6783376932144165]}, "camera_info": {"width": 584.0, "height": 328.0, "fx": 664.4203491210938, "fy": 663.5459594726562, "cx": 295.5, "cy": 166.0, "camera_opencv_to_world": [[0.9998286366462708, 0.016611162573099136, -0.008169667795300484, -0.0014619952999055386], [0.016611162573099136, -0.6103171706199646, 0.7919829487800598, -0.6476890444755554], [0.008169667795300484, -0.7919829487800598, -0.6104885339736938, 0.43338295817375183], [0.0, 0.0, 0.0, 1.0]], "camera_position": [-0.0014619952999055386, -0.6476890444755554, 0.43338295817375183], "camera_heading_wxyz": [0.44131138920783997, -0.8973063230514526, -0.009256127290427685, 0.0]}, "manipulated_oid": "1", "start_related": [], "end_related": [], "objects": [{"object_id": "1", "object_name": "pen", "mesh_path": "/app/tasks/demo_video/object_1.glb", "usd_path": "/app/tasks/demo_video/1_pen_optimized.usd"}], "reference_trajectory": [{"robot_pose": [0.1803032010793686, -0.4136963486671448, 0.013969618827104568, 0.5735764503479004, 1.8387499522987127e-08, 3.810622928313023e-09, 0.8191520571708679], "object_poses": {"1": [0, 0, 0, 1, 0, 0, 0]}, "object_trajectory": [[7.806255641895632e-18, 3.469446951953614e-18, 5.551115123125783e-17, 1.0, 1.965016845084225e-17, -2.220446049250313e-16, 6.938893903907228e-18], [0.007051527034491301, 0.009465012699365616, -0.015993360430002213, 0.9831469726788524, 0.03344727934049603, -0.17949933307103916, -0.009126831004202321], [-0.016475899145007133, -0.013392122462391853, -0.014356533996760845, 0.9785306413302751, 0.054516202336950086, -0.17663286251433663, 0.09114054827851654], [-0.006909223273396492, -0.033702608197927475, -0.0013878592289984226, 0.9878290572028813, -0.015675841803810764, -0.017441978730307284, 0.15376540282969592], [0.019986754283308983, -0.009689857251942158, 0.004198385402560234, 0.9852802103713244, -0.03389668556303966, 0.0421641882630126, 0.16216073195012298], [0.04694914445281029, 0.026577500626444817, 0.005078887101262808, 0.9883821533624347, -0.03595245382574507, 0.05074320664092624, 0.13868405444873566], [0.07197888195514679, 0.052728310227394104, 0.008139587938785553, 0.9881644397647448, -0.030301288582472347, 0.05430640006431242, 0.14022726841354305], [0.098885178565979, 0.07067321240901947, 0.0029824115335941315, 0.9859156180969406, -0.0010619361090744425, -0.0043220227673461675, 0.16718428874576258], [0.09669288992881775, 0.06645581126213074, 0.011395134963095188, 0.9711629237598531, -0.08604067813905195, 0.15409129520852516, 0.16029800360748164], [0.08217167854309082, 0.05488347262144089, 0.022346483543515205, 0.9172260445041982, -0.1878138209459796, 0.29977513346374973, 0.18318630216707021], [0.0777614489197731, 0.04396161809563637, 0.035967424511909485, 0.8361528265458607, -0.28323055638970634, 0.4175152070187925, 
0.2151974779001835], [0.08104538172483444, 0.04987487196922302, 0.04134378954768181, 0.7624544780967426, -0.35034058664100937, 0.5086763841872572, 0.19280295213620216]], "final_gripper_close": false, "success_metric": {"success_metric_type": "TARGET_PLANE", "final_gripper_close": false, "lift_height": null, "ground_value": 0.01775433123111725, "end_pose": [0.11726001650094986, 0.06917917728424072, 0.01775433123111725, 0.9194124937057495, -0.32787105441093445, 0.1775575876235962, 0.1251174360513687]}, "pregrasp_pose": [0.07971740514039993, -0.18867208063602448, 0.1535249501466751, 0.01927226595580578, 0.1295030415058136, 0.9805928468704224, 0.14592890441417694], "grasp_pose": [0.08863737434148788, -0.1554902344942093, 0.040638282895088196, 0.01927226595580578, 0.1295030415058136, 0.9805928468704224, 0.14592890441417694], "robot_type": "FRANKA", "init_manip_object_com": [0.05123988166451454, -0.12842372059822083, 0.11833348870277405]}], "generated_trajectories": []} \ No newline at end of file diff --git a/third_party/MinkowskiEngine b/third_party/MinkowskiEngine new file mode 160000 index 0000000..02fc608 --- /dev/null +++ b/third_party/MinkowskiEngine @@ -0,0 +1 @@ +Subproject commit 02fc608bea4c0549b0a7b00ca1bf15dee4a0b228 diff --git a/third_party/PromptDA b/third_party/PromptDA new file mode 160000 index 0000000..35d56b4 --- /dev/null +++ b/third_party/PromptDA @@ -0,0 +1 @@ +Subproject commit 35d56b41bfbf7fb00a6d8f180ee96ca3a2e5cabc diff --git a/third_party/WiLoR b/third_party/WiLoR new file mode 160000 index 0000000..1936830 --- /dev/null +++ b/third_party/WiLoR @@ -0,0 +1 @@ +Subproject commit 1936830a63d32629665a03ec038310f3a9b09772 diff --git a/third_party/graspness_unofficial b/third_party/graspness_unofficial new file mode 160000 index 0000000..b2956d9 --- /dev/null +++ b/third_party/graspness_unofficial @@ -0,0 +1 @@ +Subproject commit b2956d90905c514e79ebeaabd98893275400d479 diff --git a/third_party/graspnetAPI b/third_party/graspnetAPI new file mode 160000 index 0000000..55a4e32 --- /dev/null +++ b/third_party/graspnetAPI @@ -0,0 +1 @@ +Subproject commit 55a4e32d5f7237267463bef0dde962989e0b190d