diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b97641a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,195 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+/datasets
+/dataset_cache
+
+# Outputs
+/outputs
+/lightning_logs
+/checkpoints
+
+.bashrc
+/launcher_venv
+/slurm_logs
+*.torch
+*.ckpt
+table.tex
+/baselines
+/test/*
+
+wandb/
+output*
+results*
+
+*.ply
+*.mp4
+!assets/pipeline.jpg
+!examples/video/*.mp4
+
+src/loss/depth_anything/*
+
+.vscode/
+.gradio/
+note.txt
+anysplat_ckpt*
+input_images_*
+tmp_scripts/
diff --git a/.gitmodules b/.gitmodules
index 9a9af42..ed07a66 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,9 @@
-[submodule "sam-3d-objects"]
-	path = sam-3d-objects
-	url = https://github.com/facebookresearch/sam-3d-objects.git
-[submodule "sam3"]
-	path = sam3
-	url = https://github.com/facebookresearch/sam3.git
+[submodule "Sam-3d-objects"]
+	path = submodule/Sam-3d-objects
+	url = https://github.com/Yuchi-Zhang-00/sam-3d-objects.git
+[submodule "AnySplat"]
+	path = submodule/AnySplat
+	url = https://github.com/Yuchi-Zhang-00/AnySplat.git
+[submodule "Prompt-Inpaint"]
+	path = submodule/Prompt-Inpaint
+	url = https://github.com/MrZoyo/Prompt-Inpaint.git
diff --git a/README.md b/README.md
index 424ff4a..bb765b4 100644
--- a/README.md
+++ b/README.md
@@ -4,257 +4,478 @@
   </a>
 </p>
 
-# **Unified Multi-Stage 2D→3D Perception Pipeline**
+# **Unified 2D Single-Image → 3D Object Generation Pipeline**
 
-## *vLLM × SAM3 × SAM-3D-Objects Integration*
+## *Prompt-Inpaint × AnySplat × SAM-3D-Objects Integration*
+
+> This repo was originally forked from [xyys2003/sam3d_gs](https://github.com/xyys2003/sam3d_gs).
 
 ------
 
 ## **Abstract**
 
-This repository presents a unified and modular pipeline that couples large-scale vision–language reasoning, high-fidelity 2D segmentation, and multi-object 3D Gaussian splatting. It integrates three independent systems—**vLLM** (for Qwen3-VL inference), **SAM3** (for multi-object 2D segmentation), and **SAM-3D-Objects** (for 3D reconstruction from RGB + masks)—into a complete, end-to-end workflow. To ensure reproducibility, each module runs inside its own Conda environment. The pipeline supports both staged execution and a fully automated one-click execution, with built-in HuggingFace authentication, checkpoint management, and environment initialization.
+This repository packages a single-image 2D → 3D object reconstruction pipeline by composing three open-source systems behind one entry script:
 
-------
+- **Prompt-Inpaint** — text-prompted multi-object segmentation (built on SAM3) plus background inpainting, producing per-object masks and a clean background image.
+- **AnySplat** — feed-forward 3D Gaussian Splatting from a single image, plus a RANSAC-based table-alignment pass that brings the scene into a MuJoCo-friendly world frame.
+- **SAM-3D-Objects** — per-object mesh and Gaussian reconstruction from RGB + mask.
 
-# **1. Repository Setup**
+The three components are wired together through scripts under `pipeline/` and a single uv-managed virtual environment, so the whole pipeline runs from one shell command.
 
-```
-git clone --recursive https://github.com/xyys2003/sam3d_gs.git
-cd sam3d_gs
-```
+------
 
-If cloned without submodules:
+# **1. Repository Layout**
 
 ```
-git submodule update --init --recursive
+.
+├── run_object_generation_pipeline.sh      # one-shot entry: image → 3D assets
+├── pipeline/
+│   ├── background_reconstruction.py       # AnySplat + table RANSAC alignment
+│   ├── objects_generation.py              # SAM-3D-Objects multi-object reconstruction
+│   ├── mesh2mjcf.py                       # optional: convert per-object .obj → MuJoCo MJCF
+│   └── utils.py                           # shared rendering / IO helpers
+└── submodule/
+    ├── Prompt-Inpaint/                    # SAM3 segmentation + inpainting
+    ├── AnySplat/                          # single-image 3DGS reconstruction
+    └── Sam-3d-objects/                    # per-object mesh / GS reconstruction
 ```
 
 ------
 
-# **2. Conda Environments**
+# **2. Setup**
 
-| Environment     | Purpose                                  | Path              |
-| --------------- | ---------------------------------------- | ----------------- |
-| `vllm`          | Serve Qwen3-VL-8B-Thinking via vLLM      | —                 |
-| `sam3`          | Multi-object segmentation (SAM3)         | `sam3/`           |
-| `sam3d-objects` | RGB + masks → 3D Gaussian reconstruction | `sam-3d-objects/` |
+The project runs inside a single `uv`-managed virtual environment (`.venv/`). The setup below targets RTX 50-series GPUs (CUDA 12.8, PyTorch 2.7) and is also verified to work on RTX 3090 / 4090.
 
-------
+> **Hardware**: an NVIDIA GPU with **≥ 24 GB VRAM** is recommended. The pipeline loads SAM3, AnySplat, and SAM-3D-Objects sequentially, and the SAM-3D-Objects stage in particular is memory-hungry.
 
-# **3. vLLM Environment (Qwen3-VL Server)**
+## **2.1 Clone with submodules**
 
-```
-conda create -n vllm python=3.10 -y
-conda activate vllm
+```bash
+git clone --recursive https://github.com/Yuchi-Zhang-00/sam3d_gs.git
+cd sam3d_gs
 ```
 
-Install PyTorch (CUDA 12.x):
+If the submodules were not initialized at clone time:
 
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
-    --index-url https://download.pytorch.org/whl/cu124
+```bash
+git submodule update --init --recursive
 ```
 
-Install vLLM:
+## **2.2 Install the Python environment**
 
-```
-pip install vllm --extra-index-url https://download.pytorch.org/whl/cu124
-pip install transformers tiktoken sentencepiece xformers flashinfer-python
-pip install huggingface_hub
+The recommended path is the bundled one-command installer:
+
+```bash
+bash scripts/install_env.sh
 ```
 
-------
+It creates `.venv`, installs PyTorch for CUDA 12.8, the submodule dependencies, and the project-level runtime dependencies.
 
-# **4. SAM3 Environment**
+If you would rather run each step yourself, see [`install.md`](install.md). It also documents the small SAM-3D-Objects requirements-file patches and the AnySplat `kernels.cu` fix used to build the CUDA RoPE2D kernel.
 
-Reference implementation:
- 🔗 https://github.com/facebookresearch/sam3
- 🔗 https://huggingface.co/facebook/sam3
+## **2.3 HuggingFace access**
 
-```
-cd sam3
-conda create -n sam3 python=3.10 -y
-conda activate sam3
-```
+The pipeline pulls three models from HuggingFace:
 
-Install SAM3:
+| Model | Used by | Access |
+| --- | --- | --- |
+| [`facebook/sam3`](https://huggingface.co/facebook/sam3) | Prompt-Inpaint (Stage 1) | **Gated** — request access on the model page |
+| [`facebook/sam-3d-objects`](https://huggingface.co/facebook/sam-3d-objects) | SAM-3D-Objects (Stage 3) | **Gated** — request access on the model page |
+| [`lhjiang/anysplat`](https://huggingface.co/lhjiang/anysplat) | AnySplat (Stage 2) | Public (MIT) |
 
-```
-git clone https://github.com/facebookresearch/sam3.git
-cd sam3
-pip install -e .
+After accepting the agreements on the two gated pages, log in once:
+
+```bash
+hf auth login
 ```
 
-Optional:
+The two gated models need explicit local placement and are fetched by a
+single bootstrap script (run once, after `hf auth login`):
 
+```bash
+bash scripts/download_checkpoints.sh
 ```
-pip install -e ".[notebooks]"
-pip install -e ".[train,dev]"
-```
+
+| Model | Target |
+| --- | --- |
+| `facebook/sam-3d-objects` | `submodule/Sam-3d-objects/checkpoints/hf/` (Hydra config tree, not fetched by `from_pretrained`) |
+| `facebook/sam3` | `submodule/Prompt-Inpaint/checkpoints/sam3.pt` (~3.3 GB; placed locally so it isn't lost when `~/.cache` is cleaned) |
+
+The script is idempotent and is also invoked automatically by
+`run_object_generation_pipeline.sh` on first run. Use `--skip-sam3d`,
+`--skip-sam3`, or `--force` to control individual stages.
+
+`lhjiang/anysplat` is also fetched by the same bootstrap script (into the
+standard HuggingFace hub cache at `~/.cache/huggingface/hub/`). It is public
+(MIT), so no `hf auth login` is required for this one — pre-fetching just
+keeps the first Stage-2 run from doing a multi-GB download. Pass
+`--skip-anysplat` if you'd rather have AnySplat pull it lazily on first run.
 
 ------
 
-# **5. SAM-3D-Objects Environment**
+## **2.4 Docker image (alternative to 2.2)**
 
-Reference implementation:
- 🔗 https://github.com/facebookresearch/sam3d
- 🔗 https://huggingface.co/facebook/sam-3d-objects
+A pre-built image with the full environment (CUDA 12.8 base, the
+uv-managed `.venv`, the compiled AnySplat curope CUDA extension, and all
+PyPI deps) is published to Aliyun Container Registry:
 
 ```
-conda create -n sam_3d_body python=3.10 -y
-conda activate sam_3d_body
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:latest
 ```
 
-Install dependencies (excerpt):
+Using the image skips §2.2 entirely; you still need a clone of this repo on
+the host (the launcher and the host-side checkpoint directories) and HF
+access for the two gated models (§2.3).
 
-```
-pip install pytorch-lightning pyrender opencv-python yacs scikit-image einops timm dill pandas hydra-core ...
-```
+### **Prerequisites**
 
-Install Detectron2:
+- Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+  installed; an NVIDIA GPU with ≥ 24 GB VRAM
+- A local clone of this repo (`git clone --recursive ...`, see §2.1) — used
+  both for the `run_docker.sh` launcher and as the bind-mount root for
+  checkpoints, data, and outputs
+- One-time HuggingFace setup (§2.3) and a host-side run of
+  `bash scripts/download_checkpoints.sh`. Checkpoints live on the host and
+  are bind-mounted into the container, so this only runs once.
 
-```
-pip install 'git+https://github.com/facebookresearch/detectron2.git@a1ce2f9' \
-    --no-build-isolation --no-deps
+### **Pull the image**
+
+```bash
+docker pull crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+docker tag  crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 sam3d-gs:latest
 ```
 
-Optional: MoGe
+The re-tag is optional. `run_docker.sh` defaults to `sam3d-gs:latest`; if
+you'd rather not re-tag, prefix the launch with
+`SAM3D_IMAGE=crpi-.../sam3d_gs:v0.1` instead.
 
-```
-pip install git+https://github.com/microsoft/MoGe.git
+### **Launch the container**
+
+```bash
+./run_docker.sh                                       # uses defaults
+./run_docker.sh /path/to/sam3d_gs                     # explicit project dir
+./run_docker.sh /path/to/sam3d_gs /mnt/hf_cache       # custom HF cache root
+SAM3D_IMAGE=sam3d-gs:v0.1 ./run_docker.sh             # pick a specific tag
+TORCH_HOME=/mnt/torch_cache ./run_docker.sh           # custom torch hub cache
 ```
 
-------
+The launcher bind-mounts the relevant host paths into the container:
 
-# **6. Required HuggingFace Access**
+| Host path | Container path | Purpose |
+| --- | --- | --- |
+| `<repo>/submodule/Sam-3d-objects/checkpoints` | same | SAM-3D-Objects weights (gated) |
+| `<repo>/submodule/Prompt-Inpaint/checkpoints` | same | SAM3 weight (gated) |
+| `${HF_HOME:-$HOME/.cache/huggingface}` | `/root/.cache/huggingface` | AnySplat + other HF downloads |
+| `${TORCH_HOME:-$HOME/.cache/torch}` | `/root/.cache/torch` | `torch.hub` cache (DINOv2 etc.) |
+| `<repo>/data` | `/opt/sam3d_gs/data` | scratch input/output dir |
+| `<repo>/example` | `/opt/sam3d_gs/example` | bundled demo input/output |
 
-The pipeline requires access to the following models:
+Pipeline outputs land in whichever scene directory you point the launcher
+at — since `data/` and `example/` are bind-mounted, those outputs persist
+on the host after the container exits.
 
-- **SAM3**
-   🔗 https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
-   🔗 https://huggingface.co/facebook/sam-3d-objects
+### **Run the pipeline inside the container**
 
-Log in after requesting access:
+You land in `/opt/sam3d_gs/`. The image's `PATH` and `PYTHONPATH` already
+point at the bundled `.venv`, so you can call `python` and run scripts
+directly — **no `source .venv/bin/activate` needed**.
 
-```
-hf auth login
+```bash
+# Bundled demo:
+bash run_object_generation_pipeline.sh example/example.png
+
+# Your own image:
+bash run_object_generation_pipeline.sh data/my_scene/input_image.png
 ```
 
-------
+Stage 1/2/3 each behave exactly as in §3–§4 below.
 
-# **7. Running the Pipeline**
+### **What's baked into the image**
 
-Ensure the Conda activation path is correct:
+- CUDA 12.8 devel base + Python 3.11 `.venv` with every PyPI dep
+- Compiled AnySplat `curope` CUDA extension (sm_80 / 90 / 100 / 120)
+- `coacd`, `trimesh`, `mujoco` (so `pipeline/mesh2mjcf.py` works out of the box)
+- `sitecustomize.py` patching `torch.hub` to use the local cache without
+  pinging github first (avoids `RemoteDisconnected` on flaky networks once
+  the model is in `~/.cache/torch/hub`)
+- A global `git insteadOf` rule routing `https://github.com/` through
+  `https://gh-proxy.com/https://github.com/`, so in-container `git clone`
+  works on networks where direct github access is unreliable
 
-```
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
-```
+### **What's NOT baked in**
+
+- The three model checkpoint sets (SAM3, SAM-3D-Objects, AnySplat). They
+  live on the host and are bind-mounted via the table above. Run
+  `scripts/download_checkpoints.sh` once on the host.
+- Your input data. Drop it into `<repo>/data/<scene_name>/` and reference
+  it as `data/<scene_name>/input_image.png` inside the container.
+
+### **Caveats**
+
+- **Output files end up owned by `root` on the host.** The container runs
+  as root, so anything the pipeline writes into a bind-mounted directory
+  (`data/`, `example/`, the checkpoint dirs, etc.) shows up on the host
+  with uid 0. Two ways to deal with it:
+
+  ```bash
+  # After the container exits, fix ownership on the host:
+  sudo chown -R $(id -u):$(id -g) data/ example/
+
+  # Or run the container as your host user from the start.
+  # This avoids the chown step but can break EGL / pyrender setup
+  # in some Sam-3d-objects code paths, so prefer the chown fix.
+  # (To try anyway: edit run_docker.sh and add `--user $(id -u):$(id -g)`
+  # to the `docker run` invocation.)
+  ```
+
+- **The `gh-proxy.com` redirect is for users behind the GFW.** The image
+  bakes a `git config --global url.<proxy>.insteadOf https://github.com/`
+  rule so in-container `git clone` of github URLs survives flaky direct
+  access from mainland China. **Outside mainland China this hop is
+  unnecessary and may slow things down.** Disable it once per container
+  start:
+
+  ```bash
+  git config --global --unset url."https://gh-proxy.com/https://github.com/".insteadOf
+  ```
+
+  (Or bake your own image variant with the rule removed if you'd rather
+  not run that every time.)
 
 ------
 
-## **Stage 1 — Qwen3-VL + SAM3 (2D Mask Generation)**
+# **3. Quick Start**
 
-```
-bash run_agent_with_vllm.sh
-```
+> If you're using the Docker image (§2.4), start the container first with
+> `./run_docker.sh` — every command in this section runs **inside** the
+> container exactly as written.
 
-Outputs:
+Try the bundled demo image (the entry script activates `.venv` internally, so you don't need to do it yourself):
 
+```bash
+bash run_object_generation_pipeline.sh example/example.png
 ```
-outputs/master_with_vllm/masks/
+
+By default, all outputs are written next to the input image (in this case, into `example/`). Pass an explicit output directory as the second argument if you want them elsewhere:
+
+```bash
+bash run_object_generation_pipeline.sh example/example.png path/to/scene_dir
 ```
 
+The script runs three stages in sequence inside the single `.venv`:
+
+1. `submodule/Prompt-Inpaint/main.py` — segmentation + inpainting
+2. `pipeline/background_reconstruction.py` — AnySplat reconstruction + table alignment
+3. `pipeline/objects_generation.py` — per-object mesh + Gaussian export
+
 ------
 
-## **Stage 2 — SAM-3D-Objects Reconstruction**
+# **4. Pipeline Stages**
 
-```
-bash run_sam3d_from_masks.sh
+## **Stage 1 — Prompt-Inpaint (SAM3 segmentation + inpainting)**
+
+```bash
+python submodule/Prompt-Inpaint/main.py \
+    --resize-output \
+    --save-individual-masks \
+    --config submodule/Prompt-Inpaint/configs/items.yml \
+    --image path/to/input_image.png \
+    --output-dir path/to/scene_dir
 ```
 
-Outputs:
+Outputs (under `scene_dir/`):
 
-```
-sam-3d-objects/outputs/torch_save_pt/
-sam-3d-objects/gaussians/multi/
+- `input_image.png` — resized copy of the input
+- `clean_background.png` — inpainted background with all foreground objects removed
+- `bg_mask.png` — table / desktop mask used for plane fitting
+- `masks/<object_name>.png` — per-object binary masks
+
+## **Stage 2 — AnySplat + table-aligned 3D Gaussians**
+
+```bash
+python pipeline/background_reconstruction.py path/to/scene_dir
 ```
 
-------
+Behaviour:
 
-## **Optional: One-Click Execution**
+- Loads `clean_background.png` (and the matching `input_image.png`) inside each scene folder under the input directory.
+- Runs AnySplat to recover camera intrinsics/extrinsics, depth, and a 3DGS reconstruction.
+- Fits a RANSAC plane to `bg_mask.png`, derives an OBB via inner PCA, and builds a world-to-table transform.
+- Re-emits the splat in a MuJoCo-friendly frame.
 
+Useful flags:
+
+- `--model-id lhjiang/anysplat` — override the AnySplat HuggingFace model id
+- `--align-table` / `--no-align-table` — toggle RANSAC table alignment + the `bg_aligned.ply` export (default: enabled). When disabled, only the raw `bg.ply` is written
+- `--x-offset`, `--z-offset` — optional placement offsets (m) applied after alignment. Default: 0, so the aligned cloud sits at the origin
+
+Outputs (under `scene_dir/`):
+
+- `extrinsic.npy`, `intrinsic.npy` — camera parameters (world-to-camera; pixel-unit intrinsics)
+- `depth.npy`, `depth_visual.png` — depth from the splat reconstruction
+- `depth_ori.npy`, `depth_ori_visual.png` — depth from the original (non-inpainted) image
+- `scale.npy` — scene-level scale factor
+- `3d_assets/bg.ply` — raw 3DGS scene from AnySplat
+- `3d_assets/bg_aligned.ply` — table-aligned 3DGS scene (only when `--align-table` is on, which is the default)
+
+## **Stage 3 — SAM-3D-Objects per-object reconstruction**
+
+```bash
+python pipeline/objects_generation.py --input-dir path/to/scene_dir
 ```
-bash run_pipeline.sh
-```
+
+Useful flags:
+
+- `--project-root submodule/Sam-3d-objects` — checkpoint root
+- `--tag hf` — checkpoint subdirectory (`submodule/Sam-3d-objects/checkpoints/<tag>/pipeline.yaml`)
+- `--seed 42`, `--save-pt`, `--save-intermediate`
+
+For each mask, the stage runs SAM-3D-Objects inference, recovers the object's local scale by matching projected area + mean depth against the AnySplat depth map, and exports the asset at the origin.
+
+Outputs (under `scene_dir/3d_assets/`):
+
+- `<object>.obj` — per-object mesh sized for MuJoCo
+- `<object>.ply` — per-object 3D Gaussians sized for MuJoCo
+- `<object>_keyframe.npy` — mean XYZ of the final mesh
+- (with `--save-intermediate`) debug renderings and the pose-applied versions
 
 ------
 
-# **8. Q&A**
+# **5. Optional Tools**
 
-## **Q1: Download error “Consistency check failed: file should be XXXX but has size YYYY”?**
+## **`pipeline/mesh2mjcf.py` — mesh → MuJoCo MJCF converter**
 
-Cause: corrupted model shards in the HuggingFace cache due to unstable network.
+A standalone CLI that turns a single `.obj` or `.stl` mesh into MuJoCo MJCF
+assets (a `<asset>_dependencies.xml` + `<asset>.xml` pair, plus a per-asset
+mesh / texture directory). It is **not** wired into
+`run_object_generation_pipeline.sh`; use it on demand once Stage 3 has
+produced `<scene>/3d_assets/<obj>.obj`.
 
-Fix:
+By default, the output root is the parent directory of the input mesh, so
+running it on `scene_dir/3d_assets/cup.obj` writes a self-contained per-asset
+folder right next to the input:
 
 ```
-rm -rf sam-3d-objects/checkpoints/hf
-rm -rf ~/.cache/huggingface/hub   # optional
-bash run_sam3d_from_masks.sh
+scene_dir/3d_assets/
+  cup.obj                      (original input, untouched)
+  cup/                         (per-asset output folder, named after the obj stem)
+    cup.obj                    (copy of the input)
+    cup.mtl                    (if multi-material)
+    <texture files>            (referenced by the MTL)
+    part_0.obj part_1.obj ...  (if -cd)
+    mjcf/
+      cup.xml
+      cup_dependencies.xml
 ```
 
-Force fresh download:
+Mesh paths inside the emitted XMLs are written as `<asset>/<file>`, so the
+consuming MuJoCo scene should set `meshdir` (and `texturedir`) to the output
+root. Pass `-o/--output <dir>` to redirect.
 
-```
-force_download=True
-```
+### Required libraries
+
+Fresh installs via `scripts/install_env.sh` already include all three optional
+packages (`coacd`, `trimesh`, `mujoco`), so the table below is only for
+reference if you skip the bundled installer or build the environment
+piecemeal:
 
-## **Note on Coordinate System (PLY Output Orientation)**
+| Feature | Library | Manual install |
+| --- | --- | --- |
+| Multi-material OBJ splitting (automatic when an MTL file is present) | `trimesh` | `uv pip install trimesh` |
+| Convex decomposition (`-cd`) | `coacd`, `trimesh` | `uv pip install coacd trimesh` |
+| Preview viewer (`--verbose`) | `mujoco` | `uv pip install mujoco` |
 
-The 3D Gaussian `.ply` files exported by **SAM-3D-Objects** are expressed in the **camera coordinate system**, where:
+### Usage
 
-- **+Z axis** points **forward** from the camera
-- **+X axis** points right
-- **+Y axis** points downward (typical computer vision convention)
+```bash
+# Basic conversion (default colour / mass / inertia)
+python pipeline/mesh2mjcf.py path/to/cup.obj
 
-This means the reconstructed objects are aligned using **camera-forward Z-axis** rather than a world coordinate frame.
+# Custom RGBA, mass, and diagonal inertia
+python pipeline/mesh2mjcf.py path/to/cup.obj \
+    --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005
 
-If you want to visualize or place the objects in a global **world coordinate system**, you must apply a **camera-to-world transformation**:
-$$
-\mathbf{X}_{world} = \mathbf{R}_{c2w}\ \mathbf{X}_{camera} \ + \ \mathbf{t}_{c2w}
-$$
-Where:
+# Free-floating body + convex decomposition for accurate collisions
+python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint -cd
 
-- $\mathbf{R}_{c2w}$ is the rotation matrix from camera to world
-- $\mathbf{t}_{c2w}$ is the translation vector
-- $\mathbf{X}_{camera}$ is the Gaussian center in camera coordinates
-- $\mathbf{X}_{world}$ is the desired world coordinate position
+# Preview in mujoco.viewer after conversion
+python pipeline/mesh2mjcf.py path/to/cup.obj --verbose
+
+# Batch over all per-object meshes in one scene
+for obj in scene_dir/3d_assets/*.obj; do
+    python pipeline/mesh2mjcf.py "$obj" -cd
+done
+```
 
-After applying this transformation, the `.ply` will correctly align with your global scene, robotics simulator, or NeRF / COLMAP world frame.
 ------
 
-# **Citation**
+# **6. FAQ**
+
+**Q: HuggingFace download fails with “Consistency check failed: file should be XXXX but has size YYYY”.**
 
-### SAM3
+Corrupt shards in the HuggingFace cache. Clear and retry:
 
+```bash
+rm -rf submodule/Sam-3d-objects/checkpoints/hf
+rm -rf ~/.cache/huggingface/hub   # optional, more aggressive
+bash run_object_generation_pipeline.sh path/to/input_image.png
 ```
-@article{kirillov2024sam3,
-  title={SAM 3: Segment Anything in Images and Videos},
-  author={Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
-  year={2024},
-  url={https://github.com/facebookresearch/sam3}
-}
+
+You can also force a fresh download by passing `force_download=True` to the `huggingface_hub` download call (e.g. `hf_hub_download` or `snapshot_download`).
+
+**Q: AnySplat reports “cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead”.**
+
+The CUDA extension was not built. Apply the `kernels.cu` patch documented in [`install.md`](install.md) and run `python setup.py build_ext --inplace`.
+
+**Q: `ImportError: cannot import name 'cached_download' from 'huggingface_hub'` during Stage 1 (Prompt-Inpaint / iopaint).**
+
+`huggingface_hub` ≥ 0.26 removed `cached_download`, but `diffusers` 0.27.x (which is what `iopaint` pulls in) still imports it. Downgrade `huggingface_hub` to 0.25.2:
+
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+    "huggingface_hub==0.25.2"
 ```
 
-### SAM-3D-Objects
+Fresh installs via `scripts/install_env.sh` already include this pin.
+
+**Q: `ImportError: cannot import name 'is_offline_mode' from 'huggingface_hub'` during Stage 1.**
 
+Same symptom from the other direction: `transformers` 5.x imports `is_offline_mode` from `huggingface_hub`, which doesn't exist in 0.25.2. Pin transformers to 4.48.3:
+
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+    "transformers==4.48.3"
 ```
+
+Fresh installs via `scripts/install_env.sh` already include this pin.
+
+------
+
+# **Citations**
+
+```bibtex
+@article{kirillov2024sam3,
+  title  = {SAM 3: Segment Anything in Images and Videos},
+  author = {Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
+  year   = {2024},
+  url    = {https://github.com/facebookresearch/sam3}
+}
+
 @article{wu2024sam3dobjects,
-  title={SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
-  author={Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
-  year={2024},
-  url={https://github.com/facebookresearch/sam3d}
+  title  = {SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
+  author = {Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
+  year   = {2024},
+  url    = {https://github.com/facebookresearch/sam-3d-objects}
+}
+
+@article{jiang2024anysplat,
+  title  = {AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views},
+  author = {Jiang, Lihan and others},
+  year   = {2024},
+  url    = {https://github.com/OpenRobotLab/AnySplat}
 }
 ```
 
@@ -264,11 +485,9 @@ After applying this transformation, the `.ply` will correctly align with your gl
 
 This project is built upon and integrates:
 
-- **SAM3**
-   GitHub: https://github.com/facebookresearch/sam3
-   HuggingFace: https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
-   GitHub: https://github.com/facebookresearch/sam3d
-   HuggingFace: https://huggingface.co/facebook/sam-3d-objects
+- **SAM3** — [GitHub](https://github.com/facebookresearch/sam3) · [HuggingFace](https://huggingface.co/facebook/sam3)
+- **SAM-3D-Objects** — [GitHub](https://github.com/facebookresearch/sam-3d-objects) · [HuggingFace](https://huggingface.co/facebook/sam-3d-objects)
+- **AnySplat** — [GitHub](https://github.com/OpenRobotLab/AnySplat) · [HuggingFace](https://huggingface.co/lhjiang/anysplat)
+- **Prompt-Inpaint** — [GitHub](https://github.com/MrZoyo/Prompt-Inpaint)
 
-We sincerely thank the authors for making their research and implementations publicly available.
\ No newline at end of file
+We thank the authors for making their research and implementations publicly available.
diff --git a/README_zh.md b/README_zh.md
index 5ab1418..0a1d1a6 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -4,327 +4,481 @@
   </a>
 </p>
 
-# **统一的多阶段 2D→3D 感知流水线**
+# **2D 单图 → 3D 物体生成流水线**
 
-## *vLLM × SAM3 × SAM-3D-Objects 集成*
+## *Prompt-Inpaint × AnySplat × SAM-3D-Objects 集成*
+
+> 本仓库最初 fork 自 [xyys2003/sam3d_gs](https://github.com/xyys2003/sam3d_gs)。
 
 ------
 
 ## **摘要**
 
-本仓库构建了一个完整的 2D → 3D 感知流水线，将 **大模型视觉理解、2D 多物体分割、3D Gaussian Splatting 重建** 三者进行统一整合。流水线由：
+本仓库将三个开源系统串联进单条流水线，使用一条命令即可完成单图 → 多物体 3D 资产的生成：
 
-- **vLLM**：提供 Qwen3-VL-8B-Thinking 视觉语言大模型推理
-- **SAM3**：执行高质量多物体 2D 分割
-- **SAM-3D-Objects**：将 RGB + mask 提升为 3D 高斯点（Gaussian Splat）
+- **Prompt-Inpaint**：基于 SAM3 的文本提示多物体分割 + 背景补全，产出每个物体的 mask 与补全后的干净背景（clean background）。
+- **AnySplat**：单图前馈式 3D Gaussian Splatting 重建；额外的 RANSAC 桌面对齐将场景对齐到坐标系原点。
+- **SAM-3D-Objects**：以 RGB + mask 为输入，重建单物体的 mesh 与 Gaussian。
 
-为确保可复现性，每个模块均独立运行在各自的 Conda 环境中。系统支持 **分阶段执行**（先 2D 分割、再 3D 重建），也支持 **一键式全流程运行**。
+三者通过 `pipeline/` 下的脚本以及一个由 `uv` 管理的单一虚拟环境串联起来，整条流水线由一个 shell 命令驱动。
 
 ------
 
-# **1. 仓库克隆**
+# **1. 仓库结构**
 
 ```
-git clone --recursive https://github.com/xyys2003/sam3d_gs.git
-cd sam3d_gs
+.
+├── run_object_generation_pipeline.sh   # 主入口：单图 → 3D 资产
+├── pipeline/
+│   ├── background_reconstruction.py       # AnySplat + 桌面 RANSAC 对齐
+│   ├── objects_generation.py              # SAM-3D-Objects 多物体重建
+│   ├── mesh2mjcf.py                       # 可选：把单物体 .obj 转成 MuJoCo MJCF
+│   └── utils.py                           # 渲染 / IO 公共工具
+└── submodule/
+    ├── Prompt-Inpaint/                    # SAM3 分割 + 背景补全
+    ├── AnySplat/                          # 单图 3DGS 重建
+    └── Sam-3d-objects/                    # 单物体 mesh / GS 重建
 ```
 
-如果你忘记使用 `--recursive` 克隆，可运行：
+------
+
+# **2. 环境安装**
+
+整个项目运行在单个由 `uv` 管理的虚拟环境 `.venv/` 中。下面的步骤面向 RTX 50 系 GPU（CUDA 12.8，PyTorch 2.7），同样在 3090 / 4090 上验证通过。
+
+> **硬件**：推荐使用 **显存 ≥ 24 GB** 的 NVIDIA GPU。流水线会依次加载 SAM3、AnySplat、SAM-3D-Objects，其中 SAM-3D-Objects 阶段对显存最敏感。
 
+## **2.1 克隆仓库（含子模块）**
+
+```bash
+git clone --recursive https://github.com/Yuchi-Zhang-00/sam3d_gs.git
+cd sam3d_gs
 ```
+
+如果克隆时忘了 `--recursive`：
+
+```bash
 git submodule update --init --recursive
 ```
 
-------
+## **2.2 安装 Python 环境**
 
-# **2. Conda 环境说明**
+推荐使用一键安装脚本：
 
-本项目使用三个互相隔离的 Conda 环境，以避免依赖冲突。
+```bash
+bash scripts/install_env.sh
+```
 
-| 环境名称        | 功能用途                           | 路径              |
-| --------------- | ---------------------------------- | ----------------- |
-| `vllm`          | 运行 Qwen3-VL-8B-Thinking 推理服务 | —                 |
-| `sam3`          | 运行 SAM3 完成 2D 多物体分割       | `sam3/`           |
-| `sam3d-objects` | 从 RGB + Mask 生成 3D Gaussian     | `sam-3d-objects/` |
+脚本会创建 `.venv`、安装 CUDA 12.8 版 PyTorch、子模块依赖以及项目级运行时依赖。
 
-------
+如果想手动一步步执行，请查阅 [`install.md`](install.md)。该文档同时记录了 SAM-3D-Objects 的几处 requirements 文件 patch 和编译 AnySplat CUDA RoPE2D 内核所需的 `kernels.cu` 修改。
 
-# **3. vLLM 环境（Qwen3-VL 服务器）**
+## **2.3 HuggingFace 权限申请**
 
-### **3.1 创建环境**
+流水线依赖以下三个 HuggingFace 模型：
 
-```
-conda create -n vllm python=3.10 -y
-conda activate vllm
-```
+| 模型 | 使用方 | 访问 |
+| --- | --- | --- |
+| [`facebook/sam3`](https://huggingface.co/facebook/sam3) | Prompt-Inpaint（Stage 1） | **gated**，需在模型页面申请权限 |
+| [`facebook/sam-3d-objects`](https://huggingface.co/facebook/sam-3d-objects) | SAM-3D-Objects（Stage 3） | **gated**，需在模型页面申请权限 |
+| [`lhjiang/anysplat`](https://huggingface.co/lhjiang/anysplat) | AnySplat（Stage 2） | 公开（MIT） |
 
-### **3.2 安装 PyTorch（CUDA 12.x）**
+在两个 gated 模型页面接受协议后，登录一次（若所装的 `huggingface_hub` 版本较旧、没有 `hf` 命令，请改用 `huggingface-cli login`）：
 
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
-    --index-url https://download.pytorch.org/whl/cu124
+```bash
+hf auth login
 ```
 
-### **3.3 安装 vLLM 与相关依赖**
+两个 gated 模型需要显式放置到本地，由一个 bootstrap 脚本一次性处理（登录后
+跑一次即可）：
 
-```
-pip install vllm --extra-index-url https://download.pytorch.org/whl/cu124
-pip install transformers tiktoken sentencepiece xformers flashinfer-python
-pip install huggingface_hub
+```bash
+bash scripts/download_checkpoints.sh
 ```
 
-此配置已验证可稳定运行 **Qwen3-VL-8B-Thinking**。
+| 模型 | 落地位置 |
+| --- | --- |
+| `facebook/sam-3d-objects` | `submodule/Sam-3d-objects/checkpoints/hf/`（Hydra 配置树，不会被 `from_pretrained` 拉取） |
+| `facebook/sam3` | `submodule/Prompt-Inpaint/checkpoints/sam3.pt`（约 3.3 GB；放到本地以免 `~/.cache` 清理后丢失） |
 
-------
+该脚本是幂等的，且 `run_object_generation_pipeline.sh` 在首次运行时也会
+自动调用它。可以通过 `--skip-sam3d`、`--skip-sam3` 或 `--force` 单独控制每
+一个 stage。
 
-# **4. SAM3 环境**
+`lhjiang/anysplat` 也由同一个 bootstrap 脚本拉取（落到标准的 HuggingFace
+hub 缓存 `~/.cache/huggingface/hub/` 下）。它是公开模型（MIT），**不需要
+`hf auth login`**；提前拉只是避免 Stage 2 首次运行时做几 GB 的下载。
+传 `--skip-anysplat` 可以跳过这一步、让 AnySplat 首次运行时再 lazy 下载。
+
+------
 
-官方实现：
- 🔗 https://github.com/facebookresearch/sam3
- 🔗 https://huggingface.co/facebook/sam3
+## **2.4 Docker 镜像（2.1–2.3 的替代方案）**
 
-### **4.1 创建环境**
+仓库提供了一份预构建镜像，包含完整环境（CUDA 12.8 基础镜像、uv 管理的
+`.venv`、编译好的 AnySplat curope CUDA 扩展、所有 PyPI 依赖），已发布到
+阿里云容器镜像服务：
 
 ```
-cd sam3
-conda create -n sam3 python=3.10 -y
-conda activate sam3
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:latest
 ```
 
-### **4.2 安装 PyTorch（CUDA 12.x）**
+用镜像可以完全跳过 §2.2；但宿主机仍然需要克隆本仓库（用于
+`run_docker.sh` 启动脚本和 checkpoint 的 bind-mount 目录），以及完成
+§2.3 的 HuggingFace 权限申请。
 
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
-    --index-url https://download.pytorch.org/whl/cu124
-```
+### **前置条件**
 
-### **4.3 克隆并安装 SAM3**
+- 已安装 Docker 和 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)；
+  显存 ≥ 24 GB 的 NVIDIA GPU
+- 宿主机上已经 clone 了本仓库（`git clone --recursive ...`，见 §2.1）——
+  用作 `run_docker.sh` 启动脚本所在位置,以及 checkpoint / 数据 / 输出的
+  bind-mount 根目录
+- 完成 §2.3 的一次性 HuggingFace 设置，并在宿主机执行过
+  `bash scripts/download_checkpoints.sh`。Checkpoint 留在宿主机、通过
+  bind-mount 进容器，所以只需要下载一次。
 
-```
-git clone https://github.com/facebookresearch/sam3.git
-cd sam3
-pip install -e .
+### **拉取镜像**
+
+```bash
+docker pull crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+docker tag  crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 sam3d-gs:latest
 ```
 
-### **4.4 可选依赖（用于 Notebook 或训练）**
+`docker tag` 这一步可选。`run_docker.sh` 默认使用 `sam3d-gs:latest`；
+如果不想重 tag，可以在启动时加前缀
+`SAM3D_IMAGE=crpi-.../sam3d_gs:v0.1`。
 
-```
-pip install -e ".[notebooks]"
-pip install -e ".[train,dev]"
+### **启动容器**
+
+```bash
+./run_docker.sh                                       # 全默认（推荐）
+./run_docker.sh /path/to/sam3d_gs                     # 显式传项目目录
+./run_docker.sh /path/to/sam3d_gs /mnt/hf_cache       # 自定义 HF 缓存根
+SAM3D_IMAGE=sam3d-gs:v0.1 ./run_docker.sh             # 指定镜像 tag
+TORCH_HOME=/mnt/torch_cache ./run_docker.sh           # 自定义 torch hub 缓存
 ```
 
-------
+启动脚本会把宿主机的关键路径 bind-mount 进容器：
 
-# **5. SAM-3D-Objects 环境**
+| 宿主机路径 | 容器路径 | 用途 |
+| --- | --- | --- |
+| `<repo>/submodule/Sam-3d-objects/checkpoints` | 同名 | SAM-3D-Objects 权重（gated） |
+| `<repo>/submodule/Prompt-Inpaint/checkpoints` | 同名 | SAM3 权重（gated） |
+| `${HF_HOME:-$HOME/.cache/huggingface}` | `/root/.cache/huggingface` | AnySplat + 其它 HF 下载 |
+| `${TORCH_HOME:-$HOME/.cache/torch}` | `/root/.cache/torch` | `torch.hub` 缓存（DINOv2 等） |
+| `<repo>/data` | `/opt/sam3d_gs/data` | 输入 / 输出工作目录 |
+| `<repo>/example` | `/opt/sam3d_gs/example` | 自带示例输入 / 输出 |
 
-官方实现：
- 🔗 https://github.com/facebookresearch/sam3d
- 🔗 https://huggingface.co/facebook/sam-3d-objects
+流水线的产物会写到你指定的 scene 目录里。因为 `data/` 和 `example/`
+都是 bind-mount，容器退出后这些产物会留在宿主机上。
 
-### **5.1 创建环境**
+### **在容器内运行流水线**
 
-```
-conda create -n sam_3d_body python=3.10 -y
-conda activate sam_3d_body
-```
+进入容器后你会落到 `/opt/sam3d_gs/`。镜像里 `PATH` 和 `PYTHONPATH`
+已经指向自带的 `.venv`，可以直接调用 `python` 和脚本，**不需要
+`source .venv/bin/activate`**。
 
-### **5.2 安装 PyTorch（CUDA 12.x）**
+```bash
+# 自带示例：
+bash run_object_generation_pipeline.sh example/example.png
 
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
-    --index-url https://download.pytorch.org/whl/cu124
+# 自己的图：
+bash run_object_generation_pipeline.sh data/my_scene/input_image.png
 ```
 
-### **5.3 安装其他 Python 依赖**
+Stage 1 / 2 / 3 的行为和下面 §3–§4 完全一致。
 
-```
-pip install pytorch-lightning pyrender opencv-python yacs scikit-image einops timm dill pandas rich \
-    hydra-core hydra-submitit-launcher hydra-colorlog pyrootutils webdataset chump networkx==3.2.1 \
-    roma joblib seaborn wandb appdirs appnope ffmpeg cython jsonlines pytest xtcocotools loguru \
-    optree fvcore black pycocotools tensorboard huggingface_hub
-```
+### **镜像里包含什么**
 
-### **5.4 安装 Detectron2（SAM3D 依赖）**
+- CUDA 12.8 devel 基础镜像 + Python 3.11 `.venv`,所有 PyPI 依赖
+- 已编译好的 AnySplat `curope` CUDA 扩展（sm_80 / 90 / 100 / 120）
+- `coacd`、`trimesh`、`mujoco`(`pipeline/mesh2mjcf.py` 开箱可用)
+- 一个 `sitecustomize.py`，monkey-patch `torch.hub`，使其在本地缓存
+  存在时跳过 github 的 branch ping —— 这样网络不稳时也不会再触发
+  `RemoteDisconnected`(前提是 `~/.cache/torch/hub` 已有相应模型)
+- 全局的 `git insteadOf` 规则，把 `https://github.com/` 重写到
+  `https://gh-proxy.com/https://github.com/`，让容器内的
+  `git clone` 在 github 不稳的网络上也能工作
 
-```
-pip install 'git+https://github.com/facebookresearch/detectron2.git@a1ce2f9' \
-    --no-build-isolation --no-deps
-```
+### **镜像里不包含什么**
 
-### **5.5 可选安装：MoGe**
-
-```
-pip install git+https://github.com/microsoft/MoGe.git
-```
+- 三套模型 checkpoint（SAM3 / SAM-3D-Objects / AnySplat）。它们留在
+  宿主机上、通过上面的 bind-mount 进容器。在宿主机执行一次
+  `scripts/download_checkpoints.sh` 即可。
+- 你自己的输入数据。放到 `<repo>/data/<scene_name>/` 下，容器里通过
+  `data/<scene_name>/input_image.png` 引用。
 
-------
+### **使用须知**
 
-# **6. HuggingFace 权限申请**
+- **流水线写出的文件在宿主机上属主是 `root`**。容器内是 root 用户跑的，
+  所以写进 bind-mount 目录(`data/`、`example/`、checkpoint 目录等)
+  的文件，在宿主机上看到的所有者是 uid 0。两种处理方式：
 
-本项目依赖两个需要授权的模型：
+  ```bash
+  # 容器退出后，在宿主机改回当前用户：
+  sudo chown -R $(id -u):$(id -g) data/ example/
 
-- **SAM3**
-   🔗 https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
-   🔗 https://huggingface.co/facebook/sam-3d-objects
+  # 或者从一开始就让容器用宿主机的 uid 跑。
+  # 优点是不用 chown,缺点是 Sam-3d-objects 里某些 EGL / pyrender
+  # 代码路径在非 root 下可能跑不通,所以一般建议用上面的 chown 方案。
+  # (想试的话: 编辑 run_docker.sh,给 docker run 加上
+  # `--user $(id -u):$(id -g)`)
+  ```
 
-请在 HuggingFace 对应页面申请权限，并登录：
+- **`gh-proxy.com` 这个重写是给国内用户准备的**。镜像里烤了一条
+  `git config --global url.<proxy>.insteadOf https://github.com/` 规则,
+  让容器里 `git clone` github 仓库在 GFW 网络下也能成功。**在境外网络
+  环境下这个跳转是多余的,可能反而拖慢速度**。每次进容器后执行一次即可
+  禁用:
 
-```
-hf auth login
-```
+  ```bash
+  git config --global --unset url."https://gh-proxy.com/https://github.com/".insteadOf
+  ```
 
-脚本会自动使用你的 Token。
+  (或者自己 commit 一个去掉这条规则的镜像变体,免得每次都跑。)
 
 ------
 
-# **7. 运行流程**
+# **3. 快速开始**
 
-运行脚本前，请设置你的 Conda 激活脚本路径：
+> 如果你用的是 Docker 镜像（§2.4），先跑 `./run_docker.sh` 进容器；
+> 本节后面所有命令都在**容器内**原样执行。
 
+先用仓库自带的示例图跑一遍即可（入口脚本会自动激活 `.venv`，无需手动激活环境）：
+
+```bash
+bash run_object_generation_pipeline.sh example/example.png
 ```
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
+
+默认所有产物会写到输入图像所在目录（此例中即 `example/`）。若想显式指定输出目录，可以传第二个参数：
+
+```bash
+bash run_object_generation_pipeline.sh example/example.png path/to/scene_dir
 ```
 
+脚本会在同一个 `.venv` 中按顺序执行三个 stage：
+
+1. `submodule/Prompt-Inpaint/main.py` — 分割 + 背景补全
+2. `pipeline/background_reconstruction.py` — AnySplat 重建 + 桌面对齐
+3. `pipeline/objects_generation.py` — 单物体 mesh / Gaussian 导出
+
 ------
 
-## **阶段 1：Qwen3-VL + SAM3 生成 2D Mask**
+# **4. 各 Stage 详解**
 
-执行：
+## **Stage 1 — Prompt-Inpaint（SAM3 分割 + 背景补全）**
 
-```
-bash run_agent_with_vllm.sh
+```bash
+python submodule/Prompt-Inpaint/main.py \
+    --resize-output \
+    --save-individual-masks \
+    --config submodule/Prompt-Inpaint/configs/items.yml \
+    --image path/to/input_image.png \
+    --output-dir path/to/scene_dir
 ```
 
-此脚本会：
+输出（位于 `scene_dir/`）：
 
-1. 激活 `vllm` 环境
-2. 启动 vLLM 服务，加载 Qwen3-VL
-3. 激活 `sam3` 环境
-4. 运行 `pipeline/run_sam3_agent_full.py`
-5. 生成多物体 mask
+- `input_image.png` — 输入图像的 resize 副本
+- `clean_background.png` — 去除所有前景物体后的补全背景
+- `bg_mask.png` — 用于平面拟合的桌面 mask
+- `masks/<物体名>.png` — 每个物体的二值 mask
 
-输出目录：
+## **Stage 2 — AnySplat + 桌面对齐 3DGS**
 
-```
-outputs/master_with_vllm/masks/
+```bash
+python pipeline/background_reconstruction.py path/to/scene_dir
 ```
 
-------
+行为：
 
-## **阶段 2：SAM-3D-Objects 重建 3D Gaussian**
+- 递归读取输入目录下每个场景文件夹中的 `clean_background.png` 和配套的 `input_image.png`。
+- 运行 AnySplat 恢复相机内外参、深度、3DGS 重建结果。
+- 对 `bg_mask.png` 做 RANSAC 平面拟合，结合内部 PCA 得到 OBB，构建 world → table 变换。
+- 输出 MuJoCo 坐标系下的对齐点云。
 
-执行：
+常用参数：
 
-```
-bash run_sam3d_from_masks.sh
-```
+- `--model-id lhjiang/anysplat` — 覆盖 AnySplat 的 HuggingFace 模型 id
+- `--align-table` / `--no-align-table` — 是否启用 RANSAC 桌面对齐并导出 `bg_aligned.ply`（默认启用）。关闭时只导出原始 `bg.ply`
+- `--x-offset`、`--z-offset` — 对齐后可选的放置偏移（米）。默认 0，对齐后的点云落在原点
 
-此脚本会：
+输出（位于 `scene_dir/`）：
 
-1. 激活 `sam3d-objects` 环境
-2. 确保 SAM-3D-Objects 的 checkpoint 下载完成
-3. 加载 RGB + masks
-4. 生成每个物体的 `.pt` 文件
-5. 重建并导出 3D Gaussian (`.ply`, `.gif`)
+- `extrinsic.npy`、`intrinsic.npy` — 相机参数（world-to-camera；像素单位内参）
+- `depth.npy`、`depth_visual.png` — 来自 splat 重建的深度
+- `depth_ori.npy`、`depth_ori_visual.png` — 来自原始（未补全）图像的深度
+- `scale.npy` — 场景级缩放因子
+- `3d_assets/bg.ply` — AnySplat 输出的原始 3DGS 场景
+- `3d_assets/bg_aligned.ply` — 桌面对齐后的 3DGS 场景（仅当 `--align-table` 启用时输出，默认启用）
 
-输出目录：
+## **Stage 3 — SAM-3D-Objects 单物体重建**
 
-```
-sam-3d-objects/outputs/torch_save_pt/
-sam-3d-objects/gaussians/multi/
+```bash
+python pipeline/objects_generation.py --input-dir path/to/scene_dir
 ```
 
-------
+常用参数：
 
-## **可选：一键式全流程执行**
+- `--project-root submodule/Sam-3d-objects` — checkpoint 根目录
+- `--tag hf` — checkpoint 子目录（`submodule/Sam-3d-objects/checkpoints/<tag>/pipeline.yaml`）
+- `--seed 42`、`--save-pt`、`--save-intermediate`
 
-```
-bash run_pipeline.sh
-```
+针对每一个 mask，该 stage 运行 SAM-3D-Objects 推理，通过对比投影面积与平均深度恢复物体局部尺寸，并把资产以原点姿态导出。
 
-该脚本会自动完成阶段 1 + 阶段 2。
+输出（位于 `scene_dir/3d_assets/`）：
+
+- `<物体名>.obj` — MuJoCo 单位的物体 mesh
+- `<物体名>.ply` — MuJoCo 单位的物体 3D Gaussian
+- `<物体名>_keyframe.npy` — 最终 mesh 的平均 XYZ
+- 当传入 `--save-intermediate` 时，额外导出调试用的渲染和带姿态的中间产物
 
 ------
 
-# **Q&A**
+# **5. 可选工具**
+
+## **`pipeline/mesh2mjcf.py` — mesh → MuJoCo MJCF 转换器**
 
-## **Q1：下载模型时报 “Consistency check failed”？**
+一个独立的命令行工具，把单个 `.obj` 或 `.stl` 文件转成 MuJoCo MJCF 资产
+（`<asset>_dependencies.xml` + `<asset>.xml` 两个 XML，以及一个 per-asset 的
+mesh / texture 目录）。它**没有**被串进
+`run_object_generation_pipeline.sh`；当 Stage 3 产出
+`<scene>/3d_assets/<obj>.obj` 之后按需调用即可。
 
-**原因：** 下载中断导致 HuggingFace 缓存中出现损坏的模型分片。
- **解决：删除损坏缓存并重新下载。**
+默认输出根目录是输入 mesh 的父目录，所以对
+`scene_dir/3d_assets/cup.obj` 运行后会在输入旁边生成一个 per-asset 目录：
 
 ```
-rm -rf sam-3d-objects/checkpoints/hf
-rm -rf ~/.cache/huggingface/hub   # 可选
-bash run_sam3d_from_masks.sh
+scene_dir/3d_assets/
+  cup.obj                      （原输入，不变）
+  cup/                         （以 obj 名命名的 per-asset 输出目录）
+    cup.obj                    （输入的拷贝）
+    cup.mtl                    （若多材质）
+    <纹理文件>                  （MTL 引用的贴图）
+    part_0.obj part_1.obj ...  （若 -cd）
+    mjcf/
+      cup.xml
+      cup_dependencies.xml
 ```
 
-若要强制重新下载，可使用：
+生成的 XML 中的 mesh 路径写作 `<asset>/<file>`，所以消费方的 MuJoCo
+scene 需要把 `meshdir`（和 `texturedir`）设为输出根目录。通过
+`-o/--output <dir>` 可以重定向。
 
-```
-force_download=True
-```
+### 所需依赖
+
+走 `scripts/install_env.sh` 装环境的话，`coacd`、`trimesh`、`mujoco` 三个包
+默认就装好了。下表只在你跳过一键脚本、想手动按需装时作为参考：
 
-## **关于坐标系说明（PLY 输出方向）**
+| 功能 | 依赖库 | 手动安装命令 |
+| --- | --- | --- |
+| 多材质 OBJ 自动拆分（当存在 MTL 文件时触发） | `trimesh` | `uv pip install trimesh` |
+| 凸分解（`-cd`） | `coacd`、`trimesh` | `uv pip install coacd trimesh` |
+| 预览查看器（`--verbose`） | `mujoco` | `uv pip install mujoco` |
 
-通过 **SAM-3D-Objects** 导出的 3D Gaussian `.ply` 文件默认处于 **相机坐标系** 下，其中：
+### 用法
 
-- **+Z 轴** 为相机前向
-- **+X 轴** 指向右侧
-- **+Y 轴** 指向下方（典型计算机视觉坐标系）
+```bash
+# 基本用法（使用默认颜色 / 质量 / 惯性）
+python pipeline/mesh2mjcf.py path/to/cup.obj
 
-因此，重建的对象是以 **相机前向 Z 轴** 对齐的，而不是世界坐标系。
+# 自定义 RGBA、质量、对角惯性
+python pipeline/mesh2mjcf.py path/to/cup.obj \
+    --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005
 
-如果需要将 `.ply` 放置到全局 **世界坐标系** 中（例如仿真器、机器人场景、NeRF / COLMAP world frame），必须执行一次 **相机 → 世界坐标系转换**：
-$$
-\mathbf{X}_{world} = \mathbf{R}_{c2w}\ \mathbf{X}_{camera} \ + \ \mathbf{t}_{c2w}
-$$
-其中：
+# 自由关节 + 凸分解，得到更精确的碰撞几何
+python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint -cd
 
-- $\mathbf{R}_{c2w}$：相机到世界的旋转矩阵
-- $\mathbf{t}_{c2w}$：相机到世界的平移向量
-- $\mathbf{X}_{camera}$：高斯中心的相机系坐标
-- $\mathbf{X}_{world}$：转换后的世界系坐标
+# 在 mujoco.viewer 中预览
+python pipeline/mesh2mjcf.py path/to/cup.obj --verbose
+
+# 一键批量转换某个场景下所有物体
+for obj in scene_dir/3d_assets/*.obj; do
+    python pipeline/mesh2mjcf.py "$obj" -cd
+done
+```
 
-完成转换后，你即可将 `.ply` 与全局场景或机器人环境正确对齐。
 ------
 
-# **引用（Citation）**
+# **6. 常见问题**
 
-### **SAM3**
+**Q：HuggingFace 下载报 "Consistency check failed: file should be XXXX but has size YYYY"。**
 
+HuggingFace 缓存中的 shard 损坏。清理后重试：
+
+```bash
+rm -rf submodule/Sam-3d-objects/checkpoints/hf
+rm -rf ~/.cache/huggingface/hub   # 可选，更激进
+bash run_object_generation_pipeline.sh path/to/input_image.png
 ```
-@article{kirillov2024sam3,
-  title={SAM 3: Segment Anything in Images and Videos},
-  author={Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
-  year={2024},
-  url={https://github.com/facebookresearch/sam3}
-}
+
+也可以在调用 HuggingFace API 时通过 `force_download=True` 强制重新下载。
+
+**Q：AnySplat 提示 "cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead"。**
+
+CUDA 扩展没编译。请按 [`install.md`](install.md) 里的说明修改 `kernels.cu`，再执行 `python setup.py build_ext --inplace`。
+
+**Q：Stage 1 (Prompt-Inpaint / iopaint) 报 `ImportError: cannot import name 'cached_download' from 'huggingface_hub'`。**
+
+`huggingface_hub` ≥ 0.26 把 `cached_download` 删掉了，但 `iopaint` 依赖的 `diffusers` 0.27.x 还在 import 它。把 `huggingface_hub` 锁到 0.25.2：
+
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+    "huggingface_hub==0.25.2"
 ```
 
-### **SAM-3D-Objects**
+新走 `scripts/install_env.sh` 的环境已经带上这个 pin。
+
+**Q：Stage 1 报 `ImportError: cannot import name 'is_offline_mode' from 'huggingface_hub'`。**
+
+同一问题的另一面：`transformers` 5.x 会 import `huggingface_hub.is_offline_mode`，而 `huggingface_hub` 0.25.2 没有这个符号。把 `transformers` 锁到 4.48.3：
 
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+    "transformers==4.48.3"
 ```
+
+新走 `scripts/install_env.sh` 的环境已经带上这个 pin。
+
+------
+
+# **引用**
+
+```bibtex
+@article{kirillov2024sam3,
+  title  = {SAM 3: Segment Anything in Images and Videos},
+  author = {Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
+  year   = {2024},
+  url    = {https://github.com/facebookresearch/sam3}
+}
+
 @article{wu2024sam3dobjects,
-  title={SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
-  author={Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
-  year={2024},
-  url={https://github.com/facebookresearch/sam3d}
+  title  = {SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
+  author = {Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
+  year   = {2024},
+  url    = {https://github.com/facebookresearch/sam-3d-objects}
+}
+
+@article{jiang2024anysplat,
+  title  = {AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views},
+  author = {Jiang, Lihan and others},
+  year   = {2024},
+  url    = {https://github.com/OpenRobotLab/AnySplat}
 }
 ```
 
 ------
 
-# **致谢（Acknowledgements）**
+# **致谢**
 
-本项目基于以下官方实现构建：
+本项目基于并整合了以下工作：
 
-- **SAM3**
-   GitHub: https://github.com/facebookresearch/sam3
-   HuggingFace: https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
-   GitHub: https://github.com/facebookresearch/sam3d
-   HuggingFace: https://huggingface.co/facebook/sam-3d-objects
+- **SAM3** — [GitHub](https://github.com/facebookresearch/sam3) · [HuggingFace](https://huggingface.co/facebook/sam3)
+- **SAM-3D-Objects** — [GitHub](https://github.com/facebookresearch/sam-3d-objects) · [HuggingFace](https://huggingface.co/facebook/sam-3d-objects)
+- **AnySplat** — [HuggingFace](https://huggingface.co/lhjiang/anysplat)
+- **Prompt-Inpaint** — [GitHub](https://github.com/MrZoyo/Prompt-Inpaint)
 
-感谢原作者开放其卓越的研究成果与代码，使本流水线得以实现。
\ No newline at end of file
+感谢原作者开放其研究成果与代码。
diff --git a/example/example.png b/example/example.png
new file mode 100644
index 0000000..9caff67
Binary files /dev/null and b/example/example.png differ
diff --git a/install.md b/install.md
new file mode 100644
index 0000000..fa01a9c
--- /dev/null
+++ b/install.md
@@ -0,0 +1,141 @@
+# Install on RTX 50-series GPUs (torch 2.7.0 + cu128, also works on 3090 / 4090)
+
+> **Don't want to build the environment locally?** A pre-built Docker
+> image is published; see [README §2.4 "Docker image"](README.md#24-docker-image-alternative-to-2123)
+> for the pull / launch flow. This document is only the native-install
+> reference.
+
+One-command installer:
+
+```
+bash scripts/install_env.sh
+```
+
+This document is the manual step-by-step installation reference. Use it if you want to inspect or run each installation step yourself.
+
+
+# Run the installation commands below
+
+```
+git submodule update --init --recursive
+
+uv venv --python 3.11
+
+source .venv/bin/activate
+
+export PYTHONPATH="$(pwd)/submodule/Sam-3d-objects/notebook:$(pwd)/submodule/Sam-3d-objects:${PYTHONPATH:-}"
+
+uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
+
+# uv pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
+
+uv pip install -r submodule/AnySplat/requirements.txt --no-build-isolation
+
+export PIP_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html"
+
+uv pip install hatch-requirements-txt editables wheel
+
+uv pip install -e './submodule/Sam-3d-objects[dev]'
+
+uv pip install -e './submodule/Sam-3d-objects[p3d]' --no-build-isolation
+
+uv pip install -e "./submodule/Sam-3d-objects[inference]"     --no-build-isolation     --find-links https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html
+
+# Note: do NOT pass -U here. With -U, uv would upgrade transitive deps such
+# as torch (via iopaint) and clobber the CUDA-pinned torch above.
+# Also note transformers is pinned to ==4.48.3 (not >=): transformers 5.x
+# imports `is_offline_mode` from huggingface_hub, which doesn't exist in
+# 0.25.2, and would crash iopaint even with hub pinned below.
+uv pip install --index-strategy unsafe-best-match \
+    "transformers==4.48.3" \
+    "iopaint>=1.2.0" \
+    "diffusers>=0.27.2" \
+    "numpy<2.0" \
+    "opencv-python>=4.8.0" \
+    "pyyaml>=6.0" \
+    "requests>=2.31.0" \
+    "tqdm>=4.66.0" \
+    "setuptools" \
+    "einops"
+
+# Pin huggingface_hub last, with --force-reinstall --no-deps so it can be
+# downgraded past other packages' transitive `>=0.26` constraints.
+# Reason: diffusers 0.27.2 (and the iopaint stack on top) still import
+# `cached_download` from huggingface_hub, which was removed in hub >=0.26.
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+    "huggingface_hub==0.25.2"
+
+uv pip install --index-strategy unsafe-best-match "git+https://github.com/facebookresearch/sam3.git"
+```
+
+## SAM3 model access
+
+`facebook/sam3` is a gated model on HuggingFace. Request access on the model page first, then log in:
+```
+huggingface-cli login
+```
+
+
+## Fix the AnySplat warning: `Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead`
+```
+cd submodule/AnySplat/src/model/encoder/backbone/croco/curope/
+```
+In `kernels.cu`, change:
+
+```
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
+```
+
+to:
+
+```
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {
+```
+
+Then run:
+```
+python setup.py build_ext --inplace
+```
+
+
+## Extra dependencies for `pipeline/mesh2mjcf.py`
+
+`scripts/install_env.sh` already installs `coacd`, `trimesh`, and `mujoco` by
+default, so `mesh2mjcf.py` works out of the box (including `-cd` and
+`--verbose`). The commands below are only useful if you build the environment
+piecemeal and want to add the individual packages on demand:
+
+```
+# Convex decomposition (-cd)
+uv pip install coacd trimesh
+
+# Preview viewer (--verbose)
+uv pip install mujoco
+```
+
+
+# Completed modifications compared to the original repository:
+
+submodule/Sam-3d-objects/pyproject.toml:
+```
+-PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu121"  
+
+change to
+
++PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu128"
+```
+requirements.inference.txt:
+```
+kaolin==0.17.0 change to kaolin==0.18.0
+```
+requirements.txt:
+```
+nvidia-pyindex==1.0.9 change to # nvidia-pyindex==1.0.9    (comment it out)
+
+torchaudio==2.5.1+cu121 change to torchaudio,
+xformers==0.0.28.post3 change to xformers    (remove the pinned torchaudio and xformers versions)
+```
+requirements.p3d.txt:
+```
+flash_attn==2.8.3 change to flash_attn==2.7.3
+```
\ No newline at end of file
diff --git a/pipeline/__init__.py b/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pipeline/background_reconstruction.py b/pipeline/background_reconstruction.py
new file mode 100644
index 0000000..16a5472
--- /dev/null
+++ b/pipeline/background_reconstruction.py
@@ -0,0 +1,376 @@
+"""Batch RANSAC-based table alignment + 3D Gaussian export on top of AnySplat.
+
+This is a cleaned-up rewrite of `submodule/AnySplat/inference_ransac_batch.py`.
+The script now lives outside the AnySplat submodule, so it explicitly inserts
+the AnySplat root onto `sys.path` to keep the original imports working.
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import cv2
+import imageio
+import numpy as np
+import torch
+
+# ===== Make AnySplat's `src.*` and `utils.py` importable when running from the
+# repository root (this file no longer lives inside submodule/AnySplat).
+_ANYSPLAT_ROOT = Path(__file__).resolve().parent.parent / "submodule" / "AnySplat"
+sys.path.insert(0, str(_ANYSPLAT_ROOT))
+sys.path.insert(0, str(_ANYSPLAT_ROOT.parent))  # mirrors original sys.path entry
+
+from src.misc.image_io import save_interpolated_video  # noqa: E402, F401
+from src.model.ply_export import export_ply  # noqa: E402
+from src.model.model.anysplat import AnySplat  # noqa: E402
+from src.utils.image import process_image  # noqa: E402
+from utils import (  # noqa: E402
+    align_points_to_table,
+    depth_to_points,
+    fit_plane_ransac_safe_2,
+    plane_coordinate_system,
+    render_depth_from_points,
+    shrink_mask_erode,
+)
+
+
+# ===== RANSAC / inner-rectangle hyperparameters =====
+RANSAC_NUM_ITERS = 600  # forwarded as num_iters to fit_plane_ransac_safe_2
+RANSAC_DIST_THRESH = 0.005  # forwarded as dist_thresh; tabletops are usually very flat
+RANSAC_SAMPLE_N = 40000  # forwarded as sample_N to fit_plane_ransac_safe_2
+INNER_PERCENTILE = (20, 80)  # per in-plane axis: keep the central 60% to avoid edges
+MIN_INNER_POINTS = 50  # fewer inner points than this raises RuntimeError
+
+# ===== Scene normalisation =====
+# Quantile taken over all absolute coordinates of the aligned cloud (x, y and z
+# pooled) as the reference extent, and the extent that reference is rescaled to.
+SCALE_QUANTILE = 0.95
+SCALE_TARGET_RANGE = 0.6
+
+# ===== Post-alignment scene placement =====
+# Offsets applied after table-alignment so the aligned cloud can be shifted
+# from the origin if the downstream consumer needs it (e.g. to place it on a
+# Mujoco table). Defaults are 0, meaning the aligned cloud sits at the origin.
+DEFAULT_X_OFFSET = 0.0  # default for --x-offset
+DEFAULT_Z_OFFSET = 0.0  # default for --z-offset
+
+# ===== Mask shrink before plane fitting =====
+BG_MASK_SHRINK_RATIO = 0.12  # forwarded as ratio to shrink_mask_erode
+
+# ===== Default model id =====
+DEFAULT_MODEL_ID = "lhjiang/anysplat"  # default for --model-id
+
+
+def compute_table_geometry_ransac(depth, mask, intrinsic, extrinsic):
+    """Fit a tabletop plane (RANSAC, then PCA on the inner inliers) and return a
+    dict of table extents, axes and the (R, t) pairs that map camera-/world-frame
+    points into the table-aligned frame."""
+    H, W = depth.shape  # requires a 2-D depth map; H and W are not used below
+
+    # ===== 1. Intrinsics =====
+    fx = intrinsic[0, 0]
+    fy = intrinsic[1, 1]
+    cx = intrinsic[0, 2]
+    cy = intrinsic[1, 2]
+
+    # ===== 2. Depth -> camera-frame points =====
+    points_cam = depth_to_points(depth, mask, fx, fy, cx, cy)
+    print("points_cam:", points_cam.shape)
+
+    # ===== 3. RANSAC plane =====
+    normal_cam, center_cam, inlier_idx = fit_plane_ransac_safe_2(
+        points_cam,
+        num_iters=RANSAC_NUM_ITERS,
+        dist_thresh=RANSAC_DIST_THRESH,
+        sample_N=RANSAC_SAMPLE_N,
+    )
+    print(f"RANSAC normal: {normal_cam}")
+
+    pts_plane = points_cam[inlier_idx]
+
+    # ===== 4. Plane coordinate system (project inliers onto the in-plane axes u, v) =====
+    u, v = plane_coordinate_system(normal_cam)
+    rel = pts_plane - center_cam
+    pts_2d = np.stack([rel @ u, rel @ v], axis=1)
+
+    # ===== 5. Inner rectangle (crop edges) =====
+    x, y = pts_2d[:, 0], pts_2d[:, 1]
+    x_min, x_max = np.percentile(x, list(INNER_PERCENTILE))
+    y_min, y_max = np.percentile(y, list(INNER_PERCENTILE))
+    inner = (x > x_min) & (x < x_max) & (y > y_min) & (y < y_max)
+    pts_inner = pts_2d[inner]
+    if pts_inner.shape[0] < MIN_INNER_POINTS:
+        raise RuntimeError("Too few inner RANSAC points")
+
+    # ===== 6. PCA on the inner points (first right-singular vector = long axis) =====
+    mean_2d = pts_inner.mean(axis=0)
+    centered = pts_inner - mean_2d
+    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
+    dir_long_2d = Vt[0]
+
+    # ===== 7. 2D -> 3D =====
+    dir_long_cam = dir_long_2d[0] * u + dir_long_2d[1] * v
+    dir_long_cam /= np.linalg.norm(dir_long_cam)
+    dir_short_cam = np.cross(normal_cam, dir_long_cam)
+    dir_short_cam /= np.linalg.norm(dir_short_cam)
+
+    # ===== 8. World consistency (avoid axis flip) =====
+    R_cw = extrinsic[:3, :3]  # NOTE(review): the caller passes world-to-camera despite the "cw" name — confirm
+    if (R_cw @ dir_long_cam)[0] < 0:
+        dir_long_cam = -dir_long_cam
+        dir_short_cam = -dir_short_cam  # negate both so long x short still equals the normal
+
+    # ===== 9. OBB extents (of the cropped inner points only, in the PCA frame) =====
+    proj = centered @ Vt[:2].T
+    min_xy, max_xy = proj.min(0), proj.max(0)
+    length = max_xy[0] - min_xy[0]
+    width = max_xy[1] - min_xy[1]
+
+    center_plane_cam = center_cam + mean_2d[0] * u + mean_2d[1] * v
+
+    # ===== 10. Build world->table alignment =====
+    R_table_cam = np.stack([dir_long_cam, dir_short_cam, normal_cam], axis=1)  # columns = table axes in the camera frame
+    R_align_cam = R_table_cam.T
+    t_align_cam = -R_align_cam @ center_plane_cam  # so that center_plane_cam maps to the table origin
+
+    R_align_world = R_align_cam @ R_cw  # composes the camera alignment with extrinsic's rotation ...
+    t_align_world = R_align_cam @ extrinsic[:3, 3] + t_align_cam  # ... and translation
+
+    print("RANSAC inlier ratio:", len(inlier_idx) / points_cam.shape[0])
+
+    return {
+        "length": float(length),
+        "width": float(width),
+        "normal": normal_cam,
+        "dir_long": dir_long_cam,
+        "dir_short": dir_short_cam,
+        "R_align_cam": R_align_cam,
+        "t_align_cam": t_align_cam,
+        "R_align_world": R_align_world,
+        "t_align_world": t_align_world,
+    }
+
+
+def _save_depth_npy_and_viz(depth, image_folder, base_name):
+    """Save a raw depth array and a normalized 8-bit visualisation."""
+    depth_path = Path(image_folder) / f"{base_name}.npy"
+    np.save(depth_path, depth)
+    viz = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8)
+    viz_path = Path(image_folder) / f"{base_name}_visual.png"
+    imageio.imwrite(viz_path, viz)
+
+
+def process_single_image(image_path, model, device, args):
+    """Run AnySplat on one clean-background image and its sibling `input_image.png`; save depth, camera and PLY outputs beside them."""
+    image_folder = os.path.dirname(image_path)
+    image_ori_path = os.path.join(image_folder, "input_image.png")
+
+    # Load both images and add the two leading singleton axes unpacked below as (b, v).
+    image = process_image(image_path)
+    image_ori = process_image(image_ori_path)
+    images_ori = torch.stack([image_ori], dim=0).unsqueeze(0).to(device)
+    images = torch.stack([image], dim=0).unsqueeze(0).to(device)
+    b, v, _, H, W = images.shape  # only H and W are used below
+
+    # Inference. (x + 1) * 0.5 presumably maps process_image's [-1, 1] output to [0, 1] — confirm.
+    with torch.no_grad():
+        gaussians, pred_context_pose, depth_dict = model.inference((images + 1) * 0.5)
+        gaussians_ori, pred_context_pose_ori, depth_dict_ori = model.inference(
+            (images_ori + 1) * 0.5
+        )
+    depth_ori = depth_dict_ori["depth"][0][0].squeeze().cpu().numpy()  # only the depth of the original image is kept
+    _save_depth_npy_and_viz(depth_ori, image_folder, "depth_ori")
+
+    # Camera parameters. AnySplat returns camera-to-world; we store world-to-camera.
+    pred_all_extrinsic = pred_context_pose["extrinsic"][0][0].inverse().cpu().numpy()
+    pred_all_intrinsic = pred_context_pose["intrinsic"][0][0].cpu().numpy()
+    print(f"Processing {os.path.basename(image_folder)}: converted intrinsics:")
+    print(
+        f"  fx: {pred_all_intrinsic[0, 0] * W:.2f}, "
+        f"fy: {pred_all_intrinsic[1, 1] * H:.2f}"
+    )
+    print(
+        f"  cx: {pred_all_intrinsic[0, 2] * W:.2f}, "
+        f"cy: {pred_all_intrinsic[1, 2] * H:.2f}"
+    )
+
+    # Scale normalised intrinsics to pixel units (row 0 by W, row 1 by H), in place.
+    pred_all_intrinsic[0, :] = pred_all_intrinsic[0, :] * W
+    pred_all_intrinsic[1, :] = pred_all_intrinsic[1, :] * H
+
+    np.save(Path(image_folder) / "extrinsic.npy", pred_all_extrinsic)
+    np.save(Path(image_folder) / "intrinsic.npy", pred_all_intrinsic)
+
+    intrinsic = pred_all_intrinsic
+    extrinsic = pred_all_extrinsic
+    gaussian_xyz = gaussians.means[0].detach().cpu().numpy()
+    depth = depth_dict["depth"][0][0].squeeze().cpu().numpy()
+    _save_depth_npy_and_viz(depth, image_folder, "depth")
+
+    # Asset directory.
+    assets_folder = os.path.join(image_folder, "3d_assets")
+    os.makedirs(assets_folder, exist_ok=True)
+
+    # Export the raw 3DGS reconstruction.
+    export_ply(
+        gaussians.means[0],
+        gaussians.scales[0],
+        gaussians.rotations[0],
+        gaussians.harmonics[0],
+        gaussians.opacities[0],
+        Path(assets_folder) / "bg.ply",
+    )
+
+    if not args.align_table:
+        print(
+            "Table alignment disabled (--no-align-table); "
+            "skipping bg_aligned.ply export."
+        )
+        print(f"Done. Outputs saved under: {image_folder}")
+        return
+
+    # Re-render depth from the splat point cloud (used for plane fitting).
+    depth_point = render_depth_from_points(gaussian_xyz, intrinsic, extrinsic, H, W)
+
+    mask_path = Path(image_folder) / "bg_mask.png"
+    if not mask_path.exists():
+        print(f"Warning: bg_mask.png not found, skipping table alignment: {mask_path}")
+        return
+
+    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE).astype(np.uint8)  # NOTE(review): imread returns None if unreadable
+    mask = shrink_mask_erode(mask, ratio=BG_MASK_SHRINK_RATIO)
+
+    result = compute_table_geometry_ransac(
+        depth=depth_point,
+        mask=mask,
+        intrinsic=intrinsic,
+        extrinsic=extrinsic,
+    )
+    print(f"\n{os.path.basename(image_folder)} table geometry:")
+    print(f"  length (m): {result['length']:.3f}")
+    print(f"  width  (m): {result['width']:.3f}")
+    print(f"  normal: {result['normal']}")
+
+    # Align the splat point cloud to the table frame.
+    points_table_world = align_points_to_table(
+        gaussian_xyz,
+        result["R_align_world"],
+        result["t_align_world"],
+    )
+    points_table_world = points_table_world - np.median(points_table_world, axis=0)  # re-centre on the per-axis median
+
+    # Use a robust quantile for scale so outliers don't dominate.
+    abs_points = np.abs(points_table_world)
+    ref_range = np.quantile(abs_points, SCALE_QUANTILE)  # one quantile over all |x|, |y|, |z| values pooled
+    scale_factor = ref_range / SCALE_TARGET_RANGE
+    points_table_world = points_table_world / scale_factor
+    gaussians.scales[0] = gaussians.scales[0] / scale_factor  # NOTE(review): rotations/harmonics are exported unchanged below — confirm intended
+
+    np.save(Path(image_folder) / "scale.npy", scale_factor)
+    print(f"  scale factor: {scale_factor:.3f}")
+
+    # Swap X/Y, flip Z, then apply optional placement offsets (default 0,0).
+    x = points_table_world[:, 0].copy()
+    y = points_table_world[:, 1].copy()
+    points_table_world[:, 0] = y
+    points_table_world[:, 1] = x
+    points_table_world[:, 2] *= -1
+    points_table_world[:, 2] += args.z_offset
+    points_table_world[:, 0] += args.x_offset
+
+    export_ply(
+        points_table_world,
+        gaussians.scales[0],
+        gaussians.rotations[0],
+        gaussians.harmonics[0],
+        gaussians.opacities[0],
+        Path(assets_folder) / "bg_aligned.ply",
+    )
+
+    print(
+        f"  Z range: min={points_table_world[:, 2].min():.3f}, "
+        f"max={points_table_world[:, 2].max():.3f}"
+    )
+    print(f"Done. Outputs saved under: {image_folder}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Reconstruct a 3D Gaussian model from a single image and emit the "
+            "associated camera intrinsics/extrinsics, depth maps, and an "
+            "optional table-aligned point cloud."
+        )
+    )
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Input directory or single file. Directories are searched recursively for clean_background.{png,jpg}.",
+    )
+    parser.add_argument(
+        "--model-id",
+        type=str,
+        default=DEFAULT_MODEL_ID,
+        help=f"HuggingFace model id to load (default: {DEFAULT_MODEL_ID}).",
+    )
+    parser.add_argument(
+        "--align-table",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Run RANSAC table alignment and export bg_aligned.ply. "
+            "Use --no-align-table to disable (only bg.ply will be emitted). "
+            "Default: enabled."
+        ),
+    )
+    parser.add_argument(
+        "--x-offset",
+        type=float,
+        default=DEFAULT_X_OFFSET,
+        help="X-axis offset (m) applied after table alignment. Default: 0 (origin).",
+    )
+    parser.add_argument(
+        "--z-offset",
+        type=float,
+        default=DEFAULT_Z_OFFSET,
+        help="Z-axis offset (m) applied after table alignment. Default: 0 (origin).",
+    )
+
+    args = parser.parse_args()
+
+    if os.path.isfile(args.input_dir):
+        input_dir = os.path.dirname(args.input_dir)
+    else:
+        input_dir = args.input_dir
+
+    print(f"Loading model: {args.model_id}")
+    model = AnySplat.from_pretrained(args.model_id)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = model.to(device)
+    model.eval()
+    for param in model.parameters():
+        param.requires_grad = False
+    print("Model loaded.")
+
+    clean_background_files = []
+    for root, _dirs, files in os.walk(input_dir):
+        for file in files:
+            if file.lower() in ("clean_background.png", "clean_background.jpg"):
+                clean_background_files.append(os.path.join(root, file))
+
+    print(f"Found {len(clean_background_files)} clean_background images.")
+
+    for idx, image_path in enumerate(clean_background_files, 1):
+        print(f"\nProcessing {idx}/{len(clean_background_files)}: {image_path}")
+        try:
+            process_single_image(image_path, model, device, args)
+            print(f"Successfully processed: {image_path}")
+        except Exception as e:
+            print(f"Error processing {image_path}: {e}")
+            import traceback
+            traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pipeline/mesh2mjcf.py b/pipeline/mesh2mjcf.py
new file mode 100644
index 0000000..7f1795b
--- /dev/null
+++ b/pipeline/mesh2mjcf.py
@@ -0,0 +1,674 @@
+"""Convert a single .obj or .stl mesh into MuJoCo MJCF assets.
+
+This is a generic mesh-to-MJCF converter, derived from
+github.com/discoverse-dev/DISCOVERSE/scripts/mesh2mjcf.py but stripped of any
+DISCOVERSE-specific imports or scene wiring. It is designed to consume the
+per-object meshes that this pipeline emits under `<scene>/3d_assets/<obj>.obj`,
+but works on any standalone mesh file.
+
+Output layout (under `-o/--output`, which defaults to the input file's parent —
+typically `scene_dir/3d_assets/` when consuming the v2 pipeline outputs):
+
+    <output>/
+        <asset_name>/                       (per-asset folder, named after the obj stem)
+            <asset_name>.obj                (copy of the input mesh)
+            <asset_name>.mtl                (if multi-material)
+            <texture files...>              (referenced by the MTL)
+            part_0.obj part_1.obj ...       (if --convex_decomposition)
+            mjcf/
+                <asset_name>.xml
+                <asset_name>_dependencies.xml
+
+Mesh paths inside the emitted XML are written as `<asset_name>/<file>`, so the
+consuming MuJoCo scene should set `meshdir` (and `texturedir`) to <output>.
+
+Examples:
+
+    # Basic conversion (default RGBA, mass, inertia; no free joint; no decomp).
+    python pipeline/mesh2mjcf.py path/to/cup.obj
+
+    # Specify RGBA, mass, inertia.
+    python pipeline/mesh2mjcf.py path/to/cup.obj \\
+        --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005
+
+    # Free-floating object.
+    python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint
+
+    # Convex decomposition for accurate collisions.
+    python pipeline/mesh2mjcf.py path/to/cup.obj -cd
+
+    # Preview in MuJoCo viewer after conversion.
+    python pipeline/mesh2mjcf.py path/to/cup.obj --verbose
+
+Notes:
+    - Multi-material OBJ files are auto-detected (via the MTL file) and split
+      into one sub-mesh per material; each material yields a MuJoCo
+      `<material>`, with textures (`map_Kd`) copied alongside.
+    - Convex decomposition requires `pip install coacd trimesh`.
+    - Material splitting requires `pip install trimesh`.
+"""
+
+import argparse
+import logging
+import os
+import re
+import shutil
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+
+# ===== MTL handling =====
+
+# MTL fields relevant to MuJoCo.
+MTL_FIELDS = (
+    "Ka",      # Ambient color
+    "Kd",      # Diffuse color
+    "Ks",      # Specular color
+    "d",       # Dissolve, i.e. opacity (used directly as alpha)
+    "Tr",      # Transparency (alpha = 1 - Tr)
+    "Ns",      # Shininess
+    "map_Kd",  # Diffuse texture map
+)
+
+
+@dataclass
+class Material:
+    """Convenience container for MTL → MuJoCo material conversion."""
+
+    name: str
+    Ka: Optional[str] = None
+    Kd: Optional[str] = None
+    Ks: Optional[str] = None
+    d: Optional[str] = None
+    Tr: Optional[str] = None
+    Ns: Optional[str] = None
+    map_Kd: Optional[str] = None
+
+    @staticmethod
+    def from_string(lines: Sequence[str]) -> "Material":
+        attrs = {"name": lines[0].split(" ")[1].strip()}
+        for line in lines[1:]:
+            for attr in MTL_FIELDS:
+                if line.startswith(attr):
+                    elems = line.split(" ")[1:]
+                    elems = [elem for elem in elems if elem != ""]
+                    attrs[attr] = " ".join(elems)
+                    break
+        return Material(**attrs)
+
+    def mjcf_rgba(self) -> str:
+        Kd = self.Kd or "1.0 1.0 1.0"
+        if self.d is not None:
+            alpha = self.d
+        elif self.Tr is not None:
+            alpha = str(1.0 - float(self.Tr))
+        else:
+            alpha = "1.0"
+        return f"{Kd} {alpha}"
+
+    def mjcf_shininess(self) -> str:
+        if self.Ns is not None:
+            # Ns values are typically 0-1000; normalize to [0, 1].
+            ns_val = float(self.Ns) / 1_000
+        else:
+            ns_val = 0.5
+        return f"{ns_val}"
+
+    def mjcf_specular(self) -> str:
+        if self.Ks is not None:
+            # Average the specular RGB to a scalar.
+            ks_val = sum(map(float, self.Ks.split(" "))) / 3
+        else:
+            ks_val = 0.5
+        return f"{ks_val}"
+
+
+def parse_mtl_name(lines: Sequence[str]) -> Optional[str]:
+    """Return the `.mtl` filename from the first matching `mtllib` line in *lines*, or None if no line matches."""
+    mtl_regex = re.compile(r"^mtllib\s+(.+?\.mtl)(?:\s*#.*)?\s*\n?$")
+    for line in lines:
+        match = mtl_regex.match(line)  # anchored at column 0: an indented `mtllib` line does not match
+        if match is not None:
+            return match.group(1)
+    return None
+
+
+def copy_obj_with_mtl(obj_source: Path, obj_target: Path) -> None:
+    """Copy an OBJ file, plus the MTL file it references (if any)."""
+    obj_target.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(obj_source, obj_target)
+
+    try:
+        with open(obj_source, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            if line.strip().startswith("mtllib "):
+                mtl_filename = line.strip().split()[1]
+                mtl_source = obj_source.parent / mtl_filename
+                mtl_target = obj_target.parent / mtl_filename
+                if mtl_source.exists():
+                    shutil.copy2(mtl_source, mtl_target)
+                    print(f"Copied MTL file: {mtl_source} -> {mtl_target}")
+                break
+    except Exception as e:
+        print(f"Warning: failed to check/copy MTL file for {obj_source}: {e}")
+
+
+def parse_mtl_file(mtl_path: Path) -> Dict[str, Material]:
+    """Parse an MTL file into a dict mapping material name to Material; a missing file gives an empty dict."""
+    materials: Dict[str, Material] = {}
+    if not mtl_path.exists():
+        return materials
+
+    with open(mtl_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    lines = [line for line in lines if not line.startswith("#")]  # only comments starting at column 0 are dropped
+    lines = [line for line in lines if line.strip()]  # drop blank lines
+    lines = [line.strip() for line in lines]
+
+    sub_mtls: List[List[str]] = []  # one list of lines per `newmtl` section
+    for line in lines:
+        if line.startswith("newmtl"):
+            sub_mtls.append([])
+        if sub_mtls:  # lines before the first `newmtl` are ignored
+            sub_mtls[-1].append(line)
+
+    for sub_mtl in sub_mtls:
+        if sub_mtl:
+            material = Material.from_string(sub_mtl)
+            materials[material.name] = material  # a repeated name overwrites the earlier section
+
+    return materials
+
+
+def split_obj_by_materials(
+    obj_path: Path, output_dir: Path
+) -> Tuple[Dict[str, Material], List[str]]:
+    """Split a multi-material OBJ into one sub-mesh file per trimesh geometry.
+
+    Returns (materials, submesh_files): the materials parsed from the referenced
+    MTL, and the `<stem>_<i>.obj` names written to output_dir. submesh_files is
+    empty when there are <= 1 materials, trimesh is missing, or splitting fails."""
+    materials: Dict[str, Material] = {}
+    submesh_files: List[str] = []
+
+    with open(obj_path, "r", encoding="utf-8") as f:
+        obj_lines = f.readlines()
+
+    mtl_name = parse_mtl_name(obj_lines)
+    if mtl_name:
+        mtl_path = obj_path.parent / mtl_name  # the MTL is resolved next to the OBJ
+        materials = parse_mtl_file(mtl_path)
+
+    if len(materials) <= 1:
+        return materials, []
+
+    try:
+        import trimesh  # optional dependency, imported lazily
+    except ImportError:
+        print("Warning: trimesh not installed; cannot split multi-material OBJ.")
+        return materials, []
+
+    try:
+        mesh = trimesh.load(
+            obj_path,
+            split_object=True,
+            group_material=True,
+            process=False,
+            maintain_order=False,
+        )
+
+        if isinstance(mesh, trimesh.base.Trimesh):
+            # Single mesh after grouping; nothing to split.
+            target_file = output_dir / f"{obj_path.stem}.obj"
+            shutil.copy(obj_path, target_file)
+            return materials, []
+
+        obj_stem = obj_path.stem
+        print(f"Splitting OBJ by material: {len(mesh.geometry)} sub-meshes")
+        for i, (material_name, geom) in enumerate(mesh.geometry.items()):  # NOTE(review): assumes geometry keys are material names — confirm
+            submesh_file = f"{obj_stem}_{i}.obj"
+            submesh_path = output_dir / submesh_file
+
+            geom.visual.material.name = material_name  # presumably so the export writes `usemtl <name>` — confirm
+            geom.export(str(submesh_path), include_texture=True, header=None)
+            submesh_files.append(submesh_file)
+            print(f"  saved sub-mesh: {submesh_file} (material: {material_name})")
+
+        # trimesh sometimes emits a stray `material.mtl` next to the export.
+        temp_mtl = output_dir / "material.mtl"
+        if temp_mtl.exists():
+            temp_mtl.unlink()
+
+        return materials, submesh_files
+    except Exception as e:  # best-effort: any failure falls back to the unsplit mesh
+        print(f"Warning: failed to split OBJ by material: {e}")
+        return materials, []
+
+
+# ===== XML builders =====
+
+def create_asset_xml(asset_name, convex_parts=None, materials=None, submesh_files=None):
+    """Build a `<mujocoinclude>` whose `<asset>` lists the materials, textures and meshes; file paths are `<asset_name>/<file>`."""
+    root = ET.Element("mujocoinclude")
+    asset = ET.SubElement(root, "asset")
+
+    if materials:
+        for material_name, material in materials.items():
+            material_elem = ET.SubElement(asset, "material")
+            material_elem.set("name", f"{asset_name}_{material_name}")  # prefixed with the asset name
+            material_elem.set("rgba", material.mjcf_rgba())
+            material_elem.set("specular", material.mjcf_specular())
+            material_elem.set("shininess", material.mjcf_shininess())
+
+            if material.map_Kd:
+                texture_elem = ET.SubElement(asset, "texture")
+                texture_elem.set("type", "2d")
+                texture_elem.set("name", f"{asset_name}_{material_name}_texture")
+                texture_elem.set("file", f"{asset_name}/{material.map_Kd}")
+
+                material_elem.set("texture", f"{asset_name}_{material_name}_texture")
+                material_elem.attrib.pop("rgba", None)  # textured materials carry no rgba
+
+    # Main mesh (only when not split by material).
+    if not submesh_files:
+        mesh_elem = ET.SubElement(asset, "mesh")
+        mesh_elem.set("name", asset_name)
+        mesh_elem.set("file", f"{asset_name}/{asset_name}.obj")
+
+    # Per-material sub-meshes.
+    if submesh_files:
+        for submesh_file in submesh_files:
+            submesh_name = submesh_file.replace(".obj", "")  # removes every ".obj" occurrence; create_geom_xml derives names the same way
+            part_mesh = ET.SubElement(asset, "mesh")
+            part_mesh.set("name", submesh_name)
+            part_mesh.set("file", f"{asset_name}/{submesh_file}")
+
+    # Convex-decomposition parts (convex_parts is a count, used with range()).
+    if convex_parts:
+        for i in range(convex_parts):
+            part_mesh = ET.SubElement(asset, "mesh")
+            part_mesh.set("name", f"{asset_name}_part_{i}")
+            part_mesh.set("file", f"{asset_name}/part_{i}.obj")
+
+    return root
+
+
+def create_geom_xml(
+    asset_name,
+    mass,
+    diaginertia,
+    rgba,
+    free_joint=False,
+    convex_parts=None,
+    materials=None,
+    submesh_files=None,
+    output_dir=None,
+):
+    """Build a `<mujocoinclude>` holding an optional free joint, the inertial element and the visual/collision geoms."""
+    root = ET.Element("mujocoinclude")
+
+    if free_joint:
+        joint_elem = ET.SubElement(root, "joint")
+        joint_elem.set("type", "free")
+
+    inertial_elem = ET.SubElement(root, "inertial")
+    inertial_elem.set("pos", "0 0 0")
+    inertial_elem.set("mass", str(mass))
+    inertial_elem.set(
+        "diaginertia", f"{diaginertia[0]} {diaginertia[1]} {diaginertia[2]}"
+    )
+
+    if submesh_files and materials:
+        # Multi-material: one geom per sub-mesh.
+        for submesh_file in submesh_files:
+            submesh_name = submesh_file.replace(".obj", "")  # same derivation as in create_asset_xml
+            geom_elem = ET.SubElement(root, "geom")
+            geom_elem.set("type", "mesh")
+            geom_elem.set("mesh", submesh_name)
+            geom_elem.set("class", "obj_visual")  # default class supplied by the including scene
+
+            material_assigned = False
+            submesh_path = Path(output_dir) / asset_name / submesh_file  # this branch needs output_dir to be set
+            if submesh_path.exists():
+                try:
+                    with open(submesh_path, "r", encoding="utf-8") as f:
+                        submesh_lines = f.readlines()
+                    for line in submesh_lines:
+                        line = line.strip()
+                        if line.startswith("usemtl "):
+                            mtl_name = line.split()[1]  # the first `usemtl` in the file decides the material
+                            geom_elem.set("material", f"{asset_name}_{mtl_name}")
+                            material_assigned = True
+                            break
+                except Exception as e:
+                    print(f"Warning: could not read sub-mesh {submesh_path}: {e}")
+
+            if not material_assigned:
+                geom_elem.set(
+                    "rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}"
+                )
+
+    elif materials and len(materials) == 1:
+        # Single material with possible texture.
+        geom_elem = ET.SubElement(root, "geom")
+        geom_elem.set("type", "mesh")
+        geom_elem.set("mesh", asset_name)
+        geom_elem.set("class", "obj_visual")
+        material_name = next(iter(materials))
+        geom_elem.set("material", f"{asset_name}_{material_name}")
+
+    elif convex_parts:
+        # Visual geom (full mesh) + collision geoms (convex parts).
+        visual_geom = ET.SubElement(root, "geom")  # no explicit type is set on this geom
+        visual_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}")
+        visual_geom.set("mesh", asset_name)
+        visual_geom.set("class", "obj_visual")
+
+        for i in range(convex_parts):
+            collision_geom = ET.SubElement(root, "geom")
+            collision_geom.set("type", "mesh")
+            collision_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}")
+            collision_geom.set("mesh", f"{asset_name}_part_{i}")
+
+    else:
+        # Simple solid-colour mesh geom.
+        geom_elem = ET.SubElement(root, "geom")
+        geom_elem.set("type", "mesh")
+        geom_elem.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}")
+        geom_elem.set("mesh", asset_name)
+
+    # When a material/sub-mesh path was taken AND convex decomposition is on,
+    # still emit invisible collision geoms.
+    if convex_parts and (submesh_files or (materials and len(materials) == 1)):
+        for i in range(convex_parts):
+            collision_geom = ET.SubElement(root, "geom")
+            collision_geom.set("type", "mesh")
+            collision_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} 0")  # alpha 0 hides the collision part
+            collision_geom.set("mesh", f"{asset_name}_part_{i}")
+
+    return root
+
+
+def save_xml_with_formatting(root, filepath):
+    """Indent `root` in place with two spaces and write it to `filepath` as UTF-8 without an XML declaration (ET.indent needs Python 3.9+)."""
+    ET.indent(root, space="  ", level=0)
+    tree = ET.ElementTree(root)
+    tree.write(filepath, encoding="utf-8", xml_declaration=False)
+
+
+def create_preview_xml(asset_name):
+    """Build a preview `<mujoco>` scene (floor, light, one body) that includes the asset's two generated XML files."""
+    root = ET.Element("mujoco")
+    root.set("model", "temp_preview_env")
+
+    option = ET.SubElement(root, "option")
+    option.set("gravity", "0 0 -9.81")
+
+    compiler = ET.SubElement(root, "compiler")
+    compiler.set("meshdir", ".")
+    compiler.set("texturedir", ".")
+
+    include = ET.SubElement(root, "include")  # asset definitions (meshes, materials, textures)
+    include.set("file", f"{asset_name}/mjcf/{asset_name}_dependencies.xml")
+
+    default = ET.SubElement(root, "default")
+    obj_default = ET.SubElement(default, "default")
+    obj_default.set("class", "obj_visual")  # the class referenced by the visual geoms in create_geom_xml
+    geom_default = ET.SubElement(obj_default, "geom")
+    geom_default.set("group", "2")
+    geom_default.set("type", "mesh")
+    geom_default.set("contype", "0")  # contype/conaffinity 0: visual geoms take no part in collisions
+    geom_default.set("conaffinity", "0")
+
+    worldbody = ET.SubElement(root, "worldbody")
+
+    floor_geom = ET.SubElement(worldbody, "geom")
+    floor_geom.set("name", "floor")
+    floor_geom.set("type", "plane")
+    floor_geom.set("size", "2 2 0.1")
+    floor_geom.set("rgba", ".8 .8 .8 1")
+
+    light = ET.SubElement(worldbody, "light")
+    light.set("pos", "0 0 3")
+    light.set("dir", "0 0 -1")
+
+    body = ET.SubElement(worldbody, "body")
+    body.set("name", asset_name)
+    body.set("pos", "0 0 0.5")
+
+    body_include = ET.SubElement(body, "include")  # the body's joint/inertial/geoms
+    body_include.set("file", f"{asset_name}/mjcf/{asset_name}.xml")
+
+    return root
+
+
+# ===== Main =====
+
+def _build_argparser():
+    """Build the command-line parser for the mesh -> MJCF converter.
+
+    Returns:
+        argparse.ArgumentParser: parser for ``input_file`` and its options.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert a .obj or .stl mesh into MuJoCo MJCF assets."
+    )
+    parser.add_argument(
+        "input_file", type=str, help="Path to the input mesh (.obj or .stl)."
+    )
+    parser.add_argument(
+        "--rgba", nargs=4, type=float, default=[0.5, 0.5, 0.5, 1],
+        help="Mesh RGBA colour. Default: [0.5, 0.5, 0.5, 1].",
+    )
+    parser.add_argument(
+        "--mass", type=float, default=0.001,
+        help="Mesh mass (kg). Default: 0.001.",
+    )
+    parser.add_argument(
+        "--diaginertia", nargs=3, type=float,
+        default=[0.00002, 0.00002, 0.00002],
+        help="Diagonal inertia tensor. Default: [2e-5, 2e-5, 2e-5].",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default=None,
+        help=(
+            # Must describe main(): the root is the input's parent directory and
+            # each asset gets its own `<root>/<asset_name>/` folder.
+            "Output assets root. Default: the input file's parent directory, "
+            "so that `scene_dir/3d_assets/foo.obj` writes to "
+            "`scene_dir/3d_assets/foo/`."
+        ),
+    )
+    parser.add_argument(
+        "--free_joint",
+        action="store_true",
+        help="Add a free joint so the body can move.",
+    )
+    parser.add_argument(
+        "-cd",
+        "--convex_decomposition",
+        action="store_true",
+        help=(
+            "Decompose the mesh into convex parts for accurate collision. "
+            "Requires `coacd` and `trimesh`."
+        ),
+    )
+    parser.add_argument(
+        "--scene",
+        action="store_true",
+        help="Use high-precision CoACD config (smaller threshold).",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true",
+        help="Open MuJoCo viewer with a preview scene after conversion.",
+    )
+    return parser
+
+
+def main():
+    """Convert the mesh named on the command line into MJCF assets.
+
+    Writes everything into `<output>/<asset_name>/` (wiped first) and, with
+    `--verbose`, opens a preview in the MuJoCo viewer. Exits via SystemExit on
+    bad input or missing optional dependencies.
+    """
+    args = _build_argparser().parse_args()
+
+    input_file = args.input_file
+    convex_de = args.convex_decomposition
+
+    output_assets_dir = (
+        str(Path(input_file).resolve().parent) if args.output is None else args.output
+    )
+
+    if convex_de:
+        try:
+            import coacd  # noqa: F401
+            import trimesh  # noqa: F401
+        except ImportError:
+            print(
+                "Error: `coacd` and `trimesh` are required for "
+                "--convex_decomposition. Install with `pip install coacd trimesh`."
+            )
+            raise SystemExit(1)
+
+    is_obj = input_file.endswith(".obj")
+    if not (is_obj or input_file.endswith(".stl")):
+        raise SystemExit(
+            f"Error: {input_file} is not a supported mesh type. Use .obj or .stl."
+        )
+    # Both supported suffixes are four characters long.
+    asset_name = os.path.basename(input_file)[:-4]
+
+    # Per-asset folder lives directly under <output>, with an `mjcf/` subfolder
+    # for the generated XML files. This way the whole asset (meshes + MTL +
+    # textures + convex parts + MJCF) is self-contained in one directory.
+    output_dir = os.path.join(output_assets_dir, asset_name)
+    mjcf_obj_dir = os.path.join(output_dir, "mjcf")
+    # The folder is wiped below; refuse to run if that would delete the input.
+    in_dir = Path(input_file).parent.resolve()
+    out_root = Path(output_dir).resolve()
+    if out_root == in_dir or out_root in in_dir.parents:
+        raise SystemExit(
+            f"Error: {input_file} is inside {output_dir}, which would be "
+            "deleted before conversion. Move the mesh or choose another --output."
+        )
+    if os.path.exists(output_dir):
+        shutil.rmtree(output_dir)
+    os.makedirs(mjcf_obj_dir)  # also creates output_dir
+
+    # Copy the mesh (and MTL if relevant) into the per-asset folder.
+    if is_obj:
+        copy_obj_with_mtl(Path(input_file), Path(output_dir) / Path(input_file).name)
+    else:
+        shutil.copy(input_file, output_dir)
+
+    # Material splitting (OBJ only).
+    materials: Dict[str, Material] = {}
+    submesh_files: List[str] = []
+    if is_obj:
+        print("Checking OBJ for multiple materials...")
+        obj_path = Path(output_dir) / f"{asset_name}.obj"
+        materials, submesh_files = split_obj_by_materials(obj_path, Path(output_dir))
+
+        # Copy referenced texture files (single or multi-material case).
+        input_parent = Path(input_file).parent
+        for material in materials.values():
+            if not material.map_Kd:
+                continue
+            texture_src = input_parent / material.map_Kd
+            if texture_src.exists():
+                texture_dst = Path(output_dir) / material.map_Kd
+                # `map_Kd` may point into a subfolder that does not exist yet.
+                texture_dst.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy(texture_src, texture_dst)
+                print(f"Copied texture: {material.map_Kd}")
+
+        if submesh_files:
+            print(f"Split into {len(submesh_files)} sub-meshes.")
+        elif len(materials) == 1:
+            print("Single material; no split needed.")
+        else:
+            print("No materials detected.")
+
+    convex_parts_count = 0
+    if convex_de:
+        import coacd
+        import trimesh
+
+        print(f"Running convex decomposition on {asset_name}...")
+        mesh = trimesh.load(input_file, force="mesh")
+        mesh_coacd = coacd.Mesh(mesh.vertices, mesh.faces)
+        # `--scene` selects the tighter CoACD settings; otherwise use defaults.
+        coacd_config = (
+            {"threshold": 0.01, "preprocess_resolution": 100} if args.scene else {}
+        )
+        parts = coacd.run_coacd(mesh_coacd, **coacd_config)
+
+        for i, part in enumerate(parts):
+            part_mesh = trimesh.Trimesh(vertices=part[0], faces=part[1])
+            part_mesh.export(os.path.join(output_dir, f"part_{i}.obj"))
+
+        convex_parts_count = len(parts)
+        print(f"{asset_name} decomposed into {convex_parts_count} convex parts.")
+
+    # Arguments shared by both XML builders; None is passed for unused features.
+    xml_parts = convex_parts_count if convex_de else None
+    xml_materials = materials if (submesh_files or len(materials) == 1) else None
+    xml_submeshes = submesh_files if submesh_files else None
+
+    # Emit the asset dependency XML.
+    asset_xml = create_asset_xml(asset_name, xml_parts, xml_materials, xml_submeshes)
+    asset_file_path = os.path.join(mjcf_obj_dir, f"{asset_name}_dependencies.xml")
+    save_xml_with_formatting(asset_xml, asset_file_path)
+
+    # Emit the body geom XML.
+    geom_xml = create_geom_xml(
+        asset_name, args.mass, args.diaginertia, args.rgba, args.free_joint,
+        xml_parts, xml_materials, xml_submeshes, output_assets_dir,
+    )
+    geom_file_path = os.path.join(mjcf_obj_dir, f"{asset_name}.xml")
+    save_xml_with_formatting(geom_xml, geom_file_path)
+
+    print(f"Converted {asset_name} to MJCF.")
+    print(f"  meshes: {output_dir}")
+    print(f"  dependencies: {asset_file_path}")
+    print(f"  body geom: {geom_file_path}")
+    if submesh_files:
+        print(
+            f"  material split: {len(submesh_files)} sub-meshes, "
+            f"{len(materials)} materials"
+        )
+
+    if args.verbose:
+        import subprocess
+        import sys
+
+        print("\nLaunching MuJoCo viewer...")
+        # Prefer the interpreter running this script; fall back to a PATH lookup.
+        py_dir = sys.executable or shutil.which("python") or shutil.which("python3")
+        if not py_dir:
+            print("Error: no `python`/`python3` on PATH; cannot launch viewer.")
+            raise SystemExit(1)
+
+        tmp_world_mjcf = os.path.join(output_assets_dir, "_tmp_preview.xml")
+        preview_xml = create_preview_xml(asset_name)
+        save_xml_with_formatting(preview_xml, tmp_world_mjcf)
+
+        # argv list, no shell: paths with spaces/metacharacters reach the viewer intact.
+        cmd = [py_dir, "-m", "mujoco.viewer", "--mjcf", tmp_world_mjcf]
+        print(f"Running: {' '.join(cmd)}")
+        subprocess.run(cmd, check=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pipeline/objects_generation.py b/pipeline/objects_generation.py
new file mode 100644
index 0000000..906c8c5
--- /dev/null
+++ b/pipeline/objects_generation.py
@@ -0,0 +1,455 @@
+import os
+os.environ["PYOPENGL_PLATFORM"] = "egl"
+import argparse
+import copy
+
+import numpy as np
+import torch
+import imageio
+from PIL import Image
+from scipy.spatial.transform import Rotation as R
+
+from pipeline.utils import (
+    clean_name,
+    load_image,
+    collect_mask_paths,
+    compute_fov_from_intrinsics,
+    mesh_rendering,
+    get_default_mesh_renderer,
+)
+# `inference` is exposed by submodule/Sam-3d-objects via PYTHONPATH; see the
+# top-level shell scripts.
+from inference import (
+    Inference,
+    make_scene,
+    render_gs_view,
+)
+
+
+# Homogeneous 4x4 transform applied to SAM-3D-Objects mesh outputs to bring
+# them into the world frame this pipeline operates in (rotates +Y -> +Z, etc.).
+_SAM3D_TO_WORLD = np.array(
+    [
+        [1, 0, 0, 0],  # x' =  x
+        [0, 0, -1, 0],  # y' = -z
+        [0, 1, 0, 0],  # z' =  y
+        [0, 0, 0, 1],  # no translation
+    ]
+)
+
+_DEFAULT_IMAGE_SIZE = (448, 448)  # (height, width) handed to the mesh renderer
+
+
+def _flip_xy(arr):
+    """Negate columns 0 and 1 (X, Y) of an (N, 3+) array or tensor in place; return it."""
+    for axis in (0, 1):
+        arr[:, axis] = -arr[:, axis]
+    return arr
+
+
+def _load_depth_with_fallback(image_dir, required_depth_path):
+    """Load `image_dir/depth_ori.npy` when it exists, else `required_depth_path`."""
+    preferred = os.path.join(image_dir, "depth_ori.npy")
+    # `depth_ori.npy` is the raw AnySplat output and wins over `depth.npy`.
+    chosen = preferred if os.path.exists(preferred) else required_depth_path
+    return np.load(chosen)
+
+
+def process_single_image(image_path, inference, args):
+    """Run inference on every mask of one image; export scaled mesh (.obj) and GS (.ply)."""
+    image_path = os.path.abspath(image_path)
+    image_dir = os.path.dirname(image_path)
+
+    # Optional scene scale factor produced by the AnySplat stage (1.0 if absent).
+    scale_factor_path = os.path.join(image_dir, "scale.npy")
+    if os.path.exists(scale_factor_path):
+        scale_factor = float(np.asarray(np.load(scale_factor_path)).squeeze())
+    else:
+        scale_factor = 1.0
+    print(f"Scale factor: {scale_factor}")
+
+    pil_image = load_image(image_path)
+    image_bg = np.array(pil_image)
+
+    masks_dir = os.path.join(image_dir, "masks")
+    mask_paths = collect_mask_paths(masks_dir)
+
+    assets_dir = os.path.join(image_dir, "3d_assets")
+    pt_dir = os.path.join(image_dir, "pt")
+
+    if not mask_paths:
+        print(f"Warning: No mask images found in {masks_dir}")
+        print("Creating placeholder directories and continuing...")
+        os.makedirs(assets_dir, exist_ok=True)
+        os.makedirs(pt_dir, exist_ok=True)
+        return
+
+    os.makedirs(assets_dir, exist_ok=True)
+    os.makedirs(pt_dir, exist_ok=True)
+
+    required_files = {  # per-image camera/depth inputs; all three must exist
+        "extrinsic": os.path.join(image_dir, "extrinsic.npy"),
+        "intrinsic": os.path.join(image_dir, "intrinsic.npy"),
+        "depth": os.path.join(image_dir, "depth.npy"),
+    }
+    missing_files = [name for name, p in required_files.items() if not os.path.exists(p)]
+    if missing_files:
+        print(f"Warning: Missing required files: {missing_files}")
+        print("These files should be generated by the AnySplat pipeline first.")
+        return
+
+    extrinsics = np.load(required_files["extrinsic"])
+    intrinsics = np.load(required_files["intrinsic"])
+    depth_anysplat = _load_depth_with_fallback(image_dir, required_files["depth"])
+
+    fx_pixels = intrinsics[0, 0]
+    fy_pixels = intrinsics[1, 1]
+
+    image_size = _DEFAULT_IMAGE_SIZE  # (height, width), see the renderer call below
+    _, fov_y = compute_fov_from_intrinsics(fx_pixels, fy_pixels, image_size, degrees=True)
+    mesh_renderer = get_default_mesh_renderer(width=image_size[1], height=image_size[0])
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    for i, mask_path in enumerate(mask_paths):
+        print(f"\n[{i+1}/{len(mask_paths)}] Processing mask: {mask_path}")
+
+        # ===== Load and binarize mask =====
+        mask_ = np.array(Image.open(mask_path).convert("L"))
+        mask = np.where(mask_ > 0, 1, 0).astype("uint8")
+        size_ori = np.sum(mask)  # mask area in pixels
+
+        depth_fg = depth_anysplat[mask > 0]  # assumes depth and mask share a shape — TODO confirm
+        if len(depth_fg) == 0:
+            print("Warning: Mask has no valid depth values, skipping image.")
+            return  # NOTE(review): also abandons the remaining masks of this image
+        mean_depth_ori = depth_fg.mean()
+        min_depth_ori = depth_fg.min()
+        max_depth_ori = depth_fg.max()
+        print(
+            f"Depth in mask region: mean={mean_depth_ori:.4f}, "
+            f"min={min_depth_ori:.4f}, max={max_depth_ori:.4f}"
+        )
+
+        mask_stem = clean_name(os.path.splitext(os.path.basename(mask_path))[0])
+        save_path = os.path.join(pt_dir, f"{mask_stem}.pt")
+
+        # ===== Run or load inference (cached .pt is unpickled; trust its source) =====
+        if os.path.exists(save_path):
+            print(f"Loading cached inference result: {save_path}")
+            out = torch.load(save_path, map_location=device, weights_only=False)
+        else:
+            print("Running inference on mask...")
+            out = inference(image_bg, mask, seed=args.seed)
+            if args.save_pt:
+                torch.save(out, save_path)
+                print(f"Saved inference result: {save_path}")
+
+        gs_origin = copy.deepcopy(out["gs"])  # separate copy, rescaled and saved at the end
+
+        # ===== Optional intermediate GS preview before mesh alignment =====
+        if args.save_intermediate:
+            single_scene = make_scene(out)
+            xyz_cv = _flip_xy(single_scene.get_xyz.clone())
+            single_scene.from_xyz(xyz_cv)
+            image_gs = render_gs_view(
+                single_scene, extrinsics=extrinsics, fov_y=fov_y / 180 * np.pi
+            )
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_1_gs.png"), image_gs
+            )
+            single_scene.save_ply(
+                os.path.join(assets_dir, f"{mask_stem}_gs_with_inferenced_pose.ply")
+            )
+        else:
+            single_scene = None
+
+        # ===== Pose parameters from SAM-3D-Objects =====
+        rotation_output = out["rotation"].cpu().numpy()
+        translation_output = out["translation"].cpu().numpy()
+        scale_output = out["scale"].squeeze(0).cpu().numpy()
+
+        print(f"Rotation (quaternion): {rotation_output}")
+        print(f"Translation: {translation_output}")
+        print(f"Scale: {scale_output}")
+
+        if not out["glb"]:
+            # No mesh produced for this object; skip to cleanup.
+            if single_scene is not None:
+                del single_scene
+            del out
+            torch.cuda.empty_cache()
+            continue
+
+        mesh = out["glb"]
+        mesh.apply_transform(_SAM3D_TO_WORLD)
+        mesh_origin = copy.deepcopy(mesh)
+
+        # The pose transform below is only used to estimate object size from the
+        # current view; the exported asset stays at the origin.
+        quat = copy.deepcopy(rotation_output)
+        rot = R.from_quat(quat, scalar_first=True).as_matrix().squeeze(0)
+        inverse_rot = np.linalg.inv(rot)
+
+        scale = np.broadcast_to(np.asarray(scale_output, dtype=float), (3,)).copy()
+        scale_mat = np.diag(scale)
+
+        transform = np.eye(4)
+        transform[:3, :3] = inverse_rot @ scale_mat
+        transform[:3, 3] = copy.deepcopy(translation_output)
+        mesh.apply_transform(transform)
+        _flip_xy(mesh.vertices)  # in place; same X/Y flip as the GS preview above
+
+        if args.save_intermediate:
+            mesh.export(os.path.join(assets_dir, f"{mask_stem}_mesh_with_inferenced_pose.obj"))
+
+        # ===== Render to recover scale by area + depth ratio =====
+        mesh_copy = copy.deepcopy(mesh)
+        color, depth = mesh_rendering(
+            mesh=mesh_copy,
+            extrinsics=extrinsics,
+            fov_y=fov_y / 180 * np.pi,
+            renderer=mesh_renderer,
+        )
+        if args.save_intermediate:
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_1_mesh.png"), color
+            )
+
+        valid_depth = depth[depth > 0]
+        if len(valid_depth) == 0:  # nothing rendered with depth > 0; skip this mask
+            if single_scene is not None:
+                del single_scene
+            del out
+            torch.cuda.empty_cache()
+            continue
+
+        mean_depth_sam3d = np.mean(valid_depth)
+        size_new = np.sum(depth > 0)
+        scale_factor_local = (  # sqrt(mask area / rendered area) * depth ratio
+            np.sqrt(size_ori / size_new) * (mean_depth_ori / mean_depth_sam3d)
+        )
+        mesh.apply_scale(scale_factor_local)
+
+        mesh_copy = mesh.copy()
+        color, depth = mesh_rendering(
+            mesh=mesh_copy,
+            extrinsics=extrinsics,
+            fov_y=fov_y / 180 * np.pi,
+            renderer=mesh_renderer,
+        )
+        valid_depth = depth[depth > 0]
+        mean_depth_sam3d_2 = np.mean(valid_depth)  # NOTE(review): no empty check, unlike the first render
+        z_shift_2 = mean_depth_ori - mean_depth_sam3d_2
+        mesh.vertices = mesh.vertices + np.array([0, 0, z_shift_2])
+
+        if args.save_intermediate:
+            transformed_mesh_path = os.path.join(assets_dir, f"{mask_stem}_mesh_final.obj")
+            mesh.export(transformed_mesh_path)
+            print(f"Saved transformed mesh: {transformed_mesh_path}")
+            color, _ = mesh_rendering(
+                mesh=mesh,
+                extrinsics=extrinsics,
+                fov_y=fov_y / 180 * np.pi,
+                renderer=mesh_renderer,
+            )
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_mesh.png"), color
+            )
+
+        # ===== Final export at origin (mesh + GS) =====
+        total_scale = float(scale_factor_local * scale_output[0]) / scale_factor
+        print(
+            f"Total scaling: {total_scale:.4f} "
+            f"(local_scale={scale_factor_local:.4f}, "
+            f"object_scale={scale_output[0]:.4f}, scene_scale={scale_factor})"
+        )
+
+        mesh_origin.apply_scale(total_scale)
+        resized_mesh_path = os.path.join(assets_dir, f"{mask_stem}.obj")
+        mesh_origin.export(resized_mesh_path)
+        print(f"Saved resized mesh for mujoco: {resized_mesh_path}")
+
+        final_mesh_mean_xyz = np.mean(mesh_origin.vertices, axis=0)
+        mean_xyz_path = os.path.join(assets_dir, f"{mask_stem}_keyframe.npy")
+        np.save(mean_xyz_path, final_mesh_mean_xyz)
+        print(
+            f"Final mesh mean XYZ: "
+            f"[{final_mesh_mean_xyz[0]:.6f}, {final_mesh_mean_xyz[1]:.6f}, "
+            f"{final_mesh_mean_xyz[2]:.6f}]"
+        )
+        print(f"Saved final mesh mean XYZ to: {mean_xyz_path}")
+
+        if args.save_intermediate:
+            # Apply the same transform to the GS scene so the debug snapshot
+            # matches the mesh.
+            xyz_cv = single_scene.get_xyz.clone() * scale_factor_local
+            single_scene.from_xyz(xyz_cv)
+
+            scale_t = single_scene.get_scaling * scale_factor_local
+            single_scene.mininum_kernel_size *= scale_factor_local
+            scale_t = torch.maximum(
+                scale_t,
+                torch.tensor(
+                    gs_origin.mininum_kernel_size * 1.1,  # NOTE(review): not single_scene's — confirm
+                    device=scale_t.device,
+                    dtype=scale_t.dtype,
+                ),
+            )
+            single_scene.from_scaling(scale_t)
+
+            xyz_cv = single_scene.get_xyz.clone()
+            xyz_cv[:, 2] = xyz_cv[:, 2] + z_shift_2
+            single_scene.from_xyz(xyz_cv)
+
+            single_ply_path = os.path.join(assets_dir, f"{mask_stem}_gs_final.ply")
+            single_scene.save_ply(single_ply_path)
+            print(f"Saved transformed Gaussian: {single_ply_path}")
+            image_gs = render_gs_view(
+                single_scene, extrinsics=extrinsics, fov_y=fov_y / 180 * np.pi
+            )
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_gs.png"), image_gs
+            )
+
+        # Scale the original GS to mujoco units and save.
+        xyz = gs_origin.get_xyz * total_scale
+        gs_origin.from_xyz(xyz)
+
+        scale_t = gs_origin.get_scaling * total_scale
+        gs_origin.mininum_kernel_size *= total_scale
+        scale_t = torch.maximum(
+            scale_t,
+            torch.tensor(
+                gs_origin.mininum_kernel_size * 1.1,
+                device=scale_t.device,
+                dtype=scale_t.dtype,
+            ),
+        )
+        gs_origin.from_scaling(scale_t)
+
+        origin_ply_path = os.path.join(assets_dir, f"{mask_stem}.ply")
+        gs_origin.save_ply(origin_ply_path)
+        print(f"Saved resized Gaussian for mujoco: {origin_ply_path}")
+
+        if single_scene is not None:
+            del single_scene
+        del out
+        torch.cuda.empty_cache()
+
+        print(f"Completed processing mask: {mask_stem}")
+
+    print(f"\nAll masks processed for image: {image_path}")
+
+
+def main():
+    """Find every `--image-name` file under `--input-dir` and process each one.
+
+    Images are discovered before the model is loaded, so an empty input
+    directory is reported without paying the model start-up cost. A failure
+    on one image is logged and does not stop the remaining images.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Run SAM-3D-Objects multi-object inference, save outputs to .pt, "
+            "and reconstruct per-object Gaussian (.ply) and mesh (.obj) assets."
+        )
+    )
+    parser.add_argument(
+        "--project-root", type=str, default="submodule/Sam-3d-objects",
+        help="Root directory of the sam-3d-objects project.",
+    )
+    parser.add_argument(
+        "--input-dir", type=str, required=True,
+        help="Input directory containing image folders.",
+    )
+    parser.add_argument(
+        "--image-name", type=str, default="input_image.png",
+        help="Name of the image file to process in each folder.",
+    )
+    parser.add_argument(
+        "--tag",
+        type=str,
+        default="hf",
+        help=(
+            "Checkpoint tag, corresponds to "
+            "submodule/Sam-3d-objects/checkpoints/{tag}/pipeline.yaml"
+        ),
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42,
+        help="Random seed passed into Inference.__call__.",
+    )
+    parser.add_argument(
+        "--save-pt", action="store_true",
+        help="Save inference results to pt/*.pt. By default results are not saved.",
+    )
+    parser.add_argument(
+        "--save-intermediate", action="store_true",
+        help="Save intermediate debug outputs (e.g. *_1_gs.png, *_1_mesh.png).",
+    )
+
+    args = parser.parse_args()
+
+    if os.path.isfile(args.input_dir):
+        input_dir = os.path.dirname(os.path.abspath(args.input_dir))
+    else:
+        input_dir = os.path.abspath(args.input_dir)
+    project_root = os.path.abspath(args.project_root)
+
+    print(f"Project root (Sam-3d-objects): {project_root}")
+    print(f"Input directory: {input_dir}")
+    print(f"Looking for image files named: {args.image_name}")
+
+    # Sorted so the processing order does not depend on os.walk's listing order.
+    image_files = sorted(
+        os.path.join(root, args.image_name)
+        for root, _dirs, files in os.walk(input_dir)
+        if args.image_name in files
+    )
+    print(f"Found {len(image_files)} image files to process")
+
+    if not image_files:
+        print(f"No {args.image_name} files found in {input_dir}")
+        print("Directory structure:")
+        for root, _dirs, files in os.walk(input_dir):
+            level = root.replace(input_dir, "").count(os.sep)
+            indent = " " * 2 * level
+            print(f"{indent}{os.path.basename(root)}/")
+            subindent = " " * 2 * (level + 1)
+            for file in files:
+                if file.lower().endswith((".png", ".jpg", ".jpeg")):
+                    print(f"{subindent}{file}")
+        return
+
+    # Load the model only once there is work to do.
+    config_path = os.path.join(project_root, "checkpoints", args.tag, "pipeline.yaml")
+    print(f"Loading model from config: {config_path}")
+    inference = Inference(config_path, compile=False)
+    print("Model loaded successfully")
+
+    failed = []
+    for idx, image_path in enumerate(image_files, 1):
+        print(f"\n{'=' * 80}")
+        print(f"Processing image {idx}/{len(image_files)}")
+        print(f"Image path: {image_path}")
+        print(f"{'=' * 80}")
+
+        try:
+            process_single_image(image_path, inference, args)
+            print(f"Successfully processed: {image_path}")
+        except Exception as e:
+            print(f"Error processing {image_path}: {e}")
+            import traceback
+            traceback.print_exc()
+            failed.append(image_path)
+
+    print(f"\n{'=' * 80}")
+    print(f"All processing completed. Processed {len(image_files)} images.")
+    if failed:
+        print(f"Failed on {len(failed)} of {len(image_files)} images: {failed}")
+    print(f"{'=' * 80}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pipeline/reconstruct_from_pt.py b/pipeline/reconstruct_from_pt.py
deleted file mode 100644
index 929e426..0000000
--- a/pipeline/reconstruct_from_pt.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import os
-import glob
-import argparse
-
-import torch
-
-from inference import (
-    make_scene,
-    ready_gaussian_for_video_rendering,
-    render_video,
-    interactive_visualizer,
-)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Load saved *.pt and reconstruct single & multi-object Gaussian .ply"
-    )
-    parser.add_argument(
-        "--project-root",
-        type=str,
-        default="sam-3d-objects",
-        help="Root directory of sam-3d-objects project.",
-    )
-    parser.add_argument(
-        "--save-dir",
-        type=str,
-        default="sam-3d-objects/torch_save_pt",
-        help="Directory containing *.pt files.",
-    )
-    parser.add_argument(
-        "--image-path",
-        type=str,
-        default="sam3/assets/img.jpg",
-        help="Original image path (used only to derive IMAGE_NAME).",
-    )
-    parser.add_argument(
-        "--export-gif",
-        action="store_true",
-        help="If set, render GIFs for each object and the merged scene.",
-    )
-    args = parser.parse_args()
-
-    project_root = args.project_root
-    image_path = args.image_path
-    image_name = os.path.basename(os.path.dirname(image_path))
-
-    # 这里不再限定 object_*.pt，而是把 save-dir 下所有 .pt 都吃掉
-    paths = sorted(glob.glob(os.path.join(args.save_dir, "*.pt")))
-    if not paths:
-        raise RuntimeError(f"No .pt found under {args.save_dir}")
-
-    print(f"Found {len(paths)} .pt files:")
-    for p in paths:
-        print("  ", p)
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # 单物体输出目录
-    single_gauss_dir = os.path.join(project_root, "gaussians", "single")
-    os.makedirs(single_gauss_dir, exist_ok=True)
-
-    # 合并场景要用到的 outputs
-    outputs = []
-
-    if args.export_gif:
-        import imageio
-
-    # =========================
-    # 1️⃣ 遍历每个 .pt：导出单物体 PLY (+ 可选 GIF)
-    # =========================
-    for idx, p in enumerate(paths):
-        print(f"[{idx+1}/{len(paths)}] loading {p}")
-        out = torch.load(p, map_location=device)
-        # 输出out 的dict键
-        print(f"  Output keys: {list(out.keys())}")
-        
-        outputs.append(out)
-
-        # 只用 make_scene，不做 ready_gaussian_for_video_rendering
-        single_scene = make_scene(out)
-
-        stem = os.path.splitext(os.path.basename(p))[0]
-        single_ply_path = os.path.join(single_gauss_dir, f"{stem}.ply")
-        single_scene.save_ply(single_ply_path)
-        print(f"🟢 Saved single-object PLY: {single_ply_path}")
-
-        if args.export_gif:
-            video = render_video(
-                single_scene,
-                r=1,
-                fov=60,
-                resolution=512,
-            )["color"]
-
-            single_gif_path = os.path.join(single_gauss_dir, f"{stem}.gif")
-            imageio.mimsave(
-                single_gif_path,
-                video,
-                format="GIF",
-                duration=1000 / 30,  # 30fps
-                loop=0,
-            )
-            print(f"🎞️ Saved single-object GIF: {single_gif_path}")
-
-        # 如果显存很紧张，可以在这里 del single_scene / video 等
-        del single_scene
-
-    print("✅ All single-object scenes exported.")
-
-    # =========================
-    # 2️⃣ 合并多对象场景：PLY (+ 可选 GIF)
-    # =========================
-    scene_gs = make_scene(*outputs)
-    scene_gs = ready_gaussian_for_video_rendering(scene_gs)
-
-    gauss_dir = os.path.join(project_root, "gaussians", "multi")
-    os.makedirs(gauss_dir, exist_ok=True)
-
-    ply_path = os.path.join(gauss_dir, f"{image_name}.ply")
-    scene_gs.save_ply(ply_path)
-    print(f"✅ Saved merged PLY: {ply_path}")
-
-    if args.export_gif:
-        video = render_video(
-            scene_gs,
-            r=1,
-            fov=60,
-            resolution=512,
-        )["color"]
-
-        gif_path = os.path.join(gauss_dir, f"{image_name}.gif")
-        imageio.mimsave(
-            gif_path,
-            video,
-            format="GIF",
-            duration=1000 / 30,  # 30fps
-            loop=0,
-        )
-        print(f"✅ Saved merged GIF: {gif_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pipeline/run_sam3_agent_full.py b/pipeline/run_sam3_agent_full.py
deleted file mode 100644
index c6b0290..0000000
--- a/pipeline/run_sam3_agent_full.py
+++ /dev/null
@@ -1,456 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-prompt + image -> SAM3 多物体分割 mask 的完整脚本：
-
-1. 用 Qwen3-VL-8B-Thinking 看图，生成若干条英文物体描述 prompt_list
-2. 对每条 prompt 调用 SAM3 分割：
-   - 输出到 agent_output_multi/obj_i/*.json
-   - json 里包含 pred_masks（RLE）、overlay 图路径等
-3. 将所有 obj_i/*.json 里的 pred_masks 解码为 PNG 二值 mask：
-   - 保存到 agent_output_multi/masks/obj_i/<json_name>/mask_k.png
-
-之后，你的 run_sam3d_multi.py 里的 --mask-root
-可以直接指向 agent_output_multi/masks。
-"""
-
-import os
-import ast
-import json
-import argparse
-from functools import partial
-from typing import Optional
-
-import numpy as np
-import torch
-from PIL import Image
-import pycocotools.mask as mask_util
-
-import sam3
-from sam3 import build_sam3_image_model
-from sam3.model.sam3_image_processor import Sam3Processor
-from sam3.agent.client_llm import send_generate_request as send_generate_request_orig
-from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig
-
-
-# =========================
-# 0. 环境变量（可按需精简）
-# =========================
-
-
-
-# =========================
-# 1. LLM 配置（Qwen3-VL）
-# =========================
-
-LLM_CONFIGS = {
-    # vLLM-served models
-    "qwen3_vl_8b_thinking": {
-        "provider": "vllm",
-        # model 不再写死，在 build_llm_config 时通过参数传入
-        "model": None,
-    },
-}
-
-
-def build_llm_config(
-    name: str = "qwen3_vl_8b_thinking",
-    model_id: Optional[str] = None,
-):
-    """
-    构建 LLM config：
-    - name: 在 LLM_CONFIGS 里的 key
-    - model_id: 要发给 vLLM 的模型名称（需与 --served-model-name 一致）
-    """
-    cfg = LLM_CONFIGS[name].copy()
-    cfg["name"] = name
-    cfg["api_key"] = "LOCAL_VLLM"
-
-    if model_id is not None:
-        cfg["model"] = model_id
-    elif cfg.get("model") is None:
-        raise ValueError(
-            "LLM model id is not set. Please pass --llm-model-id to match vLLM --served-model-name."
-        )
-
-    if cfg["provider"] == "vllm":
-        server_url = "http://127.0.0.1:8001/v1"
-    else:
-        server_url = cfg["base_url"]
-
-    return cfg, server_url
-
-
-# =========================
-# 2. SAM3 模型构建
-# =========================
-
-def build_sam3_processor() -> Sam3Processor:
-    sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..")
-    bpe_path = f"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz"
-    model = build_sam3_image_model(bpe_path=bpe_path)
-    processor = Sam3Processor(model, confidence_threshold=0.5)
-    return processor
-
-
-# =========================
-# 3. Qwen 生成场景 prompt_list
-# =========================
-
-def generate_scene_prompts_with_qwen(
-    image_path: str,
-    send_generate_request,
-    llm_config: dict,
-    max_prompts: int = 12,
-    system_prompt_path: str = "examples/system_prompt_scene_prompts.txt",
-):
-    """
-    1. 调 Qwen3-VL-8B-Thinking，看图生成可分割对象的英文短 prompt 列表。
-    2. 更鲁棒地解析 <prompt_list>...[...]...</prompt_list>，在缺少 closing tag 时也能工作。
-    3. 自动清洗掉 </think> 等无效内容。
-    """
-
-    # 1) 读取 system prompt
-    if not os.path.exists(system_prompt_path):
-        raise FileNotFoundError(f"system prompt file not found: {system_prompt_path}")
-
-    with open(system_prompt_path, "r", encoding="utf-8") as f:
-        system_prompt = f.read().strip()
-
-    # 2) 构造 messages（带 image_url）
-    image_path = os.path.abspath(image_path)
-    image_url = f"file://{image_path}"
-
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": (
-                        "You are given the image above. "
-                        "Follow the instructions in the system prompt to analyze the scene, "
-                        "then output both <analysis>...</analysis> and <prompt_list>...</prompt_list>. "
-                        "Do NOT omit the <prompt_list> block. The <prompt_list> block must be a valid Python list of strings."
-                    ),
-                },
-                {"type": "image_url", "image_url": {"url": image_url}},
-            ],
-        },
-    ]
-
-    # 3) 调用 vLLM / Qwen
-    resp = send_generate_request(messages=messages)
-
-    # 4) 统一拿到 raw_text
-    if isinstance(resp, str):
-        raw_text = resp
-    elif isinstance(resp, dict):
-        try:
-            raw_text = resp["choices"][0]["message"]["content"]
-        except Exception:
-            raw_text = str(resp)
-    else:
-        try:
-            raw_text = resp.choices[0].message.content
-        except Exception:
-            raw_text = str(resp)
-
-    raw_text = raw_text.strip()
-
-    # ---------------------------
-    # 5) 尝试从 <prompt_list> 中抽取“[...]”这段
-    # ---------------------------
-    list_block = raw_text
-
-    # 先截掉 <prompt_list> 前面的分析内容
-    if "<prompt_list>" in raw_text:
-        after_tag = raw_text.split("<prompt_list>", 1)[1]
-        list_block = after_tag
-    # 如果有 closing tag，再截掉后面
-    if "</prompt_list>" in list_block:
-        list_block = list_block.split("</prompt_list>", 1)[0]
-
-    # 从 list_block 中找第一个 '[' 和最后一个 ']'，尽量拿到一个完整的 Python list 字符串
-    inner = None
-    start = list_block.find("[")
-    end = list_block.rfind("]")
-    if start != -1 and end != -1 and end > start:
-        inner = list_block[start : end + 1].strip()
-
-    # 如果还是没拿到，就 fallback：把整个 list_block 当作 inner
-    if inner is None:
-        inner = list_block.strip()
-
-    # ---------------------------
-    # 6) 解析 inner -> Python list[str]
-    # ---------------------------
-    prompt_list: list[str] = []
-
-    # 优先 literal_eval
-    try:
-        data = ast.literal_eval(inner)
-        if isinstance(data, list):
-            prompt_list = [
-                s.strip()
-                for s in data
-                if isinstance(s, str) and s.strip()
-            ]
-        else:
-            raise ValueError("parsed object is not a list")
-    except Exception:
-        # fallback：行级解析（更严格一点，只收“看起来像短 prompt”的行）
-        lines = [l.strip() for l in inner.splitlines() if l.strip()]
-        tmp: list[str] = []
-        for l in lines:
-            # 跳过明显是 tag 或分析段落的行
-            if l.startswith("<") and l.endswith(">"):
-                continue
-            if l in ("<think>", "</think>"):
-                continue
-
-            # 如果是形如 1. xxx / 2) xxx
-            if l[0].isdigit():
-                parts = l.split(maxsplit=1)
-                if len(parts) == 2:
-                    candidate = parts[1].lstrip(".)").strip()
-                else:
-                    candidate = l
-            else:
-                candidate = l
-
-            # 简单过滤掉过长的整段分析（比如一个大段落 > 200 字符）
-            if len(candidate) > 200:
-                continue
-
-            if candidate:
-                tmp.append(candidate)
-
-        prompt_list = tmp
-
-    # ---------------------------
-    # 7) 最后再清洗一遍 prompt_list
-    # ---------------------------
-    cleaned: list[str] = []
-    for s in prompt_list:
-        s = s.strip()
-        if not s:
-            continue
-        # 丢掉残余的 tag / think
-        if s.startswith("<") and s.endswith(">"):
-            continue
-        if s in ("<think>", "</think>"):
-            continue
-        cleaned.append(s)
-
-    prompt_list = cleaned[:max_prompts]
-    return raw_text, prompt_list
-
-
-# =========================
-# 4. JSON → PNG mask 工具
-# =========================
-
-def safe_name(name: str) -> str:
-    """简单处理一下名字中的空格，避免路径问题。"""
-    return name.replace(" ", "_")
-
-
-def decode_rle_mask(counts: str, h: int, w: int) -> np.ndarray:
-    """将 SAM3/COCO RLE 字符串解码为 (h, w) 的 0/1 uint8 mask。"""
-    rle = {"counts": counts.encode("utf-8"), "size": [h, w]}
-    mask = mask_util.decode(rle)   # (h, w, 1) 或 (h, w)
-    if mask.ndim == 3:
-        mask = mask[:, :, 0]
-    return mask.astype(np.uint8)
-
-
-def convert_agent_json_to_masks(agent_root: str):
-    """
-    遍历 agent_root 下的 obj_*/ 目录，
-    把所有 json 里的 pred_masks 解码为 PNG mask。
-
-    输出结构：
-      agent_root/masks/obj_i/<json_basename>/mask_k.png
-    """
-    agent_root = os.path.abspath(agent_root)
-    mask_root = os.path.join(agent_root, "masks")
-    os.makedirs(mask_root, exist_ok=True)
-
-    print(f"[INFO] Converting JSON → PNG masks under: {agent_root}")
-    print(f"[INFO] Masks will be saved to: {mask_root}")
-
-    for obj_name in os.listdir(agent_root):
-        obj_dir = os.path.join(agent_root, obj_name)
-        if not os.path.isdir(obj_dir):
-            continue
-        if os.path.abspath(obj_dir) == os.path.abspath(mask_root):
-            continue
-
-        safe_obj_name = safe_name(obj_name)
-        obj_mask_root = os.path.join(mask_root, safe_obj_name)
-        os.makedirs(obj_mask_root, exist_ok=True)
-
-        print(f"\n=== Scanning folder: {obj_dir} → {obj_mask_root} ===")
-
-        for root, _, files in os.walk(obj_dir):
-            for fname in files:
-                if not fname.endswith(".json"):
-                    continue
-
-                json_path = os.path.join(root, fname)
-
-                try:
-                    with open(json_path, "r") as f:
-                        data = json.load(f)
-                except Exception as e:
-                    print(f"  [SKIP] Failed to load {json_path}: {e}")
-                    continue
-
-                # 某些是 list（history log），直接跳过
-                if not isinstance(data, dict):
-                    print(f"  [SKIP] {json_path}: json is list, not mask dict")
-                    continue
-
-                pred_masks = data.get("pred_masks")
-                if not pred_masks:
-                    print(f"  [SKIP] {json_path}: no pred_masks")
-                    continue
-
-                h = data.get("orig_img_h")
-                w = data.get("orig_img_w")
-                if h is None or w is None:
-                    print(f"  [SKIP] {json_path}: missing height/width")
-                    continue
-
-                json_basename = os.path.splitext(os.path.basename(json_path))[0]
-                safe_json_basename = safe_name(json_basename)
-
-                out_dir = os.path.join(obj_mask_root, safe_json_basename)
-                os.makedirs(out_dir, exist_ok=True)
-
-                print(f"  [OK] {json_path}: {len(pred_masks)} masks → {out_dir}")
-
-                scores = data.get("pred_scores", [])
-                for i, counts in enumerate(pred_masks):
-                    mask = decode_rle_mask(counts, h, w)
-
-                    mask_save_path = os.path.join(out_dir, f"mask_{i+1}.png")
-                    Image.fromarray(mask * 255).save(mask_save_path)
-
-                    score_str = f", score={scores[i]:.3f}" if i < len(scores) else ""
-                    print(f"    saved mask_{i+1}.png{score_str}")
-
-
-# =========================
-# 5. 主流程：prompt + img -> mask
-# =========================
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Qwen3-VL + SAM3: prompt+image -> multi-object masks"
-    )
-    parser.add_argument(
-        "--image-path",
-        type=str,
-        default="sam3/assets/img.jpg",
-        help="输入图片路径",
-    )
-    parser.add_argument(
-        "--output-root",
-        type=str,
-        default="sam3/agent_output_multi",
-        help="SAM3 多物体输出根目录（内部会建 obj_1, obj_2, ...）",
-    )
-    parser.add_argument(
-        "--system-prompt-path",
-        type=str,
-        default="sam3/examples/system_prompt_scene_prompts.txt",
-        help="Qwen 用的 system prompt 文本路径",
-    )
-    parser.add_argument(
-        "--max-prompts",
-        type=int,
-        default=12,
-        help="最多保留多少个物体 prompt",
-    )
-    parser.add_argument(
-        "--skip-first",
-        action="store_true",
-        help="是否丢弃 prompt_list 的第一个元素（如果它更像场景描述而不是具体物体）",
-    )
-    parser.add_argument(
-        "--llm-model-id",
-        type=str,
-        default="sam3/models",
-        help="发送给 LLM 服务的模型名称（需与 vLLM --served-model-name 一致）",
-    )
-
-    args = parser.parse_args()
-
-
-    # 构建 LLM & SAM3
-    llm_config, llm_server_url = build_llm_config(
-        name="qwen3_vl_8b_thinking",
-        model_id=args.llm_model_id,
-    )
-    processor = build_sam3_processor()
-
-    send_generate_request = partial(
-        send_generate_request_orig,
-        server_url=llm_server_url,
-        model=llm_config["model"],
-        api_key=llm_config["api_key"],
-    )
-    call_sam_service = partial(call_sam_service_orig, sam3_processor=processor)
-
-    image = os.path.abspath(args.image_path)
-    output_root = os.path.abspath(args.output_root)
-    os.makedirs(output_root, exist_ok=True)
-
-    # 1) Qwen 生成场景 prompt_list
-    print(f"[INFO] Generating prompts for image: {image}")
-    raw_text, prompt_list = generate_scene_prompts_with_qwen(
-        image_path=image,
-        send_generate_request=send_generate_request,
-        llm_config=llm_config,
-        max_prompts=args.max_prompts,
-        system_prompt_path=args.system_prompt_path,
-    )
-
-    print("\n====== 原始 Qwen 输出（raw_text，截断开头 800 字） ======")
-    print(raw_text[:800])
-    print("......\n")
-
-    if args.skip_first and len(prompt_list) > 1:
-        prompt_list = prompt_list[1:]
-
-    print("====== 解析后的 prompt_list ======")
-    for i, p in enumerate(prompt_list, start=1):
-        print(f"{i}. {p}")
-
-    # 2) 逐个 prompt 调用 SAM3，写入 json
-    for i, prompt in enumerate(prompt_list, start=1):
-        print(f"\n================ [Prompt {i}] {prompt} ================\n")
-
-        this_output_dir = os.path.join(output_root, f"obj_{i}")
-        os.makedirs(this_output_dir, exist_ok=True)
-
-        json_path = call_sam_service(
-            image_path=image,
-            text_prompt=prompt,
-            output_folder_path=this_output_dir,
-        )
-        print(f"[OK] SAM3 output json: {json_path}")
-
-    # 3) 把所有 json 里的 pred_masks 解码为 PNG mask
-    convert_agent_json_to_masks(output_root)
-
-    print("\n✅ All done. Masks are under:")
-    print(f"   {os.path.join(output_root, 'masks')}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pipeline/run_sam3d_multi.py b/pipeline/run_sam3d_multi.py
deleted file mode 100644
index 4ea9f00..0000000
--- a/pipeline/run_sam3d_multi.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import os
-import argparse
-
-import numpy as np
-import torch
-from PIL import Image
-
-from inference import Inference
-
-import re
-
-def clean_name(x: str):
-    return re.sub(r'[^0-9a-zA-Z_]', '', x)
-
-
-
-
-def load_image(path: str) -> Image.Image:
-    img = Image.open(path).convert("RGB")
-    return img
-
-
-def collect_mask_paths(mask_root: str):
-    """
-    递归收集 mask_root 下所有 png/jpg/jpeg 的路径。
-    """
-    all_mask_paths = []
-    for root, _, files in os.walk(mask_root):
-        for f in files:
-            lf = f.lower()
-            if lf.endswith(".png") or lf.endswith(".jpg") or lf.endswith(".jpeg"):
-                all_mask_paths.append(os.path.join(root, f))
-
-    all_mask_paths.sort()
-    print(f"Found {len(all_mask_paths)} mask files under {mask_root}")
-    return all_mask_paths
-
-
-def load_binary_mask(path: str):
-    """
-    单个 mask 文件 → 二值 uint8 数组 (H, W), {0, 1}
-    """
-    m = np.array(Image.open(path).convert("L"))
-    m = (m > 128).astype("uint8")
-    return m
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Run SAM3D multi-object inference and save outputs to .pt"
-    )
-    parser.add_argument(
-        "--image-path",
-        type=str,
-        default="sam3/assets/img.jpg",
-        help="Input image path to lift to 3D.",
-    )
-    parser.add_argument(
-        "--mask-root",
-        type=str,
-        default="sam3/agent_output_multi/masks",
-        help="Directory containing mask PNG/JPGs.",
-    )
-    parser.add_argument(
-        "--save-dir",
-        type=str,
-        default="sam-3d-objects/torch_save_pt",
-        help="Where to save <parent>_<maskname>.pt files.",
-    )
-    parser.add_argument(
-        "--tag",
-        type=str,
-        default="hf",
-        help="Checkpoint tag, corresponds to ../sam-3d-objects/checkpoints/{tag}/pipeline.yaml",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="Random seed passed into Inference.__call__.",
-    )
-    parser.add_argument(
-        "--project-root",
-        type=str,
-        default=None,
-        help=(
-            "Root directory of sam-3d-objects repo. "
-            "If not set, will be inferred as <this_script_dir>/../sam-3d-objects."
-        ),
-    )
-    args = parser.parse_args()
-
-
-
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-
-    if args.project_root is not None:
-        # 如果用户通过命令行显式传入了 --project-root，就直接用它
-        project_root = os.path.abspath(args.project_root)
-    else:
-        # 否则自动推断：假设当前脚本位于 sam3d_gs/pipeline/ 下，
-        # sam-3-objects 位于 sam3d_gs/sam-3-objects
-        project_root = os.path.abspath(os.path.join(script_dir, "..", "sam-3-objects"))
-
-    print(f"Project root (sam-3-objects): {project_root}")
-
-    config_path = os.path.join(project_root, "checkpoints", args.tag, "pipeline.yaml")
-    print(f"Using config: {config_path}")
-    inference = Inference(config_path, compile=False)
-
-    pil_image = load_image(args.image_path)
-    image = np.array(pil_image)
-
-    mask_paths = collect_mask_paths(args.mask_root)
-    if not mask_paths:
-        raise RuntimeError(f"No mask images found under {args.mask_root}")
-
-    os.makedirs(args.save_dir, exist_ok=True)
-
-    for i, mask_path in enumerate(mask_paths):
-        print(f"[{i+1}/{len(mask_paths)}] running inference on mask: {mask_path}")
-
-        mask = load_binary_mask(mask_path)
-
-        out = inference(image, mask, seed=args.seed)
-
-        # 构造保存名字：父目录名 + "_" + mask 文件名（无扩展）
-        parent_name_raw = os.path.basename(os.path.dirname(mask_path))
-        parent_name = clean_name(parent_name_raw)
-        mask_stem_raw = os.path.splitext(os.path.basename(mask_path))[0]
-        mask_stem = clean_name(mask_stem_raw)
-        save_name = f"{parent_name}_{mask_stem}.pt"
-        save_path = os.path.join(args.save_dir, save_name)
-
-        torch.save(out, save_path)
-        print(f"✅ Saved: {save_path}")
-
-        # 显式释放显存
-        del out
-        torch.cuda.empty_cache()
-
-    print("✅ All objects processed and saved as .pt")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pipeline/utils.py b/pipeline/utils.py
new file mode 100644
index 0000000..bf4c986
--- /dev/null
+++ b/pipeline/utils.py
@@ -0,0 +1,200 @@
+import re
+import os
+import atexit
+os.environ["PYOPENGL_PLATFORM"] = "egl"
+from PIL import Image
+import trimesh
+import pyrender
+import numpy as np
+import imageio
+
+
+_DEFAULT_MESH_RENDERERS = {}
+
+
+class MeshRenderContext:
+    """Reusable pyrender offscreen renderer that draws meshes with a fixed grey material.
+
+    Works as a context manager: leaving the ``with`` block calls ``close()``, and a
+    later ``render()`` call creates a fresh renderer again.
+    """
+
+    def __init__(
+        self,
+        width=448,
+        height=448,
+        add_axis=False,
+        debug_depth_path=None,
+        verbose=False,
+    ):
+        self.width, self.height = width, height
+        self.add_axis = add_axis
+        self.debug_depth_path = debug_depth_path
+        self.verbose = verbose
+        self.renderer = pyrender.OffscreenRenderer(width, height)
+        self.material = pyrender.MetallicRoughnessMaterial(
+            baseColorFactor=[0.7, 0.7, 0.7, 1.0],
+            metallicFactor=0.0,
+            roughnessFactor=1.0,
+        )
+        # Negates the Y and Z axes; right-multiplied onto the extrinsics in render().
+        self.cv_to_gl = np.diag([1, -1, -1, 1]).astype(np.float32)
+
+    def close(self):
+        """Delete the offscreen renderer if one is held; calling it again is a no-op."""
+        if self.renderer is None:
+            return
+        self.renderer.delete()
+        self.renderer = None
+
+    def __enter__(self):
+        """Return this context unchanged."""
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        """Release the renderer; exceptions from the ``with`` body are not suppressed."""
+        self.close()
+
+    def render(self, mesh, extrinsics, fov_y):
+        """Render ``mesh`` with the camera posed at ``extrinsics @ cv_to_gl``.
+
+        ``fov_y`` is forwarded as ``yfov`` to ``pyrender.PerspectiveCamera``. When
+        ``debug_depth_path`` is set, a min-max normalised depth image is written there
+        on every call. Returns the ``(color, depth)`` pair produced by the renderer.
+        """
+        if self.renderer is None:
+            # close() was called earlier; build a new renderer so rendering still works.
+            self.renderer = pyrender.OffscreenRenderer(self.width, self.height)
+
+        if self.verbose:
+            vertices = mesh.vertices
+            print(
+                f"vertices shape {vertices.shape} "
+                f"mesh vertices mean {np.mean(vertices, axis=0)}"
+            )
+
+        render_mesh = pyrender.Mesh.from_trimesh(mesh, material=self.material, smooth=False)
+        scene = pyrender.Scene()
+        scene.add(render_mesh)
+
+        camera = pyrender.PerspectiveCamera(yfov=fov_y, aspectRatio=self.width / self.height)
+        camera_pose = extrinsics @ self.cv_to_gl
+        scene.add(camera, pose=camera_pose)
+
+        if self.add_axis:
+            axis = trimesh.creation.axis(axis_length=0.5)
+            scene.add(pyrender.Mesh.from_trimesh(axis, smooth=False))
+
+        # The directional light is given the same pose as the camera.
+        light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0)
+        scene.add(light, pose=camera_pose)
+
+        color, depth = self.renderer.render(scene)
+
+        if self.debug_depth_path:
+            lo, hi = depth.min(), depth.max()
+            if hi - lo > 0:
+                scaled = ((depth - lo) / (hi - lo) * 255).astype(np.uint8)
+            else:
+                scaled = np.zeros_like(depth, dtype=np.uint8)
+            imageio.imwrite(self.debug_depth_path, scaled)
+
+        if self.verbose:
+            # Only pixels with depth > 0 count towards the "valid" mean.
+            positive = depth[depth > 0]
+            valid_mean = positive.mean() if positive.size > 0 else np.nan
+            print(
+                f"max depth {depth.max()}, min depth {depth.min()}, "
+                f"mean depth {depth.mean()}, valid mean depth {valid_mean}"
+            )
+
+        return color, depth
+
+
+def get_default_mesh_renderer(
+    width=448,
+    height=448,
+    add_axis=False,
+    debug_depth_path=None,
+    verbose=False,
+):
+    """Return the module-level cached MeshRenderContext for this option combination.
+
+    One context is kept per distinct (width, height, add_axis, debug_depth_path, verbose).
+    """
+    key = (width, height, add_axis, debug_depth_path, verbose)
+    # Build the context lazily, the first time this combination is requested.
+    if key not in _DEFAULT_MESH_RENDERERS:
+        _DEFAULT_MESH_RENDERERS[key] = MeshRenderContext(
+            width=width, height=height, add_axis=add_axis,
+            debug_depth_path=debug_depth_path, verbose=verbose,
+        )
+    return _DEFAULT_MESH_RENDERERS[key]
+
+
+# Registered with atexit so cached renderers are released when the interpreter exits.
+@atexit.register
+def close_default_mesh_renderers():
+    """Close every cached default MeshRenderContext and empty the cache."""
+    for cached in _DEFAULT_MESH_RENDERERS.values():
+        cached.close()
+    _DEFAULT_MESH_RENDERERS.clear()
+
+
+def clean_name(x: str) -> str:
+    return re.sub(r'[^0-9a-zA-Z_-]', '', x)  # keep only ASCII letters, digits, "_" and "-"
+
+
+def load_image(path: str) -> Image.Image:
+    """Open the image file at ``path`` and return it converted to RGB mode."""
+    return Image.open(path).convert("RGB")
+
+
+def collect_mask_paths(mask_root: str):
+    """Return the sorted paths of all .png / .jpg / .jpeg files found below mask_root."""
+    image_suffixes = (".png", ".jpg", ".jpeg")
+    # Extensions are compared case-insensitively; the walk descends into sub-directories.
+    all_mask_paths = sorted(
+        os.path.join(folder, filename)
+        for folder, _, filenames in os.walk(mask_root)
+        for filename in filenames
+        if filename.lower().endswith(image_suffixes)
+    )
+    print(f"Found {len(all_mask_paths)} mask files under {mask_root}")
+    return all_mask_paths
+
+
+def compute_fov_from_intrinsics(fx, fy, image_size, degrees=True):
+    """Return ``(fov_x, fov_y)`` computed from the pixel-unit focal lengths fx, fy.
+
+    ``image_size`` is unpacked as ``(height, width)``. The angles are in degrees
+    when ``degrees`` is true and in radians otherwise.
+    """
+    height, width = image_size
+    fov_x = 2 * np.arctan(width / (2 * fx))
+    fov_y = 2 * np.arctan(height / (2 * fy))
+    if not degrees:
+        return fov_x, fov_y
+    return np.degrees(fov_x), np.degrees(fov_y)
+
+def mesh_rendering(
+    mesh,
+    extrinsics,
+    fov_y,
+    renderer=None,
+    width=448,
+    height=448,
+    add_axis=False,
+    debug_depth_path=None,
+    verbose=False,
+):
+    """Render ``mesh`` with ``renderer``, or with the cached default one if it is None."""
+    if renderer is not None:
+        # An explicit renderer is used as-is; the remaining options are ignored.
+        return renderer.render(mesh, extrinsics, fov_y)
+    shared = get_default_mesh_renderer(
+        width=width, height=height, add_axis=add_axis,
+        debug_depth_path=debug_depth_path, verbose=verbose,
+    )
+    return shared.render(mesh, extrinsics, fov_y)
+
diff --git a/run_agent_with_vllm.sh b/run_agent_with_vllm.sh
deleted file mode 100644
index 3ed5925..0000000
--- a/run_agent_with_vllm.sh
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-############################################
-# 0. Resolve project root (directory of this script)
-############################################
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-############################################
-# 1. Global config (paths are relative to SCRIPT_DIR)
-############################################
-export HF_ENDPOINT="https://hf-mirror.com"
-
-export HF_HOME="${SCRIPT_DIR}/huggingface"
-export TRANSFORMERS_CACHE="${HF_HOME}"
-export HF_DATASETS_CACHE="${HF_HOME}"
-export HF_HUB_CACHE="${HF_HOME}"
-
-# Path to conda initialization script (usually absolute)
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
-
-# Conda env names
-VLLM_ENV="vllm"
-SAM3_ENV="sam3"
-
-# vLLM model directory (where Qwen3-VL-8B-Thinking will be downloaded)
-VLLM_MODEL_DIR="${SCRIPT_DIR}/models/qwen3_vl_8b_thinking"
-
-# Model name exposed by vLLM and used by the Python script (--llm-model-id)
-SERVED_MODEL_NAME="qwen3-vl-8b-thinking"
-
-# vLLM server port
-VLLM_PORT=8001
-
-# SAM3 agent script (Python entry)
-AGENT_SCRIPT="${SCRIPT_DIR}/pipeline/run_sam3_agent_full.py"
-
-# Input image
-IMAGE_PATH="${SCRIPT_DIR}/assets/img.jpg"
-
-# Output root directory
-OUTPUT_ROOT="${SCRIPT_DIR}/outputs/master_with_vllm"
-
-# System prompt file for Qwen
-SYSTEM_PROMPT_PATH="${SCRIPT_DIR}/assets/system_prompt_scene_prompts.txt"
-
-# vLLM log
-LOG_DIR="${SCRIPT_DIR}/logs"
-mkdir -p "${LOG_DIR}"
-VLLM_LOG="${LOG_DIR}/vllm_server.log"
-
-############################################
-# 2. Initialize conda
-############################################
-if [ -f "${CONDA_SH}" ]; then
-    # Enable `conda activate`
-    # shellcheck disable=SC1090
-    source "${CONDA_SH}"
-else
-    echo "ERROR: conda.sh not found at ${CONDA_SH}"
-    exit 1
-fi
-
-############################################
-# 3. HuggingFace login (interactive, in vLLM env)
-############################################
-echo ">>> Activating conda env: ${VLLM_ENV}"
-conda activate "${VLLM_ENV}"
-
-echo ">>> Running 'hf auth login' (you may be prompted for a token)..."
-hf auth login
-echo ">>> HuggingFace login finished ✓"
-
-############################################
-# 4. Download Qwen3-VL-8B-Thinking if model dir is empty
-############################################
-if [ ! -d "${VLLM_MODEL_DIR}" ] || [ -z "$(ls -A "${VLLM_MODEL_DIR}" 2>/dev/null)" ]; then
-    echo ">>> Model directory is empty: ${VLLM_MODEL_DIR}"
-    echo ">>> Auto-downloading Qwen/Qwen3-VL-8B-Thinking ..."
-
-    mkdir -p "${VLLM_MODEL_DIR}"
-
-    if command -v huggingface-cli >/dev/null 2>&1; then
-        huggingface-cli download \
-            Qwen/Qwen3-VL-8B-Thinking \
-            --local-dir "${VLLM_MODEL_DIR}" \
-            --local-dir-use-symlinks False
-    elif command -v hf >/dev/null 2>&1; then
-        hf snapshot download Qwen/Qwen3-VL-8B-Thinking \
-            --local-dir "${VLLM_MODEL_DIR}" \
-            --local-dir-use-symlinks False
-    else
-        echo "ERROR: Neither 'huggingface-cli' nor 'hf' CLI is installed."
-        echo "Please install with:  pip install -U huggingface_hub"
-        exit 1
-    fi
-
-    echo ">>> Model download complete!"
-else
-    echo ">>> Model already exists at ${VLLM_MODEL_DIR}, skip download."
-fi
-
-############################################
-# 5. Start vLLM server (still in vLLM env)
-############################################
-echo ">>> Starting vLLM server on GPUs 6,7 ..."
-CUDA_VISIBLE_DEVICES=6,7 \
-vllm serve "${VLLM_MODEL_DIR}" \
-    --tensor-parallel-size 2 \
-    --dtype float16 \
-    --gpu-memory-utilization 0.9 \
-    --max-model-len 65536 \
-    --max-num-seqs 4 \
-    --port 8001 \
-    --allowed-local-media-path / \
-    --served-model-name "${SERVED_MODEL_NAME}" \
-    > "${VLLM_LOG}" 2>&1 &
-
-VLLM_PID=$!
-echo ">>> vLLM server started. PID = ${VLLM_PID}"
-echo ">>> Logs: ${VLLM_LOG}"
-
-echo ">>> Waiting for vLLM server to become ready..."
-until curl -s "http://localhost:${VLLM_PORT}/v1/models" > /dev/null; do
-    echo "vLLM not ready yet, waiting 2s..."
-    sleep 2
-done
-echo ">>> vLLM server is ready!"
-
-############################################
-# 6. Run SAM3 agent (in sam3 env)
-############################################
-echo ">>> Activating SAM3 env: ${SAM3_ENV}"
-conda activate "${SAM3_ENV}"
-
-echo ">>> Running SAM3 agent with CUDA_VISIBLE_DEVICES=0 ..."
-CUDA_VISIBLE_DEVICES=0 \
-python "${AGENT_SCRIPT}" \
-    --image-path "${IMAGE_PATH}" \
-    --output-root "${OUTPUT_ROOT}" \
-    --system-prompt-path "${SYSTEM_PROMPT_PATH}" \
-    --llm-model-id "${SERVED_MODEL_NAME}" \
-    --skip-first
-
-echo ">>> SAM3 agent finished."
-
-############################################
-# 7. Done (vLLM is still running)
-############################################
-echo ">>> All done. vLLM is still running with PID = ${VLLM_PID}"
-echo ">>> To stop it manually, run:  kill ${VLLM_PID}"
diff --git a/run_docker.sh b/run_docker.sh
new file mode 100755
index 0000000..e48454e
--- /dev/null
+++ b/run_docker.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Launch sam3d-gs:latest with host checkpoints + data bind-mounted.
+#
+# Usage:
+#   run_docker.sh [PROJECT_DIR] [HF_CACHE_DIR]
+#
+# PROJECT_DIR    Path to the sam3d_gs repo on the host.
+#                Defaults to the directory this script lives in.
+# HF_CACHE_DIR   Path to host HuggingFace cache (so AnySplat and other
+#                HF models are reused across container starts).
+#                Defaults to ${HF_HOME:-$HOME/.cache/huggingface}.
+#
+# Environment overrides:
+#   SAM3D_IMAGE  Docker image to run.  Default: sam3d-gs:latest
+#   TORCH_HOME   Host PyTorch hub cache (DINOv2 etc. land here).
+#                Default: $HOME/.cache/torch
+
+set -euo pipefail
+
+DEFAULT_REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO="${1:-${DEFAULT_REPO}}"
+HF_CACHE="${2:-${HF_HOME:-${HOME}/.cache/huggingface}}"
+TORCH_CACHE="${TORCH_HOME:-${HOME}/.cache/torch}"
+IMAGE="${SAM3D_IMAGE:-sam3d-gs:latest}"
+
+REPO="$(realpath "${REPO}")"
+HF_CACHE="$(realpath -m "${HF_CACHE}")"
+TORCH_CACHE="$(realpath -m "${TORCH_CACHE}")"
+
+# Sanity-check that PROJECT_DIR really looks like the sam3d_gs repo.
+for marker in submodule/Sam-3d-objects submodule/Prompt-Inpaint scripts/install_env.sh; do
+    if [[ ! -e "${REPO}/${marker}" ]]; then
+        echo "ERROR: ${REPO} does not look like a sam3d_gs checkout (missing ${marker})." >&2
+        echo "Pass the project root explicitly: $0 /path/to/sam3d_gs" >&2
+        exit 1
+    fi
+done
+
+# Ensure host-side bind targets exist (Docker would otherwise create them as root).
+mkdir -p \
+    "${REPO}/submodule/Sam-3d-objects/checkpoints" \
+    "${REPO}/submodule/Prompt-Inpaint/checkpoints" \
+    "${REPO}/data" \
+    "${REPO}/example" \
+    "${HF_CACHE}" \
+    "${TORCH_CACHE}"
+
+echo "==> repo:        ${REPO}"
+echo "==> hf cache:    ${HF_CACHE}"
+echo "==> torch cache: ${TORCH_CACHE}"
+echo "==> image:       ${IMAGE}"
+
+docker run --rm -it \
+    --gpus all \
+    --shm-size=8g \
+    --network host \
+    -v "${REPO}/submodule/Sam-3d-objects/checkpoints":/opt/sam3d_gs/submodule/Sam-3d-objects/checkpoints \
+    -v "${REPO}/submodule/Prompt-Inpaint/checkpoints":/opt/sam3d_gs/submodule/Prompt-Inpaint/checkpoints \
+    -v "${HF_CACHE}":/root/.cache/huggingface \
+    -v "${TORCH_CACHE}":/root/.cache/torch \
+    -v "${REPO}/data":/opt/sam3d_gs/data \
+    -v "${REPO}/example":/opt/sam3d_gs/example \
+    "${IMAGE}"
diff --git a/run_object_generation_pipeline.sh b/run_object_generation_pipeline.sh
new file mode 100755
index 0000000..aac03a5
--- /dev/null
+++ b/run_object_generation_pipeline.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+if [[ $# -lt 1 || $# -gt 2 ]]; then
+    echo "Usage: $0 <path_img> [path]" >&2
+    echo "Example: $0 data/new-desk/input_image.png" >&2
+    exit 1
+fi
+
+path_img="$1"
+if [[ $# -eq 2 ]]; then
+    path="$2"
+else
+    path="$(dirname "${path_img}")"   # default: the directory holding the image
+fi
+
+if [[ ! -f "${path_img}" ]]; then
+    echo "Input image not found: ${path_img}" >&2
+    exit 1
+fi
+
+if [[ ! -d "${path}" ]]; then
+    echo "Input directory not found: ${path}" >&2
+    exit 1
+fi
+# Canonicalize only after validating: a failing realpath would abort (set -e) before the messages above.
+path_img="$(realpath "${path_img}")"
+path="$(realpath "${path}")"
+
+source "${SCRIPT_DIR}/.venv/bin/activate"
+# Put the Sam-3d-objects checkout and its notebook/ directory on Python's import path.
+export PYTHONPATH="${SCRIPT_DIR}/submodule/Sam-3d-objects/notebook:${SCRIPT_DIR}/submodule/Sam-3d-objects:${PYTHONPATH:-}"
+
+echo "Python: $(which python)"
+echo "Image: ${path_img}"
+echo "Directory: ${path}"
+
+# Bootstrap gated HuggingFace weights on first run.
+# Both models are gated; the user must have run `hf auth login` and accepted
+# the model agreements for facebook/sam-3d-objects and facebook/sam3.
+SAM3D_PIPELINE_YAML="${SCRIPT_DIR}/submodule/Sam-3d-objects/checkpoints/hf/pipeline.yaml"
+SAM3_WEIGHT="${SCRIPT_DIR}/submodule/Prompt-Inpaint/checkpoints/sam3.pt"
+if [[ ! -f "${SAM3D_PIPELINE_YAML}" || ! -f "${SAM3_WEIGHT}" ]]; then
+    echo "==> One or more gated checkpoints missing locally; running bootstrap..."
+    bash "${SCRIPT_DIR}/scripts/download_checkpoints.sh"
+fi
+
+echo "==> Step 1/3: Prompt-Inpaint"
+python "${SCRIPT_DIR}/submodule/Prompt-Inpaint/main.py" \
+    --resize-output \
+    --save-individual-masks \
+    --config "${SCRIPT_DIR}/submodule/Prompt-Inpaint/configs/items.yml" \
+    --image "${path_img}" \
+    --output-dir "${path}"
+# Steps 2 and 3 are pointed at the same directory that step 1 used as --output-dir.
+echo "==> Step 2/3: AnySplat"
+python "${SCRIPT_DIR}/pipeline/background_reconstruction.py" "${path}"
+
+echo "==> Step 3/3: Object generation"
+python "${SCRIPT_DIR}/pipeline/objects_generation.py" --input-dir "${path}"
+
+echo "Done."
diff --git a/run_pipeline.sh b/run_pipeline.sh
deleted file mode 100644
index 547c65b..0000000
--- a/run_pipeline.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-bash run_agent_with_vllm.sh
-bash run_sam3d_from_masks.sh
\ No newline at end of file
diff --git a/run_sam3d_from_masks.sh b/run_sam3d_from_masks.sh
deleted file mode 100644
index 924d1f5..0000000
--- a/run_sam3d_from_masks.sh
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/usr/bin/env bash
-# 不要开 -u，会和 conda activate 脚本打架
-set -eo pipefail
-
-############################################
-# 0. Resolve project root (directory of this script)
-############################################
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# 防止 conda activate 的 binutils 脚本里引用未定义 ADDR2LINE
-export ADDR2LINE=addr2line
-
-############################################
-# 1. Global config (all paths relative to SCRIPT_DIR)
-############################################
-
-# GPU used for SAM3D reconstruction
-export CUDA_VISIBLE_DEVICES="0"
-
-# HF / Torch cache (和 run_agent_with_vllm.sh 共用一套)
-export HF_ENDPOINT="https://hf-mirror.com"
-export HF_HOME="${SCRIPT_DIR}/huggingface"
-export TRANSFORMERS_CACHE="${HF_HOME}"
-export HF_DATASETS_CACHE="${HF_HOME}"
-export HF_HUB_CACHE="${HF_HOME}"
-export HF_HUB_ENABLE_HF_TRANSFER=0
-
-export TORCH_HOME="${SCRIPT_DIR}/torch_hub"
-export TORCH_HUB="${SCRIPT_DIR}/torch_hub"
-
-# Conda init script (absolute)
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
-
-# Conda env for SAM3D
-SAM3D_ENV="sam3d-objects"
-
-# sam-3d-objects repo root
-PROJECT_ROOT="${SCRIPT_DIR}/sam-3d-objects"
-
-# Where sam-3-objects stores intermediate .pt
-PT_SAVE_DIR="${PROJECT_ROOT}/outputs/torch_save_pt"
-
-# Checkpoints / config paths
-CHECKPOINTS_DIR="${PROJECT_ROOT}/checkpoints"
-PIPELINE_YAML="${CHECKPOINTS_DIR}/hf/pipeline.yaml"
-
-# Python entry scripts (放在 sam3d_gs/pipeline 下)
-SAM3D_MULTI_SCRIPT="${SCRIPT_DIR}/pipeline/run_sam3d_multi.py"
-RECONSTRUCT_SCRIPT="${SCRIPT_DIR}/pipeline/reconstruct_from_pt.py"
-
-# Input image: 使用和 SAM3 agent 一样的图
-IMAGE_PATH="${SCRIPT_DIR}/assets/img.jpg"
-
-# 🔴 关键：mask-root = SAM3 agent 的 mask 输出目录
-# 如果你的 run_sam3_agent_full.py 把 mask 写在：
-#   outputs/master_with_vllm/masks
-# 就用这一行：
-MASK_ROOT="${SCRIPT_DIR}/outputs/master_with_vllm/masks"
-# 如果暂时还用旧目录，比如 sam3/agent_output_multi/masks，可以改成：
-# MASK_ROOT="${SCRIPT_DIR}/sam3/agent_output_multi/masks"
-
-# Run configs
-TAG="hf"
-SEED=42
-EXPORT_GIF=1   # 1 = reconstruct 时加 --export-gif，0 = 不导出 GIF
-
-############################################
-# 2. Initialize conda
-############################################
-if [ -f "${CONDA_SH}" ]; then
-    # shellcheck disable=SC1090
-    source "${CONDA_SH}"
-else
-    echo "ERROR: conda.sh not found at ${CONDA_SH}"
-    exit 1
-fi
-
-echo ">>> Activating conda env: ${SAM3D_ENV}"
-conda activate "${SAM3D_ENV}"
-
-mkdir -p "${PT_SAVE_DIR}"
-
-############################################
-# 2.5. Ensure checkpoints/${TAG}/pipeline.yaml
-############################################
-if [ ! -f "${PIPELINE_YAML}" ]; then
-    echo ">>> pipeline.yaml not found at: ${PIPELINE_YAML}"
-    echo ">>> Downloading checkpoints from facebook/sam-3d-objects ..."
-    echo ">>> (确保已运行 'hf auth login' 并在网页上接受模型协议)"
-
-    # 关闭 hf_transfer（在镜像环境下容易出奇怪错误）
-    export HF_HUB_ENABLE_HF_TRANSFER=0
-
-    # 临时下载目录（避免直接弄脏 sam-3d-objects 根目录）
-    TMP_DIR="${CHECKPOINTS_DIR}/.tmp_download_${TAG}"
-    rm -rf "${TMP_DIR}"
-    mkdir -p "${TMP_DIR}"
-
-    # 1) 把远端的 checkpoints/** 全部下载到临时目录
-    if command -v huggingface-cli >/dev/null 2>&1; then
-        huggingface-cli download \
-            facebook/sam-3d-objects \
-            --local-dir "${TMP_DIR}" \
-            --local-dir-use-symlinks False \
-            --include "checkpoints/**"
-    elif command -v hf >/dev/null 2>&1; then
-        hf snapshot download \
-            facebook/sam-3d-objects \
-            --local-dir "${TMP_DIR}" \
-            --local-dir-use-symlinks False \
-            --include "checkpoints/**"
-    else
-        echo "ERROR: neither 'huggingface-cli' nor 'hf' CLI is installed."
-        echo "       Try: pip install -U huggingface_hub"
-        rm -rf "${TMP_DIR}"
-        exit 1
-    fi
-
-    # 2) 远端结构：TMP_DIR/checkpoints/...
-    #    本地目标：CHECKPOINTS_DIR/TAG/...
-    mkdir -p "${CHECKPOINTS_DIR}/${TAG}"
-
-    if [ -d "${TMP_DIR}/checkpoints" ]; then
-        echo ">>> Moving downloaded checkpoints into checkpoints/${TAG} ..."
-        # 把 checkpoints/* 都移到 checkpoints/hf/
-        mv "${TMP_DIR}/checkpoints/"* "${CHECKPOINTS_DIR}/${TAG}/"
-    else
-        echo "ERROR: Expected ${TMP_DIR}/checkpoints directory, but not found."
-        rm -rf "${TMP_DIR}"
-        exit 1
-    fi
-
-    # 清理临时目录
-    rm -rf "${TMP_DIR}"
-
-    echo ">>> Checkpoints downloaded → ${CHECKPOINTS_DIR}/${TAG}"
-    echo ">>> Expected config at: ${PIPELINE_YAML}"
-else
-    echo ">>> Found existing pipeline config: ${PIPELINE_YAML}"
-fi
-
-
-# 确保 sam-3-objects/notebook 在 PYTHONPATH 里，供 inference 等模块 import
-export PYTHONPATH="${PROJECT_ROOT}/notebook:${PYTHONPATH:-}"
-
-############################################
-# 3. Step 1 – run SAM3D multi-object & save .pt
-############################################
-echo "=== [SAM3D] Step 1: run multi-object reconstruction & save .pt ==="
-python "${SAM3D_MULTI_SCRIPT}" \
-  --image-path "${IMAGE_PATH}" \
-  --mask-root "${MASK_ROOT}" \
-  --save-dir "${PT_SAVE_DIR}" \
-  --tag "${TAG}" \
-  --seed "${SEED}" \
-  --project-root "${PROJECT_ROOT}"
-
-############################################
-# 4. Step 2 – reconstruct from .pt to .ply (and optional .gif)
-############################################
-echo "=== [SAM3D] Step 2: reconstruct from .pt to .ply ==="
-
-RECONSTRUCT_CMD=(
-  python "${RECONSTRUCT_SCRIPT}"
-  --project-root "${PROJECT_ROOT}"
-  --save-dir "${PT_SAVE_DIR}"
-  --image-path "${IMAGE_PATH}"
-)
-
-if [ "${EXPORT_GIF}" -eq 1 ]; then
-  RECONSTRUCT_CMD+=(--export-gif)
-fi
-
-"${RECONSTRUCT_CMD[@]}"
-
-echo "✅ Pipeline finished. Check ${PROJECT_ROOT}/gaussians/multi 下的 .ply/.gif 文件"
diff --git a/sam-3d-objects b/sam-3d-objects
deleted file mode 160000
index cf06676..0000000
--- a/sam-3d-objects
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit cf066761706cd02b07e2fc6274570ec8cdafb683
diff --git a/sam3 b/sam3
deleted file mode 160000
index 2d1cbae..0000000
--- a/sam3
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 2d1cbaeac7b52ca64baf61e58973d0940ae843d0
diff --git a/scripts/download_checkpoints.sh b/scripts/download_checkpoints.sh
new file mode 100755
index 0000000..285b4e8
--- /dev/null
+++ b/scripts/download_checkpoints.sh
@@ -0,0 +1,226 @@
+#!/usr/bin/env bash
+# Bootstrap gated HuggingFace checkpoints needed by the pipeline.
+#
+# This script pre-fetches three models (the first two into project-local folders):
+#
+#   1. facebook/sam-3d-objects
+#      The SAM-3D-Objects codepath expects a Hydra config tree at
+#        submodule/Sam-3d-objects/checkpoints/<tag>/pipeline.yaml
+#      which is NOT fetched by `from_pretrained`.
+#
+#   2. facebook/sam3
+#      Prompt-Inpaint's _resolve_checkpoint() will fall back to a HuggingFace
+#      auto-download, but pulling the 3.3 GB sam3.pt into the local
+#      `submodule/Prompt-Inpaint/checkpoints/` keeps the weights co-located
+#      with the project and survives `~/.cache` cleanups.
+#
+#   3. lhjiang/anysplat
+#      AnySplat.from_pretrained reads from the HuggingFace hub cache
+#      (~/.cache/huggingface/hub/). Pre-fetching avoids a multi-GB download
+#      on the first pipeline run inside an ephemeral container.
+#
+# The script is idempotent: existing target files are skipped unless --force.
+#
+# Usage:
+#   bash scripts/download_checkpoints.sh [options]
+#
+# Options:
+#   --tag TAG       Sub-directory under submodule/Sam-3d-objects/checkpoints/
+#                   for the SAM-3D-Objects bundle. Default: hf
+#   --skip-sam3d    Do not download the SAM-3D-Objects bundle.
+#   --skip-sam3     Do not download the SAM3 weight (sam3.pt).
+#   --skip-anysplat Do not pre-fetch the AnySplat weights into the HF cache.
+#   --force         Re-download even if the target files already exist.
+#   -h, --help      Show this help.
+#
+# Environment overrides:
+#   SAM3D_CHECKPOINT_TAG    Same as --tag
+#   SAM3D_MODEL_ID          SAM-3D-Objects repo id (default: facebook/sam-3d-objects)
+#   SAM3_MODEL_ID           SAM3 repo id           (default: facebook/sam3)
+#   SAM3_WEIGHT_FILENAME    SAM3 weight file name  (default: sam3.pt)
+#   ANYSPLAT_MODEL_ID       AnySplat repo id       (default: lhjiang/anysplat)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+# Model ids and tag: a value already set in the environment wins over the default.
+TAG="${SAM3D_CHECKPOINT_TAG:-hf}"
+SAM3D_MODEL_ID="${SAM3D_MODEL_ID:-facebook/sam-3d-objects}"
+SAM3_MODEL_ID="${SAM3_MODEL_ID:-facebook/sam3}"
+SAM3_WEIGHT_FILENAME="${SAM3_WEIGHT_FILENAME:-sam3.pt}"
+ANYSPLAT_MODEL_ID="${ANYSPLAT_MODEL_ID:-lhjiang/anysplat}"
+SKIP_SAM3D=0      # set to 1 by --skip-sam3d
+SKIP_SAM3=0       # set to 1 by --skip-sam3
+SKIP_ANYSPLAT=0   # set to 1 by --skip-anysplat
+FORCE=0           # set to 1 by --force
+
+usage() {  # Print this file's header comment (lines 2-42) with the comment marker stripped.
+    sed -n -e '2,42{s/^# //;s/^#$//;p;}' "${BASH_SOURCE[0]}"
+}
+
+while [[ $# -gt 0 ]]; do  # consume one option (plus its value, if it takes one) per pass
+    case "$1" in
+        --tag)
+            TAG="${2:?--tag requires a value}"  # clear error instead of "unbound variable"
+            shift 2
+            ;;
+        --skip-sam3d)
+            SKIP_SAM3D=1
+            shift
+            ;;
+        --skip-sam3)
+            SKIP_SAM3=1
+            shift
+            ;;
+        --skip-anysplat)
+            SKIP_ANYSPLAT=1
+            shift
+            ;;
+        --force)
+            FORCE=1
+            shift
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        *)  # anything unrecognized is fatal: report it, show the help, exit 1
+            echo "Unknown option: $1" >&2
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+require_hf_cli() {
+    # Accept either CLI: older huggingface_hub releases lack 'hf' but ship 'huggingface-cli download'.
+    if command -v hf >/dev/null 2>&1; then return 0; fi
+    if command -v huggingface-cli >/dev/null 2>&1; then
+        hf() { huggingface-cli "$@"; }  # shim so this script's 'hf download' calls keep working
+        return 0
+    fi
+    echo "ERROR: neither 'hf' nor 'huggingface-cli' was found; install huggingface_hub in this environment." >&2
+    echo "       Also accept the model agreements on huggingface.co and log in (hf auth login)." >&2
+    exit 1
+}
+
+# hf_transfer occasionally trips on mirrored networks; disable it for safety.
+export HF_HUB_ENABLE_HF_TRANSFER=0
+
+# Download the checkpoints/ tree of SAM3D_MODEL_ID and install it as checkpoints/<TAG>/.
+download_sam3d_objects() {
+    local checkpoints_dir="${PROJECT_ROOT}/submodule/Sam-3d-objects/checkpoints"
+    local target_dir="${checkpoints_dir}/${TAG}"
+    local pipeline_yaml="${target_dir}/pipeline.yaml"
+    # pipeline.yaml serves as the "already installed" marker unless --force was given.
+    if [[ -f "${pipeline_yaml}" && "${FORCE}" -eq 0 ]]; then
+        echo "==> [sam-3d-objects] already present: ${pipeline_yaml}"
+        return 0
+    fi
+
+    require_hf_cli
+    echo "==> [sam-3d-objects] downloading ${SAM3D_MODEL_ID} into ${target_dir}"
+    # Stage the download in a scratch directory next to the target, starting from empty.
+    local tmp_dir="${checkpoints_dir}/.tmp_download_${TAG}"
+    rm -rf "${tmp_dir}"
+    mkdir -p "${tmp_dir}"
+
+    # Remove the scratch directory on any early exit (failed download, the `exit 1`
+    # paths below; intended to cover Ctrl-C too). The trap is cleared on success.
+    trap 'rm -rf "${tmp_dir}"' EXIT
+
+    hf download "${SAM3D_MODEL_ID}" \
+        --local-dir "${tmp_dir}" \
+        --include "checkpoints/**"
+
+    if [[ ! -d "${tmp_dir}/checkpoints" ]]; then
+        echo "ERROR: expected ${tmp_dir}/checkpoints after download." >&2
+        exit 1
+    fi
+
+    mkdir -p "${target_dir}"
+    shopt -s dotglob  # let * also match dotfiles for the move below
+    mv "${tmp_dir}/checkpoints/"* "${target_dir}/"
+    shopt -u dotglob
+
+    if [[ ! -f "${pipeline_yaml}" ]]; then
+        echo "ERROR: pipeline.yaml missing after move: ${pipeline_yaml}" >&2
+        exit 1
+    fi
+
+    rm -rf "${tmp_dir}"
+    trap - EXIT
+
+    echo "==> [sam-3d-objects] done: ${target_dir}"
+}
+
+# Fetch the single SAM3 weight file into submodule/Prompt-Inpaint/checkpoints/.
+download_sam3() {
+    local target_dir="${PROJECT_ROOT}/submodule/Prompt-Inpaint/checkpoints"
+    local target_file="${target_dir}/${SAM3_WEIGHT_FILENAME}"
+    # An existing weight file short-circuits the download unless --force was given.
+    if [[ -f "${target_file}" && "${FORCE}" -eq 0 ]]; then
+        echo "==> [sam3] already present: ${target_file}"
+        return 0
+    fi
+
+    require_hf_cli
+    echo "==> [sam3] downloading ${SAM3_MODEL_ID}/${SAM3_WEIGHT_FILENAME} into ${target_dir}"
+    # Request only the one weight file; it is expected at ${target_file} afterwards.
+    mkdir -p "${target_dir}"
+    hf download "${SAM3_MODEL_ID}" "${SAM3_WEIGHT_FILENAME}" \
+        --local-dir "${target_dir}"
+
+    if [[ ! -f "${target_file}" ]]; then
+        echo "ERROR: ${target_file} missing after download." >&2
+        exit 1
+    fi
+
+    echo "==> [sam3] done: ${target_file}"
+}
+
+# Pre-fetch the AnySplat weights into the HuggingFace hub cache; skipped when a snapshot exists.
+download_anysplat() {
+    # AnySplat.from_pretrained looks up the model in the HuggingFace hub
+    # cache, so we leave files under the standard cache layout (no
+    # --local-dir). The hub cache is HF_HUB_CACHE if set, otherwise
+    # <HF_HOME or ~/.cache/huggingface>/hub.
+    local hf_root="${HF_HOME:-${HOME}/.cache/huggingface}"
+    # HF cache layout: <hub cache>/models--<org>--<name>/snapshots/<rev>/...
+    local hub_dirname="models--${ANYSPLAT_MODEL_ID//\//--}"
+    local snapshots_dir="${HF_HUB_CACHE:-${hf_root}/hub}/${hub_dirname}/snapshots"
+
+    if [[ -d "${snapshots_dir}" ]] && \
+       [[ -n "$(ls -A "${snapshots_dir}" 2>/dev/null)" ]] && \
+       [[ "${FORCE}" -eq 0 ]]; then
+        echo "==> [anysplat] already present in HF cache: ${snapshots_dir}"
+        return 0
+    fi
+
+    require_hf_cli
+    echo "==> [anysplat] downloading ${ANYSPLAT_MODEL_ID} into HF cache (${HF_HUB_CACHE:-${hf_root}/hub})"
+    hf download "${ANYSPLAT_MODEL_ID}"
+    echo "==> [anysplat] done."
+}
+
+# Run each download step unless its --skip-* flag switched it off.
+if (( SKIP_SAM3D )); then
+    echo "==> [sam-3d-objects] skipped (--skip-sam3d)"
+else
+    download_sam3d_objects
+fi
+
+if (( SKIP_SAM3 )); then
+    echo "==> [sam3] skipped (--skip-sam3)"
+else
+    download_sam3
+fi
+
+if (( SKIP_ANYSPLAT )); then
+    echo "==> [anysplat] skipped (--skip-anysplat)"
+else
+    download_anysplat
+fi
+
+echo "==> All requested checkpoints are in place."
diff --git a/scripts/install_env.sh b/scripts/install_env.sh
new file mode 100755
index 0000000..e2e699b
--- /dev/null
+++ b/scripts/install_env.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+# Pinned versions; the wheel index and the Kaolin find-links page both target torch 2.7.0 / cu128.
+PYTHON_VERSION="3.11"  # overridable with --python
+TORCH_VERSION="2.7.0"
+TORCHVISION_VERSION="0.22.0"
+TORCHAUDIO_VERSION="2.7.0"
+PYTORCH_INDEX_URL="https://download.pytorch.org/whl/cu128"
+KAOLIN_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html"
+# Step toggles; each one is set to 0 by its matching --skip-* option.
+INSTALL_TORCH=1
+UPDATE_SUBMODULES=1
+COMPILE_CUROPE=1
+
+usage() {  # Print the command-line help text to stdout.
+    cat <<'EOF'
+Usage: bash scripts/install_env.sh [options]
+
+Options:
+  --python VERSION        Python version for uv venv. Default: 3.11
+  --skip-torch           Do not install torch/torchvision/torchaudio.
+  --skip-submodules      Do not run git submodule update --init --recursive.
+  --skip-curope          Do NOT patch+compile AnySplat curope CUDA extension
+                         (compiled by default; without it AnySplat falls back
+                         to a slower PyTorch RoPE2D implementation).
+  -h, --help             Show this help.
+
+Examples:
+  bash scripts/install_env.sh
+  bash scripts/install_env.sh --skip-torch
+  bash scripts/install_env.sh --skip-curope
+EOF
+}
+
+while [[ $# -gt 0 ]]; do  # consume one option (plus its value, if it takes one) per pass
+    case "$1" in
+        --python)
+            PYTHON_VERSION="${2:?--python requires a value}"  # clear error instead of "unbound variable"
+            shift 2
+            ;;
+        --skip-torch)
+            INSTALL_TORCH=0
+            shift
+            ;;
+        --skip-submodules)
+            UPDATE_SUBMODULES=0
+            shift
+            ;;
+        --skip-curope)
+            COMPILE_CUROPE=0
+            shift
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        *)  # anything unrecognized is fatal: report it, show the help, exit 1
+            echo "Unknown option: $1" >&2
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+cd "${PROJECT_ROOT}"
+# Relative paths used from here on (.venv, submodule/...) are resolved from the project root.
+echo "==> Project root: ${PROJECT_ROOT}"
+# Bootstrap uv through pip when it is not already on PATH.
+if ! command -v uv >/dev/null 2>&1; then
+    echo "==> uv not found. Installing uv with pip..."
+    python3 -m pip install -U uv
+fi
+
+if [[ "${UPDATE_SUBMODULES}" -eq 1 ]]; then
+    echo "==> Updating git submodules..."
+    git submodule update --init --recursive
+fi
+
+echo "==> Creating/updating .venv with Python ${PYTHON_VERSION}..."
+uv venv --python "${PYTHON_VERSION}" .venv
+
+# shellcheck disable=SC1091
+source "${PROJECT_ROOT}/.venv/bin/activate"
+# NOTE(review): uv documents UV_FIND_LINKS; confirm PIP_FIND_LINKS is honored by the uv calls in this script.
+export PYTHONPATH="${PROJECT_ROOT}/submodule/Sam-3d-objects/notebook:${PROJECT_ROOT}/submodule/Sam-3d-objects:${PYTHONPATH:-}"
+export PIP_FIND_LINKS="${KAOLIN_FIND_LINKS}"
+
+echo "==> Python: $(which python)"
+python --version
+
+if [[ "${INSTALL_TORCH}" -eq 1 ]]; then
+    echo "==> Installing PyTorch ${TORCH_VERSION} from ${PYTORCH_INDEX_URL}..."
+    uv pip install \
+        "torch==${TORCH_VERSION}" \
+        "torchvision==${TORCHVISION_VERSION}" \
+        "torchaudio==${TORCHAUDIO_VERSION}" \
+        --index-url "${PYTORCH_INDEX_URL}"
+else
+    echo "==> Skipping PyTorch install."
+fi
+# AnySplat's requirements are installed without build isolation (builds use the active .venv).
+echo "==> Installing AnySplat requirements..."
+uv pip install -r submodule/AnySplat/requirements.txt --no-build-isolation
+
+echo "==> Installing SAM-3D-Objects build helpers..."
+uv pip install hatch-requirements-txt editables wheel
+# The three extras are installed one at a time; only [p3d] and [inference] skip build isolation.
+echo "==> Installing SAM-3D-Objects extras..."
+uv pip install -e './submodule/Sam-3d-objects[dev]'
+uv pip install -e './submodule/Sam-3d-objects[p3d]' --no-build-isolation
+uv pip install -e './submodule/Sam-3d-objects[inference]' \
+    --no-build-isolation \
+    --find-links "${KAOLIN_FIND_LINKS}"
+
+echo "==> Installing project-level runtime dependencies..."
+# Do NOT use -U here: that would let uv upgrade transitive deps (notably
+# torch, via iopaint) and clobber the CUDA-pinned torch installed above.
+uv pip install --index-strategy unsafe-best-match \
+    "transformers==4.48.3" \
+    "iopaint>=1.2.0" \
+    "diffusers>=0.27.2" \
+    "numpy<2.0" \
+    "opencv-python>=4.8.0" \
+    "pyyaml>=6.0" \
+    "requests>=2.31.0" \
+    "tqdm>=4.66.0" \
+    "setuptools" \
+    "einops"
+
+echo "==> Installing SAM3..."
+uv pip install --index-strategy unsafe-best-match \
+    "git+https://github.com/facebookresearch/sam3.git"
+
+# Optional mesh2mjcf extras (installed by default so `-cd` / `--verbose` Just
+# Work; `trimesh` is also used for multi-material OBJ splitting).
+echo "==> Installing mesh2mjcf extras (coacd, trimesh, mujoco)..."
+uv pip install --index-strategy unsafe-best-match \
+    "coacd" \
+    "trimesh" \
+    "mujoco"
+
+# Pin huggingface_hub to 0.25.2 after every other install: diffusers 0.27.2 (and
+# the iopaint stack on top of it) still imports `cached_download` from
+# huggingface_hub, which was removed in hub >= 0.26. Upstream Sam-3d-objects /
+# iopaint / SAM3 installs may pull in a newer hub transitively, so we force-reinstall
+# last (with --no-deps so it can downgrade without uv complaining) and lock
+# the exact version that was empirically verified to work.
+#
+# Note: transformers above is pinned to ==4.48.3 (not >=) because transformers
+# 5.x imports `is_offline_mode` from huggingface_hub, which doesn't exist in
+# 0.25.2 — using a floor here lets pip resolve to 5.x and breaks iopaint at
+# runtime even though hub stays pinned.
+echo "==> Pinning huggingface_hub==0.25.2 (force-reinstall, no-deps)..."
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+    "huggingface_hub==0.25.2"
+
+if [[ "${COMPILE_CUROPE}" -eq 1 ]]; then
+    CUROPE_DIR="${PROJECT_ROOT}/submodule/AnySplat/src/model/encoder/backbone/croco/curope"
+    KERNELS_CU="${CUROPE_DIR}/kernels.cu"
+    # Fail fast when the AnySplat checkout does not contain the CUDA source.
+    if [[ ! -f "${KERNELS_CU}" ]]; then
+        echo "ERROR: kernels.cu not found: ${KERNELS_CU}" >&2
+        exit 1
+    fi
+    # Swap tokens.type() for tokens.scalar_type() in the dispatch macro (presumably for newer PyTorch; confirm).
+    echo "==> Patching AnySplat curope kernels.cu..."
+    python - "${KERNELS_CU}" <<'PY'
+from pathlib import Path
+import sys
+
+path = Path(sys.argv[1])
+text = path.read_text()
+patched = text.replace(
+    'AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {',
+    'AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {',
+)
+if patched != text:
+    path.write_text(patched)
+    print(f"patched {path}")
+else:
+    print(f"no patch needed for {path}")
+PY
+    # Build inside a subshell so the cd does not change the script's working directory.
+    echo "==> Building AnySplat curope extension..."
+    (
+        cd "${CUROPE_DIR}"
+        python setup.py build_ext --inplace
+    )
+fi
+
+cat <<EOF
+
+==> Install finished.
+
+Next steps:
+  source .venv/bin/activate
+  export PYTHONPATH="${PROJECT_ROOT}/submodule/Sam-3d-objects/notebook:${PROJECT_ROOT}/submodule/Sam-3d-objects:\${PYTHONPATH:-}"
+
+If you use gated HuggingFace models, run:
+  huggingface-cli login
+EOF
diff --git a/submodule/AnySplat b/submodule/AnySplat
new file mode 160000
index 0000000..d29bc6a
--- /dev/null
+++ b/submodule/AnySplat
@@ -0,0 +1 @@
+Subproject commit d29bc6adf82c953f1fd337d8d0ba6259d906b2c9
diff --git a/submodule/Prompt-Inpaint b/submodule/Prompt-Inpaint
new file mode 160000
index 0000000..0dffc4b
--- /dev/null
+++ b/submodule/Prompt-Inpaint
@@ -0,0 +1 @@
+Subproject commit 0dffc4b50c33509d80135159b2b031d94e272e6e
diff --git a/submodule/Sam-3d-objects b/submodule/Sam-3d-objects
new file mode 160000
index 0000000..d4b6362
--- /dev/null
+++ b/submodule/Sam-3d-objects
@@ -0,0 +1 @@
+Subproject commit d4b63627dc2a7ae0a175be482942e6f32633ff55