diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b97641a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,195 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+/datasets
+/dataset_cache
+
+# Outputs
+/outputs
+/lightning_logs
+/checkpoints
+
+.bashrc
+/launcher_venv
+/slurm_logs
+*.torch
+*.ckpt
+table.tex
+/baselines
+/test/*
+
+wandb/
+output*
+results*
+
+*.ply
+*.mp4
+!assets/pipeline.jpg
+!examples/video/*.mp4
+
+src/loss/depth_anything/*
+
+.vscode/
+.gradio/
+note.txt
+anysplat_ckpt*
+input_images_*
+tmp_scripts/
diff --git a/.gitmodules b/.gitmodules
index 9a9af42..ed07a66 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,9 @@
-[submodule "sam-3d-objects"]
- path = sam-3d-objects
- url = https://github.com/facebookresearch/sam-3d-objects.git
-[submodule "sam3"]
- path = sam3
- url = https://github.com/facebookresearch/sam3.git
+[submodule "Sam-3d-objects"]
+ path = submodule/Sam-3d-objects
+ url = https://github.com/Yuchi-Zhang-00/sam-3d-objects.git
+[submodule "AnySplat"]
+ path = submodule/AnySplat
+ url = https://github.com/Yuchi-Zhang-00/AnySplat.git
+[submodule "Prompt-Inpaint"]
+ path = submodule/Prompt-Inpaint
+ url = https://github.com/MrZoyo/Prompt-Inpaint.git
diff --git a/README.md b/README.md
index 424ff4a..bb765b4 100644
--- a/README.md
+++ b/README.md
@@ -4,257 +4,478 @@
-# **Unified Multi-Stage 2D→3D Perception Pipeline**
+# **Unified 2D Single-Image → 3D Object Generation Pipeline**
-## *vLLM × SAM3 × SAM-3D-Objects Integration*
+## *Prompt-Inpaint × AnySplat × SAM-3D-Objects Integration*
+
+> This repo was originally forked from [xyys2003/sam3d_gs](https://github.com/xyys2003/sam3d_gs).
------
## **Abstract**
-This repository presents a unified and modular pipeline that couples large-scale vision–language reasoning, high-fidelity 2D segmentation, and multi-object 3D Gaussian splatting. It integrates three independent systems—**vLLM** (for Qwen3-VL inference), **SAM3** (for multi-object 2D segmentation), and **SAM-3D-Objects** (for 3D reconstruction from RGB + masks)—into a complete, end-to-end workflow. To ensure reproducibility, each module runs inside its own Conda environment. The pipeline supports both staged execution and a fully automated one-click execution, with built-in HuggingFace authentication, checkpoint management, and environment initialization.
+This repository packages a single-image 2D → 3D object reconstruction pipeline by composing three open-source systems behind one entry script:
-------
+- **Prompt-Inpaint** — text-prompted multi-object segmentation (built on SAM3) plus background inpainting, producing per-object masks and a clean background image.
+- **AnySplat** — feed-forward 3D Gaussian Splatting from a single image, plus a RANSAC-based table-alignment pass that brings the scene into a MuJoCo-friendly world frame.
+- **SAM-3D-Objects** — per-object mesh and Gaussian reconstruction from RGB + mask.
-# **1. Repository Setup**
+The three components are wired together through scripts under `pipeline/` and a single uv-managed virtual environment, so the whole pipeline runs from one shell command.
-```
-git clone --recursive https://github.com/xyys2003/sam3d_gs.git
-cd sam3d_gs
-```
+------
-If cloned without submodules:
+# **1. Repository Layout**
```
-git submodule update --init --recursive
+.
+├── run_object_generation_pipeline.sh # one-shot entry: image → 3D assets
+├── pipeline/
+│ ├── background_reconstruction.py # AnySplat + table RANSAC alignment
+│ ├── objects_generation.py # SAM-3D-Objects multi-object reconstruction
+│ ├── mesh2mjcf.py # optional: convert per-object .obj → MuJoCo MJCF
+│ └── utils.py # shared rendering / IO helpers
+└── submodule/
+ ├── Prompt-Inpaint/ # SAM3 segmentation + inpainting
+ ├── AnySplat/ # single-image 3DGS reconstruction
+ └── Sam-3d-objects/ # per-object mesh / GS reconstruction
```
------
-# **2. Conda Environments**
+# **2. Setup**
-| Environment | Purpose | Path |
-| --------------- | ---------------------------------------- | ----------------- |
-| `vllm` | Serve Qwen3-VL-8B-Thinking via vLLM | — |
-| `sam3` | Multi-object segmentation (SAM3) | `sam3/` |
-| `sam3d-objects` | RGB + masks → 3D Gaussian reconstruction | `sam-3d-objects/` |
+The project runs inside a single `uv`-managed virtual environment (`.venv/`). The setup below targets RTX 50-series GPUs (CUDA 12.8, PyTorch 2.7) and is also verified to work on 3090 / 4090.
-------
+> **Hardware**: an NVIDIA GPU with **≥ 24 GB VRAM** is recommended. The pipeline loads SAM3, AnySplat, and SAM-3D-Objects sequentially and the SAM-3D-Objects stage in particular is memory-hungry.
-# **3. vLLM Environment (Qwen3-VL Server)**
+## **2.1 Clone with submodules**
-```
-conda create -n vllm python=3.10 -y
-conda activate vllm
+```bash
+git clone --recursive https://github.com/Yuchi-Zhang-00/sam3d_gs.git
+cd sam3d_gs
```
-Install PyTorch (CUDA 12.x):
+If the submodules were not initialized at clone time:
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
- --index-url https://download.pytorch.org/whl/cu124
+```bash
+git submodule update --init --recursive
```
-Install vLLM:
+## **2.2 Install the Python environment**
-```
-pip install vllm --extra-index-url https://download.pytorch.org/whl/cu124
-pip install transformers tiktoken sentencepiece xformers flashinfer-python
-pip install huggingface_hub
+The recommended path is the bundled one-command installer:
+
+```bash
+bash scripts/install_env.sh
```
-------
+It creates `.venv`, installs PyTorch for CUDA 12.8, the submodule dependencies, and the project-level runtime dependencies.
-# **4. SAM3 Environment**
+If you would rather run each step yourself, see [`install.md`](install.md). It also documents the small SAM-3D-Objects requirements-file patches and the AnySplat `kernels.cu` fix used to build the CUDA RoPE2D kernel.
-Reference implementation:
- 🔗 https://github.com/facebookresearch/sam3
- 🔗 https://huggingface.co/facebook/sam3
+## **2.3 HuggingFace access**
-```
-cd sam3
-conda create -n sam3 python=3.10 -y
-conda activate sam3
-```
+The pipeline pulls three models from HuggingFace:
-Install SAM3:
+| Model | Used by | Access |
+| --- | --- | --- |
+| [`facebook/sam3`](https://huggingface.co/facebook/sam3) | Prompt-Inpaint (Stage 1) | **Gated** — request access on the model page |
+| [`facebook/sam-3d-objects`](https://huggingface.co/facebook/sam-3d-objects) | SAM-3D-Objects (Stage 3) | **Gated** — request access on the model page |
+| [`lhjiang/anysplat`](https://huggingface.co/lhjiang/anysplat) | AnySplat (Stage 2) | Public (MIT) |
-```
-git clone https://github.com/facebookresearch/sam3.git
-cd sam3
-pip install -e .
+After accepting the agreements on the two gated pages, log in once:
+
+```bash
+hf auth login
```
-Optional:
+The two gated models need explicit local placement and are fetched by a
+single bootstrap script (run once, after `hf auth login`):
+```bash
+bash scripts/download_checkpoints.sh
```
-pip install -e ".[notebooks]"
-pip install -e ".[train,dev]"
-```
+
+| Model | Target |
+| --- | --- |
+| `facebook/sam-3d-objects` | `submodule/Sam-3d-objects/checkpoints/hf/` (Hydra config tree, not fetched by `from_pretrained`) |
+| `facebook/sam3` | `submodule/Prompt-Inpaint/checkpoints/sam3.pt` (~3.3 GB; placed locally so it isn't lost when `~/.cache` is cleaned) |
+
+The script is idempotent and is also invoked automatically by
+`run_object_generation_pipeline.sh` on first run. Use `--skip-sam3d`,
+`--skip-sam3`, or `--force` to control individual stages.
+
+`lhjiang/anysplat` is also fetched by the same bootstrap script (into the
+standard HuggingFace hub cache at `~/.cache/huggingface/hub/`). It is public
+(MIT), so no `hf auth login` is required for this one — pre-fetching just
+keeps the first Stage-2 run from doing a multi-GB download. Pass
+`--skip-anysplat` if you'd rather have AnySplat pull it lazily on first run.
------
-# **5. SAM-3D-Objects Environment**
+## **2.4 Docker image (alternative to 2.2)**
-Reference implementation:
- 🔗 https://github.com/facebookresearch/sam3d
- 🔗 https://huggingface.co/facebook/sam-3d-objects
+A pre-built image with the full environment (CUDA 12.8 base, the
+uv-managed `.venv`, the compiled AnySplat curope CUDA extension, and all
+PyPI deps) is published to Aliyun Container Registry:
```
-conda create -n sam_3d_body python=3.10 -y
-conda activate sam_3d_body
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:latest
```
-Install dependencies (excerpt):
+Using the image skips §2.2 entirely; you still need a clone of this repo on
+the host (the launcher and the host-side checkpoint directories) and HF
+access for the two gated models (§2.3).
-```
-pip install pytorch-lightning pyrender opencv-python yacs scikit-image einops timm dill pandas hydra-core ...
-```
+### **Prerequisites**
-Install Detectron2:
+- Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+ installed; an NVIDIA GPU with ≥ 24 GB VRAM
+- A local clone of this repo (`git clone --recursive ...`, see §2.1) — used
+ both for the `run_docker.sh` launcher and as the bind-mount root for
+ checkpoints, data, and outputs
+- One-time HuggingFace setup (§2.3) and a host-side run of
+ `bash scripts/download_checkpoints.sh`. Checkpoints live on the host and
+ are bind-mounted into the container, so this only runs once.
-```
-pip install 'git+https://github.com/facebookresearch/detectron2.git@a1ce2f9' \
- --no-build-isolation --no-deps
+### **Pull the image**
+
+```bash
+docker pull crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+docker tag crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 sam3d-gs:latest
```
-Optional: MoGe
+The re-tag is optional. `run_docker.sh` defaults to `sam3d-gs:latest`; if
+you'd rather not re-tag, prefix the launch with
+`SAM3D_IMAGE=crpi-.../sam3d_gs:v0.1` instead.
-```
-pip install git+https://github.com/microsoft/MoGe.git
+### **Launch the container**
+
+```bash
+./run_docker.sh # uses defaults
+./run_docker.sh /path/to/sam3d_gs # explicit project dir
+./run_docker.sh /path/to/sam3d_gs /mnt/hf_cache # custom HF cache root
+SAM3D_IMAGE=sam3d-gs:v0.1 ./run_docker.sh # pick a specific tag
+TORCH_HOME=/mnt/torch_cache ./run_docker.sh # custom torch hub cache
```
-------
+The launcher bind-mounts the relevant host paths into the container:
-# **6. Required HuggingFace Access**
+| Host path | Container path | Purpose |
+| --- | --- | --- |
+| `<project_dir>/submodule/Sam-3d-objects/checkpoints` | same | SAM-3D-Objects weights (gated) |
+| `<project_dir>/submodule/Prompt-Inpaint/checkpoints` | same | SAM3 weight (gated) |
+| `${HF_HOME:-$HOME/.cache/huggingface}` | `/root/.cache/huggingface` | AnySplat + other HF downloads |
+| `${TORCH_HOME:-$HOME/.cache/torch}` | `/root/.cache/torch` | `torch.hub` cache (DINOv2 etc.) |
+| `<project_dir>/data` | `/opt/sam3d_gs/data` | scratch input/output dir |
+| `<project_dir>/example` | `/opt/sam3d_gs/example` | bundled demo input/output |
-The pipeline requires access to the following models:
+Pipeline outputs land in whichever scene directory you point the launcher
+at — since `data/` and `example/` are bind-mounted, those outputs persist
+on the host after the container exits.
-- **SAM3**
- 🔗 https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
- 🔗 https://huggingface.co/facebook/sam-3d-objects
+### **Run the pipeline inside the container**
-Log in after requesting access:
+You land in `/opt/sam3d_gs/`. The image's `PATH` and `PYTHONPATH` already
+point at the bundled `.venv`, so you can call `python` and run scripts
+directly — **no `source .venv/bin/activate`**.
-```
-hf auth login
+```bash
+# Bundled demo:
+bash run_object_generation_pipeline.sh example/example.png
+
+# Your own image:
+bash run_object_generation_pipeline.sh data/my_scene/input_image.png
```
-------
+Stage 1/2/3 each behave exactly as in §3–§4 below.
-# **7. Running the Pipeline**
+### **What's baked into the image**
-Ensure the Conda activation path is correct:
+- CUDA 12.8 devel base + Python 3.11 `.venv` with every PyPI dep
+- Compiled AnySplat `curope` CUDA extension (sm_80 / 90 / 100 / 120)
+- `coacd`, `trimesh`, `mujoco` (so `pipeline/mesh2mjcf.py` works out of the box)
+- `sitecustomize.py` patching `torch.hub` to use the local cache without
+ pinging github first (avoids `RemoteDisconnected` on flaky networks once
+ the model is in `~/.cache/torch/hub`)
+- A global `git insteadOf` rule routing `https://github.com/` through
+ `https://gh-proxy.com/https://github.com/`, so in-container `git clone`
+ works on networks where direct github access is unreliable
-```
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
-```
+### **What's NOT baked in**
+
+- The three model checkpoint sets (SAM3, SAM-3D-Objects, AnySplat). They
+ live on the host and are bind-mounted via the table above. Run
+ `scripts/download_checkpoints.sh` once on the host.
+- Your input data. Drop it into `<project_dir>/data/<scene_name>/` and reference
+ it as `data/<scene_name>/input_image.png` inside the container.
+
+### **Caveats**
+
+- **Output files end up owned by `root` on the host.** The container runs
+ as root, so anything the pipeline writes into a bind-mounted directory
+ (`data/`, `example/`, the checkpoint dirs, etc.) shows up on the host
+ with uid 0. Two ways to deal with it:
+
+ ```bash
+ # After the container exits, fix ownership on the host:
+ sudo chown -R $(id -u):$(id -g) data/ example/
+
+ # Or run the container as your host user from the start.
+ # This avoids the chown step but can break EGL / pyrender setup
+ # in some Sam-3d-objects code paths, so prefer the chown fix.
+ # (To try anyway: edit run_docker.sh and add `--user $(id -u):$(id -g)`
+ # to the `docker run` invocation.)
+ ```
+
+- **The `gh-proxy.com` redirect is for users behind the GFW.** The image
+ bakes a `git config --global url."https://gh-proxy.com/https://github.com/".insteadOf https://github.com/`
+ rule so in-container `git clone` of github URLs survives flaky direct
+ access from mainland China. **Outside mainland China this hop is
+ unnecessary and may slow things down.** Disable it once per container
+ start:
+
+ ```bash
+ git config --global --unset url."https://gh-proxy.com/https://github.com/".insteadOf
+ ```
+
+ (Or bake your own image variant with the rule removed if you'd rather
+ not run that every time.)
------
-## **Stage 1 — Qwen3-VL + SAM3 (2D Mask Generation)**
+# **3. Quick Start**
-```
-bash run_agent_with_vllm.sh
-```
+> If you're using the Docker image (§2.4), start the container first with
+> `./run_docker.sh` — every command in this section runs **inside** the
+> container exactly as written.
-Outputs:
+Try the bundled demo image (the entry script activates `.venv` internally, so you don't need to do it yourself):
+```bash
+bash run_object_generation_pipeline.sh example/example.png
```
-outputs/master_with_vllm/masks/
+
+By default, all outputs are written next to the input image (in this case, into `example/`). Pass an explicit output directory as the second argument if you want them elsewhere:
+
+```bash
+bash run_object_generation_pipeline.sh example/example.png path/to/scene_dir
```
+The script runs three stages in sequence inside the single `.venv`:
+
+1. `submodule/Prompt-Inpaint/main.py` — segmentation + inpainting
+2. `pipeline/background_reconstruction.py` — AnySplat reconstruction + table alignment
+3. `pipeline/objects_generation.py` — per-object mesh + Gaussian export
+
------
-## **Stage 2 — SAM-3D-Objects Reconstruction**
+# **4. Pipeline Stages**
-```
-bash run_sam3d_from_masks.sh
+## **Stage 1 — Prompt-Inpaint (SAM3 segmentation + inpainting)**
+
+```bash
+python submodule/Prompt-Inpaint/main.py \
+ --resize-output \
+ --save-individual-masks \
+ --config submodule/Prompt-Inpaint/configs/items.yml \
+ --image path/to/input_image.png \
+ --output-dir path/to/scene_dir
```
-Outputs:
+Outputs (under `scene_dir/`):
-```
-sam-3d-objects/outputs/torch_save_pt/
-sam-3d-objects/gaussians/multi/
+- `input_image.png` — resized copy of the input
+- `clean_background.png` — inpainted background with all foreground objects removed
+- `bg_mask.png` — table / desktop mask used for plane fitting
+- `masks/<object_name>.png` — per-object binary masks
+
+## **Stage 2 — AnySplat + table-aligned 3D Gaussians**
+
+```bash
+python pipeline/background_reconstruction.py path/to/scene_dir
```
-------
+Behaviour:
-## **Optional: One-Click Execution**
+- Loads `clean_background.png` (and the matching `input_image.png`) inside each scene folder under the input directory.
+- Runs AnySplat to recover camera intrinsics/extrinsics, depth, and a 3DGS reconstruction.
+- Fits a RANSAC plane to `bg_mask.png`, derives an OBB via inner PCA, and builds a world-to-table transform.
+- Re-emits the splat in a MuJoCo-friendly frame.
+Useful flags:
+
+- `--model-id lhjiang/anysplat` — override the AnySplat HuggingFace model id
+- `--align-table` / `--no-align-table` — toggle RANSAC table alignment + the `bg_aligned.ply` export (default: enabled). When disabled, only the raw `bg.ply` is written
+- `--x-offset`, `--z-offset` — optional placement offsets (m) applied after alignment. Default: 0, so the aligned cloud sits at the origin
+
+Outputs (under `scene_dir/`):
+
+- `extrinsic.npy`, `intrinsic.npy` — camera parameters (world-to-camera; pixel-unit intrinsics)
+- `depth.npy`, `depth_visual.png` — depth from the splat reconstruction
+- `depth_ori.npy`, `depth_ori_visual.png` — depth from the original (non-inpainted) image
+- `scale.npy` — scene-level scale factor
+- `3d_assets/bg.ply` — raw 3DGS scene from AnySplat
+- `3d_assets/bg_aligned.ply` — table-aligned 3DGS scene (only when `--align-table` is on, which is the default)
+
+## **Stage 3 — SAM-3D-Objects per-object reconstruction**
+
+```bash
+python pipeline/objects_generation.py --input-dir path/to/scene_dir
```
-bash run_pipeline.sh
-```
+
+Useful flags:
+
+- `--project-root submodule/Sam-3d-objects` — checkpoint root
+- `--tag hf` — checkpoint subdirectory (`submodule/Sam-3d-objects/checkpoints/<tag>/pipeline.yaml`)
+- `--seed 42`, `--save-pt`, `--save-intermediate`
+
+For each mask, the stage runs SAM-3D-Objects inference, recovers the object's local scale by matching projected area + mean depth against the AnySplat depth map, and exports the asset at the origin.
+
+Outputs (under `scene_dir/3d_assets/`):
+
+- `<object_name>.obj` — per-object mesh sized for MuJoCo
+- `<object_name>.ply` — per-object 3D Gaussians sized for MuJoCo
+- `<object_name>_keyframe.npy` — mean XYZ of the final mesh
+- (with `--save-intermediate`) debug renderings and the pose-applied versions
------
-# **8. Q&A**
+# **5. Optional Tools**
-## **Q1: Download error “Consistency check failed: file should be XXXX but has size YYYY”?**
+## **`pipeline/mesh2mjcf.py` — mesh → MuJoCo MJCF converter**
-Cause: corrupted model shards in the HuggingFace cache due to unstable network.
+A standalone CLI that turns a single `.obj` or `.stl` mesh into MuJoCo MJCF
+assets (a `<name>_dependencies.xml` + `<name>.xml` pair, plus a per-asset
+mesh / texture directory). It is **not** wired into
+`run_object_generation_pipeline.sh`; use it on demand once Stage 3 has
+produced `<scene_dir>/3d_assets/<object_name>.obj`.
-Fix:
+By default, the output root is the parent directory of the input mesh, so
+running it on `scene_dir/3d_assets/cup.obj` writes a self-contained per-asset
+folder right next to the input:
```
-rm -rf sam-3d-objects/checkpoints/hf
-rm -rf ~/.cache/huggingface/hub # optional
-bash run_sam3d_from_masks.sh
+scene_dir/3d_assets/
+ cup.obj (original input, untouched)
+ cup/ (per-asset output folder, named after the obj stem)
+ cup.obj (copy of the input)
+ cup.mtl (if multi-material)
+ <texture files> (referenced by the MTL)
+ part_0.obj part_1.obj ... (if -cd)
+ mjcf/
+ cup.xml
+ cup_dependencies.xml
```
-Force fresh download:
+Mesh paths inside the emitted XMLs are written as `<asset_name>/<file>`, so the
+consuming MuJoCo scene should set `meshdir` (and `texturedir`) to the output
+root. Pass `-o/--output <dir>` to redirect.
-```
-force_download=True
-```
+### Required libraries
+
+Fresh installs via `scripts/install_env.sh` already include all three optional
+packages (`coacd`, `trimesh`, `mujoco`), so the table below is only for
+reference if you skip the bundled installer or build the environment
+piecemeal:
-## **Note on Coordinate System (PLY Output Orientation)**
+| Feature | Library | Manual install |
+| --- | --- | --- |
+| Multi-material OBJ splitting (automatic when an MTL file is present) | `trimesh` | `uv pip install trimesh` |
+| Convex decomposition (`-cd`) | `coacd`, `trimesh` | `uv pip install coacd trimesh` |
+| Preview viewer (`--verbose`) | `mujoco` | `uv pip install mujoco` |
-The 3D Gaussian `.ply` files exported by **SAM-3D-Objects** are expressed in the **camera coordinate system**, where:
+### Usage
-- **+Z axis** points **forward** from the camera
-- **+X axis** points right
-- **+Y axis** points downward (typical computer vision convention)
+```bash
+# Basic conversion (default colour / mass / inertia)
+python pipeline/mesh2mjcf.py path/to/cup.obj
-This means the reconstructed objects are aligned using **camera-forward Z-axis** rather than a world coordinate frame.
+# Custom RGBA, mass, and diagonal inertia
+python pipeline/mesh2mjcf.py path/to/cup.obj \
+ --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005
-If you want to visualize or place the objects in a global **world coordinate system**, you must apply a **camera-to-world transformation**:
-$$
-\mathbf{X}_{world} = \mathbf{R}_{c2w}\ \mathbf{X}_{camera} \ + \ \mathbf{t}_{c2w}
-$$
-Where:
+# Free-floating body + convex decomposition for accurate collisions
+python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint -cd
-- $\mathbf{R}_{c2w}$ is the rotation matrix from camera to world
-- $\mathbf{t}_{c2w}$ is the translation vector
-- $\mathbf{X}_{camera}$ is the Gaussian center in camera coordinates
-- $\mathbf{X}_{world}$ is the desired world coordinate position
+# Preview in mujoco.viewer after conversion
+python pipeline/mesh2mjcf.py path/to/cup.obj --verbose
+
+# Batch over all per-object meshes in one scene
+for obj in scene_dir/3d_assets/*.obj; do
+ python pipeline/mesh2mjcf.py "$obj" -cd
+done
+```
-After applying this transformation, the `.ply` will correctly align with your global scene, robotics simulator, or NeRF / COLMAP world frame.
------
-# **Citation**
+# **6. FAQ**
+
+**Q: HuggingFace download fails with “Consistency check failed: file should be XXXX but has size YYYY”.**
-### SAM3
+Corrupt shards in the HuggingFace cache. Clear and retry:
+```bash
+rm -rf submodule/Sam-3d-objects/checkpoints/hf
+rm -rf ~/.cache/huggingface/hub # optional, more aggressive
+bash run_object_generation_pipeline.sh path/to/input_image.png
```
-@article{kirillov2024sam3,
- title={SAM 3: Segment Anything in Images and Videos},
- author={Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
- year={2024},
- url={https://github.com/facebookresearch/sam3}
-}
+
+You can also force a fresh download by setting `force_download=True` when invoking the HuggingFace API.
+
+**Q: AnySplat reports “cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead”.**
+
+The CUDA extension was not built. Apply the `kernels.cu` patch documented in [`install.md`](install.md) and run `python setup.py build_ext --inplace`.
+
+**Q: `ImportError: cannot import name 'cached_download' from 'huggingface_hub'` during Stage 1 (Prompt-Inpaint / iopaint).**
+
+`huggingface_hub` ≥ 0.26 removed `cached_download`, but `diffusers` 0.27.x (which is what `iopaint` pulls in) still imports it. Downgrade `huggingface_hub` to 0.25.2:
+
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+ "huggingface_hub==0.25.2"
```
-### SAM-3D-Objects
+Fresh installs via `scripts/install_env.sh` already include this pin.
+
+**Q: `ImportError: cannot import name 'is_offline_mode' from 'huggingface_hub'` during Stage 1.**
+Same symptom from the other direction: `transformers` 5.x imports `is_offline_mode` from `huggingface_hub`, which doesn't exist in 0.25.2. Pin transformers to 4.48.3:
+
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+ "transformers==4.48.3"
```
+
+Fresh installs via `scripts/install_env.sh` already include this pin.
+
+------
+
+# **Citations**
+
+```bibtex
+@article{kirillov2024sam3,
+ title = {SAM 3: Segment Anything in Images and Videos},
+ author = {Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
+ year = {2024},
+ url = {https://github.com/facebookresearch/sam3}
+}
+
@article{wu2024sam3dobjects,
- title={SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
- author={Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
- year={2024},
- url={https://github.com/facebookresearch/sam3d}
+ title = {SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
+ author = {Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
+ year = {2024},
+ url = {https://github.com/facebookresearch/sam-3d-objects}
+}
+
+@article{jiang2024anysplat,
+ title = {AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views},
+ author = {Jiang, Lihan and others},
+ year = {2024},
+ url = {https://github.com/OpenRobotLab/AnySplat}
}
```
@@ -264,11 +485,9 @@ After applying this transformation, the `.ply` will correctly align with your gl
This project is built upon and integrates:
-- **SAM3**
- GitHub: https://github.com/facebookresearch/sam3
- HuggingFace: https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
- GitHub: https://github.com/facebookresearch/sam3d
- HuggingFace: https://huggingface.co/facebook/sam-3d-objects
+- **SAM3** — [GitHub](https://github.com/facebookresearch/sam3) · [HuggingFace](https://huggingface.co/facebook/sam3)
+- **SAM-3D-Objects** — [GitHub](https://github.com/facebookresearch/sam-3d-objects) · [HuggingFace](https://huggingface.co/facebook/sam-3d-objects)
+- **AnySplat** — [HuggingFace](https://huggingface.co/lhjiang/anysplat)
+- **Prompt-Inpaint** — [GitHub](https://github.com/MrZoyo/Prompt-Inpaint)
-We sincerely thank the authors for making their research and implementations publicly available.
\ No newline at end of file
+We thank the authors for making their research and implementations publicly available.
diff --git a/README_zh.md b/README_zh.md
index 5ab1418..0a1d1a6 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -4,327 +4,481 @@
-# **统一的多阶段 2D→3D 感知流水线**
+# **2D 单图 → 3D 物体生成流水线**
-## *vLLM × SAM3 × SAM-3D-Objects 集成*
+## *Prompt-Inpaint × AnySplat × SAM-3D-Objects 集成*
+
+> 本仓库最初 fork 自 [xyys2003/sam3d_gs](https://github.com/xyys2003/sam3d_gs)。
------
## **摘要**
-本仓库构建了一个完整的 2D → 3D 感知流水线,将 **大模型视觉理解、2D 多物体分割、3D Gaussian Splatting 重建** 三者进行统一整合。流水线由:
+本仓库将三个开源系统串联进单条流水线,使用一条命令即可完成单图 → 多物体 3D 资产的生成:
-- **vLLM**:提供 Qwen3-VL-8B-Thinking 视觉语言大模型推理
-- **SAM3**:执行高质量多物体 2D 分割
-- **SAM-3D-Objects**:将 RGB + mask 提升为 3D 高斯点(Gaussian Splat)
+- **Prompt-Inpaint**:基于 SAM3 的文本提示多物体分割 + 背景补全,产出每个物体的 mask 以及补全后的干净背景(clean background)。
+- **AnySplat**:单图前馈式 3D Gaussian Splatting 重建;额外的 RANSAC 桌面对齐将场景对齐到坐标系原点。
+- **SAM-3D-Objects**:以 RGB + mask 为输入,重建单物体的 mesh 与 Gaussian。
-为确保可复现性,每个模块均独立运行在各自的 Conda 环境中。系统支持 **分阶段执行**(先 2D 分割、再 3D 重建),也支持 **一键式全流程运行**。
+三者通过 `pipeline/` 下的脚本以及一个由 `uv` 管理的单一虚拟环境串联起来,整条流水线由一个 shell 命令驱动。
------
-# **1. 仓库克隆**
+# **1. 仓库结构**
```
-git clone --recursive https://github.com/xyys2003/sam3d_gs.git
-cd sam3d_gs
+.
+├── run_object_generation_pipeline.sh # 主入口:单图 → 3D 资产
+├── pipeline/
+│ ├── background_reconstruction.py # AnySplat + 桌面 RANSAC 对齐
+│ ├── objects_generation.py # SAM-3D-Objects 多物体重建
+│ ├── mesh2mjcf.py # 可选:把单物体 .obj 转成 MuJoCo MJCF
+│ └── utils.py # 渲染 / IO 公共工具
+└── submodule/
+ ├── Prompt-Inpaint/ # SAM3 分割 + 背景补全
+ ├── AnySplat/ # 单图 3DGS 重建
+ └── Sam-3d-objects/ # 单物体 mesh / GS 重建
```
-如果你忘记使用 `--recursive` 克隆,可运行:
+------
+
+# **2. 环境安装**
+
+整个项目运行在单个由 `uv` 管理的虚拟环境 `.venv/` 中。下面的步骤面向 RTX 50 系 GPU(CUDA 12.8,PyTorch 2.7),同样在 3090 / 4090 上验证通过。
+
+> **硬件**:推荐使用 **显存 ≥ 24 GB** 的 NVIDIA GPU。流水线会依次加载 SAM3、AnySplat、SAM-3D-Objects,其中 SAM-3D-Objects 阶段对显存最敏感。
+## **2.1 克隆仓库(含子模块)**
+
+```bash
+git clone --recursive https://github.com/Yuchi-Zhang-00/sam3d_gs.git
+cd sam3d_gs
```
+
+如果克隆时忘了 `--recursive`:
+
+```bash
git submodule update --init --recursive
```
-------
+## **2.2 安装 Python 环境**
-# **2. Conda 环境说明**
+推荐使用一键安装脚本:
-本项目使用三个互相隔离的 Conda 环境,以避免依赖冲突。
+```bash
+bash scripts/install_env.sh
+```
-| 环境名称 | 功能用途 | 路径 |
-| --------------- | ---------------------------------- | ----------------- |
-| `vllm` | 运行 Qwen3-VL-8B-Thinking 推理服务 | — |
-| `sam3` | 运行 SAM3 完成 2D 多物体分割 | `sam3/` |
-| `sam3d-objects` | 从 RGB + Mask 生成 3D Gaussian | `sam-3d-objects/` |
+脚本会创建 `.venv`、安装 CUDA 12.8 版 PyTorch、子模块依赖以及项目级运行时依赖。
-------
+如果想手动一步步执行,请查阅 [`install.md`](install.md)。该文档同时记录了 SAM-3D-Objects 的几处 requirements 文件 patch 和编译 AnySplat CUDA RoPE2D 内核所需的 `kernels.cu` 修改。
-# **3. vLLM 环境(Qwen3-VL 服务器)**
+## **2.3 HuggingFace 权限申请**
-### **3.1 创建环境**
+流水线依赖以下三个 HuggingFace 模型:
-```
-conda create -n vllm python=3.10 -y
-conda activate vllm
-```
+| 模型 | 使用方 | 访问 |
+| --- | --- | --- |
+| [`facebook/sam3`](https://huggingface.co/facebook/sam3) | Prompt-Inpaint(Stage 1) | **gated**,需在模型页面申请权限 |
+| [`facebook/sam-3d-objects`](https://huggingface.co/facebook/sam-3d-objects) | SAM-3D-Objects(Stage 3) | **gated**,需在模型页面申请权限 |
+| [`lhjiang/anysplat`](https://huggingface.co/lhjiang/anysplat) | AnySplat(Stage 2) | 公开(MIT) |
-### **3.2 安装 PyTorch(CUDA 12.x)**
+在两个 gated 模型页面接受协议后,登录一次:
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
- --index-url https://download.pytorch.org/whl/cu124
+```bash
+huggingface-cli login   # 本项目锁定 huggingface_hub==0.25.2,尚无 `hf` 命令;新版(≥ 0.34)中等价于 hf auth login
```
-### **3.3 安装 vLLM 与相关依赖**
+两个 gated 模型需要显式放置到本地,由一个 bootstrap 脚本一次性处理(登录后
+跑一次即可):
-```
-pip install vllm --extra-index-url https://download.pytorch.org/whl/cu124
-pip install transformers tiktoken sentencepiece xformers flashinfer-python
-pip install huggingface_hub
+```bash
+bash scripts/download_checkpoints.sh
```
-此配置已验证可稳定运行 **Qwen3-VL-8B-Thinking**。
+| 模型 | 落地位置 |
+| --- | --- |
+| `facebook/sam-3d-objects` | `submodule/Sam-3d-objects/checkpoints/hf/`(Hydra 配置树,不会被 `from_pretrained` 拉取) |
+| `facebook/sam3` | `submodule/Prompt-Inpaint/checkpoints/sam3.pt`(约 3.3 GB;放到本地以免 `~/.cache` 清理后丢失) |
-------
+该脚本是幂等的,且 `run_object_generation_pipeline.sh` 在首次运行时也会
+自动调用它。可以通过 `--skip-sam3d`、`--skip-sam3` 或 `--force` 单独控制每
+一个 stage。
-# **4. SAM3 环境**
+`lhjiang/anysplat` 也由同一个 bootstrap 脚本拉取(落到标准的 HuggingFace
+hub 缓存 `~/.cache/huggingface/hub/` 下)。它是公开模型(MIT),**不需要
+`hf auth login`**;提前拉只是避免 Stage 2 首次运行时做几 GB 的下载。
+传 `--skip-anysplat` 可以跳过这一步、让 AnySplat 首次运行时再 lazy 下载。
+
+------
-官方实现:
- 🔗 https://github.com/facebookresearch/sam3
- 🔗 https://huggingface.co/facebook/sam3
+## **2.4 Docker 镜像(2.1–2.3 的替代方案)**
-### **4.1 创建环境**
+仓库提供了一份预构建镜像,包含完整环境(CUDA 12.8 基础镜像、uv 管理的
+`.venv`、编译好的 AnySplat curope CUDA 扩展、所有 PyPI 依赖),已发布到
+阿里云容器镜像服务:
```
-cd sam3
-conda create -n sam3 python=3.10 -y
-conda activate sam3
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:latest
```
-### **4.2 安装 PyTorch(CUDA 12.x)**
+用镜像可以完全跳过 §2.2;但宿主机仍然需要克隆本仓库(用于
+`run_docker.sh` 启动脚本和 checkpoint 的 bind-mount 目录),以及完成
+§2.3 的 HuggingFace 权限申请。
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
- --index-url https://download.pytorch.org/whl/cu124
-```
+### **前置条件**
-### **4.3 克隆并安装 SAM3**
+- 已安装 Docker 和 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html);
+ 显存 ≥ 24 GB 的 NVIDIA GPU
+- 宿主机上已经 clone 了本仓库(`git clone --recursive ...`,见 §2.1)——
+ 用作 `run_docker.sh` 启动脚本所在位置,以及 checkpoint / 数据 / 输出的
+ bind-mount 根目录
+- 完成 §2.3 的一次性 HuggingFace 设置,并在宿主机执行过
+ `bash scripts/download_checkpoints.sh`。Checkpoint 留在宿主机、通过
+ bind-mount 进容器,所以只需要下载一次。
-```
-git clone https://github.com/facebookresearch/sam3.git
-cd sam3
-pip install -e .
+### **拉取镜像**
+
+```bash
+docker pull crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1
+docker tag crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 sam3d-gs:latest
```
-### **4.4 可选依赖(用于 Notebook 或训练)**
+`docker tag` 这一步可选。`run_docker.sh` 默认使用 `sam3d-gs:latest`;
+如果不想重 tag,可以在启动时加前缀
+`SAM3D_IMAGE=crpi-.../sam3d_gs:v0.1`。
-```
-pip install -e ".[notebooks]"
-pip install -e ".[train,dev]"
+### **启动容器**
+
+```bash
+./run_docker.sh # 全默认(推荐)
+./run_docker.sh /path/to/sam3d_gs # 显式传项目目录
+./run_docker.sh /path/to/sam3d_gs /mnt/hf_cache # 自定义 HF 缓存根
+SAM3D_IMAGE=sam3d-gs:v0.1 ./run_docker.sh # 指定镜像 tag
+TORCH_HOME=/mnt/torch_cache ./run_docker.sh # 自定义 torch hub 缓存
```
-------
+启动脚本会把宿主机的关键路径 bind-mount 进容器:
-# **5. SAM-3D-Objects 环境**
+| 宿主机路径 | 容器路径 | 用途 |
+| --- | --- | --- |
+| `<项目目录>/submodule/Sam-3d-objects/checkpoints` | 同名 | SAM-3D-Objects 权重(gated) |
+| `<项目目录>/submodule/Prompt-Inpaint/checkpoints` | 同名 | SAM3 权重(gated) |
+| `${HF_HOME:-$HOME/.cache/huggingface}` | `/root/.cache/huggingface` | AnySplat + 其它 HF 下载 |
+| `${TORCH_HOME:-$HOME/.cache/torch}` | `/root/.cache/torch` | `torch.hub` 缓存(DINOv2 等) |
+| `<项目目录>/data` | `/opt/sam3d_gs/data` | 输入 / 输出工作目录 |
+| `<项目目录>/example` | `/opt/sam3d_gs/example` | 自带示例输入 / 输出 |
-官方实现:
- 🔗 https://github.com/facebookresearch/sam3d
- 🔗 https://huggingface.co/facebook/sam-3d-objects
+流水线的产物会写到你指定的 scene 目录里。因为 `data/` 和 `example/`
+都是 bind-mount,容器退出后这些产物会留在宿主机上。
-### **5.1 创建环境**
+### **在容器内运行流水线**
-```
-conda create -n sam_3d_body python=3.10 -y
-conda activate sam_3d_body
-```
+进入容器后你会落到 `/opt/sam3d_gs/`。镜像里 `PATH` 和 `PYTHONPATH`
+已经指向自带的 `.venv`,可以直接调用 `python` 和脚本,**不需要
+`source .venv/bin/activate`**。
-### **5.2 安装 PyTorch(CUDA 12.x)**
+```bash
+# 自带示例:
+bash run_object_generation_pipeline.sh example/example.png
-```
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
- --index-url https://download.pytorch.org/whl/cu124
+# 自己的图:
+bash run_object_generation_pipeline.sh data/my_scene/input_image.png
```
-### **5.3 安装其他 Python 依赖**
+Stage 1 / 2 / 3 的行为和下面 §3–§4 完全一致。
-```
-pip install pytorch-lightning pyrender opencv-python yacs scikit-image einops timm dill pandas rich \
- hydra-core hydra-submitit-launcher hydra-colorlog pyrootutils webdataset chump networkx==3.2.1 \
- roma joblib seaborn wandb appdirs appnope ffmpeg cython jsonlines pytest xtcocotools loguru \
- optree fvcore black pycocotools tensorboard huggingface_hub
-```
+### **镜像里包含什么**
-### **5.4 安装 Detectron2(SAM3D 依赖)**
+- CUDA 12.8 devel 基础镜像 + Python 3.11 `.venv`,所有 PyPI 依赖
+- 已编译好的 AnySplat `curope` CUDA 扩展(sm_80 / 90 / 100 / 120)
+- `coacd`、`trimesh`、`mujoco`(`pipeline/mesh2mjcf.py` 开箱可用)
+- 一个 `sitecustomize.py`,monkey-patch `torch.hub`,使其在本地缓存
+ 存在时跳过 github 的 branch ping —— 这样网络不稳时也不会再触发
+ `RemoteDisconnected`(前提是 `~/.cache/torch/hub` 已有相应模型)
+- 全局的 `git insteadOf` 规则,把 `https://github.com/` 重写到
+ `https://gh-proxy.com/https://github.com/`,让容器内的
+ `git clone` 在 github 不稳的网络上也能工作
-```
-pip install 'git+https://github.com/facebookresearch/detectron2.git@a1ce2f9' \
- --no-build-isolation --no-deps
-```
+### **镜像里不包含什么**
-### **5.5 可选安装:MoGe**
-
-```
-pip install git+https://github.com/microsoft/MoGe.git
-```
+- 三套模型 checkpoint(SAM3 / SAM-3D-Objects / AnySplat)。它们留在
+ 宿主机上、通过上面的 bind-mount 进容器。在宿主机执行一次
+ `scripts/download_checkpoints.sh` 即可。
+- 你自己的输入数据。放到 `<项目目录>/data/<场景名>/` 下,容器里通过
+  `data/<场景名>/input_image.png` 引用。
-------
+### **使用须知**
-# **6. HuggingFace 权限申请**
+- **流水线写出的文件在宿主机上属主是 `root`**。容器内是 root 用户跑的,
+ 所以写进 bind-mount 目录(`data/`、`example/`、checkpoint 目录等)
+ 的文件,在宿主机上看到的所有者是 uid 0。两种处理方式:
-本项目依赖两个需要授权的模型:
+ ```bash
+ # 容器退出后,在宿主机改回当前用户:
+ sudo chown -R $(id -u):$(id -g) data/ example/
-- **SAM3**
- 🔗 https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
- 🔗 https://huggingface.co/facebook/sam-3d-objects
+ # 或者从一开始就让容器用宿主机的 uid 跑。
+ # 优点是不用 chown,缺点是 Sam-3d-objects 里某些 EGL / pyrender
+ # 代码路径在非 root 下可能跑不通,所以一般建议用上面的 chown 方案。
+ # (想试的话: 编辑 run_docker.sh,给 docker run 加上
+ # `--user $(id -u):$(id -g)`)
+ ```
-请在 HuggingFace 对应页面申请权限,并登录:
+- **`gh-proxy.com` 这个重写是给国内用户准备的**。镜像里内置了一条
+  `git config --global url."https://gh-proxy.com/https://github.com/".insteadOf https://github.com/` 规则,
+ 让容器里 `git clone` github 仓库在 GFW 网络下也能成功。**在境外网络
+ 环境下这个跳转是多余的,可能反而拖慢速度**。每次进容器后执行一次即可
+ 禁用:
-```
-hf auth login
-```
+ ```bash
+ git config --global --unset url."https://gh-proxy.com/https://github.com/".insteadOf
+ ```
-脚本会自动使用你的 Token。
+ (或者自己 commit 一个去掉这条规则的镜像变体,免得每次都跑。)
------
-# **7. 运行流程**
+# **3. 快速开始**
-运行脚本前,请设置你的 Conda 激活脚本路径:
+> 如果你用的是 Docker 镜像(§2.4),先跑 `./run_docker.sh` 进容器;
+> 本节后面所有命令都在**容器内**原样执行。
+先用仓库自带的示例图跑一遍即可(入口脚本会自动 `source .venv`,无需手动激活环境):
+
+```bash
+bash run_object_generation_pipeline.sh example/example.png
```
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
+
+默认所有产物会写到输入图像所在目录(此例中即 `example/`)。若想显式指定输出目录,可以传第二个参数:
+
+```bash
+bash run_object_generation_pipeline.sh example/example.png path/to/scene_dir
```
+脚本会在同一个 `.venv` 中按顺序执行三个 stage:
+
+1. `submodule/Prompt-Inpaint/main.py` — 分割 + 背景补全
+2. `pipeline/background_reconstruction.py` — AnySplat 重建 + 桌面对齐
+3. `pipeline/objects_generation.py` — 单物体 mesh / Gaussian 导出
+
------
-## **阶段 1:Qwen3-VL + SAM3 生成 2D Mask**
+# **4. 各 Stage 详解**
-执行:
+## **Stage 1 — Prompt-Inpaint(SAM3 分割 + 背景补全)**
-```
-bash run_agent_with_vllm.sh
+```bash
+python submodule/Prompt-Inpaint/main.py \
+ --resize-output \
+ --save-individual-masks \
+ --config submodule/Prompt-Inpaint/configs/items.yml \
+ --image path/to/input_image.png \
+ --output-dir path/to/scene_dir
```
-此脚本会:
+输出(位于 `scene_dir/`):
-1. 激活 `vllm` 环境
-2. 启动 vLLM 服务,加载 Qwen3-VL
-3. 激活 `sam3` 环境
-4. 运行 `pipeline/run_sam3_agent_full.py`
-5. 生成多物体 mask
+- `input_image.png` — 输入图像的 resize 副本
+- `clean_background.png` — 去除所有前景物体后的补全背景
+- `bg_mask.png` — 用于平面拟合的桌面 mask
+- `masks/<物体名>.png` — 每个物体的二值 mask
-输出目录:
+## **Stage 2 — AnySplat + 桌面对齐 3DGS**
-```
-outputs/master_with_vllm/masks/
+```bash
+python pipeline/background_reconstruction.py path/to/scene_dir
```
-------
+行为:
-## **阶段 2:SAM-3D-Objects 重建 3D Gaussian**
+- 递归读取输入目录下每个场景文件夹中的 `clean_background.png` 和配套的 `input_image.png`。
+- 运行 AnySplat 恢复相机内外参、深度、3DGS 重建结果。
+- 对 `bg_mask.png` 做 RANSAC 平面拟合,结合内部 PCA 得到 OBB,构建 world → table 变换。
+- 输出 Mujoco 坐标系下的对齐点云。
-执行:
+常用参数:
-```
-bash run_sam3d_from_masks.sh
-```
+- `--model-id lhjiang/anysplat` — 覆盖 AnySplat 的 HuggingFace 模型 id
+- `--align-table` / `--no-align-table` — 是否启用 RANSAC 桌面对齐并导出 `bg_aligned.ply`(默认启用)。关闭时只导出原始 `bg.ply`
+- `--x-offset`、`--z-offset` — 对齐后可选的放置偏移(米)。默认 0,对齐后的点云落在原点
-此脚本会:
+输出(位于 `scene_dir/`):
-1. 激活 `sam3d-objects` 环境
-2. 确保 SAM-3D-Objects 的 checkpoint 下载完成
-3. 加载 RGB + masks
-4. 生成每个物体的 `.pt` 文件
-5. 重建并导出 3D Gaussian (`.ply`, `.gif`)
+- `extrinsic.npy`、`intrinsic.npy` — 相机参数(world-to-camera;像素单位内参)
+- `depth.npy`、`depth_visual.png` — 来自 splat 重建的深度
+- `depth_ori.npy`、`depth_ori_visual.png` — 来自原始(未补全)图像的深度
+- `scale.npy` — 场景级缩放因子
+- `3d_assets/bg.ply` — AnySplat 输出的原始 3DGS 场景
+- `3d_assets/bg_aligned.ply` — 桌面对齐后的 3DGS 场景(仅当 `--align-table` 启用时输出,默认启用)
-输出目录:
+## **Stage 3 — SAM-3D-Objects 单物体重建**
-```
-sam-3d-objects/outputs/torch_save_pt/
-sam-3d-objects/gaussians/multi/
+```bash
+python pipeline/objects_generation.py --input-dir path/to/scene_dir
```
-------
+常用参数:
-## **可选:一键式全流程执行**
+- `--project-root submodule/Sam-3d-objects` — checkpoint 根目录
+- `--tag hf` — checkpoint 子目录(`submodule/Sam-3d-objects/checkpoints/<tag>/pipeline.yaml`)
+- `--seed 42`、`--save-pt`、`--save-intermediate`
-```
-bash run_pipeline.sh
-```
+针对每一个 mask,该 stage 运行 SAM-3D-Objects 推理,通过对比投影面积与平均深度恢复物体局部尺寸,并把资产以原点姿态导出。
-该脚本会自动完成阶段 1 + 阶段 2。
+输出(位于 `scene_dir/3d_assets/`):
+
+- `<物体名>.obj` — Mujoco 单位的物体 mesh
+- `<物体名>.ply` — Mujoco 单位的物体 3D Gaussian
+- `<物体名>_keyframe.npy` — 最终 mesh 的平均 XYZ
+- 当传入 `--save-intermediate` 时,额外导出调试用的渲染和带姿态的中间产物
------
-# **Q&A**
+# **5. 可选工具**
+
+## **`pipeline/mesh2mjcf.py` — mesh → MuJoCo MJCF 转换器**
-## **Q1:下载模型时报 “Consistency check failed”?**
+一个独立的命令行工具,把单个 `.obj` 或 `.stl` 文件转成 MuJoCo MJCF 资产
+(`<物体名>_dependencies.xml` + `<物体名>.xml` 两个 XML,以及一个 per-asset 的
+mesh / texture 目录)。它**没有**被串进
+`run_object_generation_pipeline.sh`;当 Stage 3 产出
+`<scene_dir>/3d_assets/<物体名>.obj` 之后按需调用即可。
-**原因:** 下载中断导致 HuggingFace 缓存中出现损坏的模型分片。
- **解决:删除损坏缓存并重新下载。**
+默认输出根目录是输入 mesh 的父目录,所以对
+`scene_dir/3d_assets/cup.obj` 运行后会在输入旁边生成一个 per-asset 目录:
```
-rm -rf sam-3d-objects/checkpoints/hf
-rm -rf ~/.cache/huggingface/hub # 可选
-bash run_sam3d_from_masks.sh
+scene_dir/3d_assets/
+ cup.obj (原输入,不变)
+ cup/ (以 obj 名命名的 per-asset 输出目录)
+ cup.obj (输入的拷贝)
+ cup.mtl (若多材质)
+ <纹理文件> (MTL 引用的贴图)
+ part_0.obj part_1.obj ... (若 -cd)
+ mjcf/
+ cup.xml
+ cup_dependencies.xml
```
-若要强制重新下载,可使用:
+生成的 XML 中的 mesh 路径写作 `<物体名>/<文件名>`(相对于输出根目录),所以消费方的 MuJoCo
+scene 需要把 `meshdir`(和 `texturedir`)设为输出根目录。通过
+`-o/--output <目录>` 可以重定向。
-```
-force_download=True
-```
+### 所需依赖
+
+走 `scripts/install_env.sh` 装环境的话,`coacd`、`trimesh`、`mujoco` 三个包
+默认就装好了。下表只在你跳过一键脚本、想手动按需装时作为参考:
-## **关于坐标系说明(PLY 输出方向)**
+| 功能 | 依赖库 | 手动安装命令 |
+| --- | --- | --- |
+| 多材质 OBJ 自动拆分(当存在 MTL 文件时触发) | `trimesh` | `uv pip install trimesh` |
+| 凸分解(`-cd`) | `coacd`、`trimesh` | `uv pip install coacd trimesh` |
+| 预览查看器(`--verbose`) | `mujoco` | `uv pip install mujoco` |
-通过 **SAM-3D-Objects** 导出的 3D Gaussian `.ply` 文件默认处于 **相机坐标系** 下,其中:
+### 用法
-- **+Z 轴** 为相机前向
-- **+X 轴** 指向右侧
-- **+Y 轴** 指向下方(典型计算机视觉坐标系)
+```bash
+# 基本用法(使用默认颜色 / 质量 / 惯性)
+python pipeline/mesh2mjcf.py path/to/cup.obj
-因此,重建的对象是以 **相机前向 Z 轴** 对齐的,而不是世界坐标系。
+# 自定义 RGBA、质量、对角惯性
+python pipeline/mesh2mjcf.py path/to/cup.obj \
+ --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005
-如果需要将 `.ply` 放置到全局 **世界坐标系** 中(例如仿真器、机器人场景、NeRF / COLMAP world frame),必须执行一次 **相机 → 世界坐标系转换**:
-$$
-\mathbf{X}_{world} = \mathbf{R}_{c2w}\ \mathbf{X}_{camera} \ + \ \mathbf{t}_{c2w}
-$$
-其中:
+# 自由关节 + 凸分解,得到更精确的碰撞几何
+python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint -cd
-- $\mathbf{R}_{c2w}$:相机到世界的旋转矩阵
-- $\mathbf{t}_{c2w}$:相机到世界的平移向量
-- $\mathbf{X}_{camera}$:高斯中心的相机系坐标
-- $\mathbf{X}_{world}$:转换后的世界系坐标
+# 在 mujoco.viewer 中预览
+python pipeline/mesh2mjcf.py path/to/cup.obj --verbose
+
+# 一键批量转换某个场景下所有物体
+for obj in scene_dir/3d_assets/*.obj; do
+ python pipeline/mesh2mjcf.py "$obj" -cd
+done
+```
-完成转换后,你即可将 `.ply` 与全局场景或机器人环境正确对齐。
------
-# **引用(Citation)**
+# **6. 常见问题**
-### **SAM3**
+**Q:HuggingFace 下载报 "Consistency check failed: file should be XXXX but has size YYYY"。**
+HuggingFace 缓存中的 shard 损坏。清理后重试:
+
+```bash
+rm -rf submodule/Sam-3d-objects/checkpoints/hf
+rm -rf ~/.cache/huggingface/hub # 可选,更激进
+bash run_object_generation_pipeline.sh path/to/input_image.png
```
-@article{kirillov2024sam3,
- title={SAM 3: Segment Anything in Images and Videos},
- author={Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
- year={2024},
- url={https://github.com/facebookresearch/sam3}
-}
+
+也可以在调用 HuggingFace API 时通过 `force_download=True` 强制重新下载。
+
+**Q:AnySplat 提示 "cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead"。**
+
+CUDA 扩展没编译。请按 [`install.md`](install.md) 里的说明修改 `kernels.cu`,再执行 `python setup.py build_ext --inplace`。
+
+**Q:Stage 1 (Prompt-Inpaint / iopaint) 报 `ImportError: cannot import name 'cached_download' from 'huggingface_hub'`。**
+
+`huggingface_hub` ≥ 0.26 把 `cached_download` 删掉了,但 `iopaint` 依赖的 `diffusers` 0.27.x 还在 import 它。把 `huggingface_hub` 锁到 0.25.2:
+
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+ "huggingface_hub==0.25.2"
```
-### **SAM-3D-Objects**
+新走 `scripts/install_env.sh` 的环境已经带上这个 pin。
+
+**Q:Stage 1 报 `ImportError: cannot import name 'is_offline_mode' from 'huggingface_hub'`。**
+
+这是同一个版本冲突的另一面:`transformers` 5.x 会 import `huggingface_hub.is_offline_mode`,而 `huggingface_hub` 0.25.2 没有这个符号。把 transformers 锁到 4.48.3:
+```bash
+source .venv/bin/activate
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+ "transformers==4.48.3"
```
+
+新走 `scripts/install_env.sh` 的环境已经带上这个 pin。
+
+------
+
+# **引用**
+
+```bibtex
+@article{kirillov2024sam3,
+ title = {SAM 3: Segment Anything in Images and Videos},
+ author = {Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others},
+ year = {2024},
+ url = {https://github.com/facebookresearch/sam3}
+}
+
@article{wu2024sam3dobjects,
- title={SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
- author={Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
- year={2024},
- url={https://github.com/facebookresearch/sam3d}
+ title = {SAM-3D-Objects: Segment Anything in 3D Using 2D Masks},
+ author = {Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others},
+ year = {2024},
+ url = {https://github.com/facebookresearch/sam-3d-objects}
+}
+
+@article{jiang2024anysplat,
+ title = {AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views},
+ author = {Jiang, Lihan and others},
+ year = {2024},
+ url = {https://github.com/OpenRobotLab/AnySplat}
}
```
------
-# **致谢(Acknowledgements)**
+# **致谢**
-本项目基于以下官方实现构建:
+本项目基于并整合了以下工作:
-- **SAM3**
- GitHub: https://github.com/facebookresearch/sam3
- HuggingFace: https://huggingface.co/facebook/sam3
-- **SAM-3D-Objects**
- GitHub: https://github.com/facebookresearch/sam3d
- HuggingFace: https://huggingface.co/facebook/sam-3d-objects
+- **SAM3** — [GitHub](https://github.com/facebookresearch/sam3) · [HuggingFace](https://huggingface.co/facebook/sam3)
+- **SAM-3D-Objects** — [GitHub](https://github.com/facebookresearch/sam-3d-objects) · [HuggingFace](https://huggingface.co/facebook/sam-3d-objects)
+- **AnySplat** — [HuggingFace](https://huggingface.co/lhjiang/anysplat)
+- **Prompt-Inpaint** — [GitHub](https://github.com/MrZoyo/Prompt-Inpaint)
-感谢原作者开放其卓越的研究成果与代码,使本流水线得以实现。
\ No newline at end of file
+感谢原作者开放其研究成果与代码。
diff --git a/example/example.png b/example/example.png
new file mode 100644
index 0000000..9caff67
Binary files /dev/null and b/example/example.png differ
diff --git a/install.md b/install.md
new file mode 100644
index 0000000..fa01a9c
--- /dev/null
+++ b/install.md
@@ -0,0 +1,141 @@
+# Install on RTX 50-series GPUs (torch 2.7.0 + cu128; also works on RTX 3090 / 4090)
+
+> **Don't want to build the environment locally?** A pre-built Docker
+> image is published; see [README §2.4 "Docker image"](README.md#24-docker-image-alternative-to-2123)
+> for the pull / launch flow. This document is only the native-install
+> reference.
+
+One-command installer:
+
+```
+bash scripts/install_env.sh
+```
+
+This document is the manual step-by-step installation reference. Use it if you want to inspect or run each installation step yourself.
+
+
+# Run the installation commands below
+
+```
+git submodule update --init --recursive
+
+uv venv --python 3.11
+
+source .venv/bin/activate
+
+export PYTHONPATH="$(pwd)/submodule/Sam-3d-objects/notebook:$(pwd)/submodule/Sam-3d-objects:${PYTHONPATH:-}"
+
+uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
+
+# uv pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
+
+uv pip install -r submodule/AnySplat/requirements.txt --no-build-isolation
+
+export PIP_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html"
+
+uv pip install hatch-requirements-txt editables wheel
+
+uv pip install -e './submodule/Sam-3d-objects[dev]'
+
+uv pip install -e './submodule/Sam-3d-objects[p3d]' --no-build-isolation
+
+uv pip install -e "./submodule/Sam-3d-objects[inference]" --no-build-isolation --find-links https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html
+
+# Note: do NOT pass -U here. With -U, uv would upgrade transitive deps such
+# as torch (via iopaint) and clobber the CUDA-pinned torch above.
+# Also note transformers is pinned to ==4.48.3 (not >=): transformers 5.x
+# imports `is_offline_mode` from huggingface_hub, which doesn't exist in
+# 0.25.2, and would crash iopaint even with hub pinned below.
+uv pip install --index-strategy unsafe-best-match \
+ "transformers==4.48.3" \
+ "iopaint>=1.2.0" \
+ "diffusers>=0.27.2" \
+ "numpy<2.0" \
+ "opencv-python>=4.8.0" \
+ "pyyaml>=6.0" \
+ "requests>=2.31.0" \
+ "tqdm>=4.66.0" \
+ "setuptools" \
+ "einops"
+
+# Pin huggingface_hub last, with --force-reinstall --no-deps so it can be
+# downgraded past other packages' transitive `>=0.26` constraints.
+# Reason: diffusers 0.27.2 (and the iopaint stack on top) still import
+# `cached_download` from huggingface_hub, which was removed in hub >=0.26.
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+ "huggingface_hub==0.25.2"
+
+uv pip install --index-strategy unsafe-best-match "git+https://github.com/facebookresearch/sam3.git"
+```
+
+## Gated model access (SAM3 and SAM-3D-Objects)
+
+`facebook/sam3` and `facebook/sam-3d-objects` are gated models on HuggingFace. Request access on both model pages first, then log in:
+```
+huggingface-cli login
+```
+
+
+## Fix the AnySplat warning: `Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead`
+```
+cd submodule/AnySplat/src/model/encoder/backbone/croco/curope/
+```
+In `kernels.cu`, change:
+
+```
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
+```
+
+to:
+
+```
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {
+```
+
+Then run:
+```
+python setup.py build_ext --inplace
+```
+
+
+## Extra dependencies for `pipeline/mesh2mjcf.py`
+
+`scripts/install_env.sh` already installs `coacd`, `trimesh`, and `mujoco` by
+default, so `mesh2mjcf.py` works out of the box (including `-cd` and
+`--verbose`). The commands below are only useful if you build the environment
+piecemeal and want to add the individual packages on demand:
+
+```
+# Convex decomposition (-cd)
+uv pip install coacd trimesh
+
+# Preview viewer (--verbose)
+uv pip install mujoco
+```
+
+
+# Completed modifications compared to the original repository:
+
+submodule/Sam-3d-objects/pyproject.toml:
+```
+-PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu121"
+
+change to
+
++PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu128"
+```
+requirements.inference.txt:
+```
+kaolin==0.17.0 change to kaolin==0.18.0
+```
+requirements.txt:
+```
+nvidia-pyindex==1.0.9 change to # nvidia-pyindex==1.0.9 (comment it out)
+
+torchaudio==2.5.1+cu121 change to torchaudio,
+xformers==0.0.28.post3 change to xformers (remove the pinned torchaudio and xformers versions)
+```
+requirements.p3d.txt:
+```
+tflash_attn==2.8.3 change to flash_attn==2.7.3
+```
\ No newline at end of file
diff --git a/pipeline/__init__.py b/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pipeline/background_reconstruction.py b/pipeline/background_reconstruction.py
new file mode 100644
index 0000000..16a5472
--- /dev/null
+++ b/pipeline/background_reconstruction.py
@@ -0,0 +1,376 @@
+"""Batch RANSAC-based table alignment + 3D Gaussian export on top of AnySplat.
+
+This is a cleaned-up rewrite of `submodule/AnySplat/inference_ransac_batch.py`.
+The script now lives outside the AnySplat submodule, so it explicitly inserts
+the AnySplat root onto `sys.path` to keep the original imports working.
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import cv2
+import imageio
+import numpy as np
+import torch
+
+# ===== Make AnySplat's `src.*` and `utils.py` importable when running from the
+# repository root (this file no longer lives inside submodule/AnySplat).
+_ANYSPLAT_ROOT = Path(__file__).resolve().parent.parent / "submodule" / "AnySplat"
+sys.path.insert(0, str(_ANYSPLAT_ROOT))
+sys.path.insert(0, str(_ANYSPLAT_ROOT.parent)) # mirrors original sys.path entry
+
+from src.misc.image_io import save_interpolated_video # noqa: E402, F401
+from src.model.ply_export import export_ply # noqa: E402
+from src.model.model.anysplat import AnySplat # noqa: E402
+from src.utils.image import process_image # noqa: E402
+from utils import ( # noqa: E402
+ align_points_to_table,
+ depth_to_points,
+ fit_plane_ransac_safe_2,
+ plane_coordinate_system,
+ render_depth_from_points,
+ shrink_mask_erode,
+)
+
+
+# ===== RANSAC / inner-rectangle hyperparameters =====
+RANSAC_NUM_ITERS = 600  # `num_iters` passed to fit_plane_ransac_safe_2
+RANSAC_DIST_THRESH = 0.005  # tabletops are usually very flat
+RANSAC_SAMPLE_N = 40000  # `sample_N` passed to fit_plane_ransac_safe_2
+INNER_PERCENTILE = (20, 80)  # crop to the central 60% to avoid edges
+MIN_INNER_POINTS = 50  # fewer cropped inliers than this raises RuntimeError
+
+# ===== Scene normalisation =====
+# Quantile of |xyz| used as the reference radius before rescaling, and the
+# target radius the reference is mapped to.
+SCALE_QUANTILE = 0.95
+SCALE_TARGET_RANGE = 0.6
+
+# ===== Post-alignment scene placement =====
+# Offsets applied after table-alignment so the aligned cloud can be shifted
+# from the origin if the downstream consumer needs it (e.g. to place it on a
+# Mujoco table). Defaults are 0, meaning the aligned cloud sits at the origin.
+DEFAULT_X_OFFSET = 0.0
+DEFAULT_Z_OFFSET = 0.0
+
+# ===== Mask shrink before plane fitting =====
+BG_MASK_SHRINK_RATIO = 0.12  # `ratio` passed to shrink_mask_erode on bg_mask.png
+
+# ===== Default model id =====
+DEFAULT_MODEL_ID = "lhjiang/anysplat"
+
+
+def compute_table_geometry_ransac(depth, mask, intrinsic, extrinsic):
+    """Fit the tabletop plane and derive a world -> table alignment transform.
+
+    Masked depth pixels are lifted to camera-frame points, a plane is fitted with
+    RANSAC, and PCA over the central inliers yields the in-plane long/short axes.
+
+    Args:
+        depth: depth map, forwarded to `depth_to_points`.
+        mask: tabletop mask, forwarded to `depth_to_points`.
+        intrinsic: camera matrix; fx, fy, cx, cy are read from [0, 0], [1, 1],
+            [0, 2] and [1, 2].
+        extrinsic: world-to-camera transform (X_cam = R @ X_world + t) with the
+            rotation in [:3, :3] and the translation in [:3, 3].
+
+    Returns:
+        dict with "length"/"width" (extent of the inner inlier region), the
+        camera-frame axes "normal", "dir_long", "dir_short", and the alignment
+        transforms "R_align_cam", "t_align_cam", "R_align_world", "t_align_world".
+
+    Raises:
+        RuntimeError: if fewer than MIN_INNER_POINTS inliers survive the crop.
+    """
+    # Back-project the masked depth pixels into camera-frame 3D points.
+    fx, fy = intrinsic[0, 0], intrinsic[1, 1]
+    cx, cy = intrinsic[0, 2], intrinsic[1, 2]
+    points_cam = depth_to_points(depth, mask, fx, fy, cx, cy)
+    print("points_cam:", points_cam.shape)
+
+    # RANSAC plane fit; only its inliers are used from here on.
+    normal_cam, center_cam, inlier_idx = fit_plane_ransac_safe_2(
+        points_cam,
+        num_iters=RANSAC_NUM_ITERS,
+        dist_thresh=RANSAC_DIST_THRESH,
+        sample_N=RANSAC_SAMPLE_N,
+    )
+    print(f"RANSAC normal: {normal_cam}")
+
+    # Express the inliers in 2D coordinates of the in-plane basis (u, v).
+    u, v = plane_coordinate_system(normal_cam)
+    rel = points_cam[inlier_idx] - center_cam
+    pts_2d = np.stack([rel @ u, rel @ v], axis=1)
+
+    # Keep the central percentile band so ragged table edges cannot skew PCA.
+    x, y = pts_2d[:, 0], pts_2d[:, 1]
+    x_min, x_max = np.percentile(x, list(INNER_PERCENTILE))
+    y_min, y_max = np.percentile(y, list(INNER_PERCENTILE))
+    pts_inner = pts_2d[(x > x_min) & (x < x_max) & (y > y_min) & (y < y_max)]
+    if pts_inner.shape[0] < MIN_INNER_POINTS:
+        raise RuntimeError("Too few inner RANSAC points")
+
+    # PCA: the first right-singular vector is the dominant in-plane direction.
+    mean_2d = pts_inner.mean(axis=0)
+    centered = pts_inner - mean_2d
+    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
+    dir_long_cam = Vt[0][0] * u + Vt[0][1] * v
+    dir_long_cam /= np.linalg.norm(dir_long_cam)
+    dir_short_cam = np.cross(normal_cam, dir_long_cam)
+    dir_short_cam /= np.linalg.norm(dir_short_cam)
+
+    # PCA leaves the sign free: make the long axis point along world +X. R_cw is
+    # world-to-camera, so its transpose takes a camera-frame direction to world.
+    R_cw = extrinsic[:3, :3]
+    if (R_cw.T @ dir_long_cam)[0] < 0:
+        dir_long_cam = -dir_long_cam
+        dir_short_cam = -dir_short_cam
+
+    # Extents of the inner points along the two principal axes.
+    proj = centered @ Vt[:2].T
+    length, width = proj.max(0) - proj.min(0)
+    center_plane_cam = center_cam + mean_2d[0] * u + mean_2d[1] * v
+
+    # Rows of R_align_cam are the table axes; compose with the extrinsic for world.
+    R_align_cam = np.stack([dir_long_cam, dir_short_cam, normal_cam], axis=1).T
+    t_align_cam = -R_align_cam @ center_plane_cam
+    R_align_world = R_align_cam @ R_cw
+    t_align_world = R_align_cam @ extrinsic[:3, 3] + t_align_cam
+    print("RANSAC inlier ratio:", len(inlier_idx) / points_cam.shape[0])
+    return {
+        "length": float(length),
+        "width": float(width),
+        "normal": normal_cam,
+        "dir_long": dir_long_cam,
+        "dir_short": dir_short_cam,
+        "R_align_cam": R_align_cam,
+        "t_align_cam": t_align_cam,
+        "R_align_world": R_align_world,
+        "t_align_world": t_align_world,
+    }
+
+
+def _save_depth_npy_and_viz(depth, image_folder, base_name):
+    """Save raw depth as `<base_name>.npy` plus an 8-bit `<base_name>_visual.png`."""
+    np.save(Path(image_folder) / f"{base_name}.npy", depth)
+    lo, span = depth.min(), depth.max() - depth.min()
+    # A constant depth map has zero range; render it black instead of dividing by zero.
+    viz = ((depth - lo) / (span if span > 0 else 1.0) * 255).astype(np.uint8)
+    imageio.imwrite(Path(image_folder) / f"{base_name}_visual.png", viz)
+
+
+def process_single_image(image_path, model, device, args):
+ """Run AnySplat on one `clean_background.png` and export aligned assets."""
+ image_folder = os.path.dirname(image_path)
+ image_ori_path = os.path.join(image_folder, "input_image.png")
+
+ # Load images.
+ image = process_image(image_path)
+ image_ori = process_image(image_ori_path)
+ images_ori = torch.stack([image_ori], dim=0).unsqueeze(0).to(device)
+ images = torch.stack([image], dim=0).unsqueeze(0).to(device)
+ b, v, _, H, W = images.shape
+
+ # Inference.
+ with torch.no_grad():
+ gaussians, pred_context_pose, depth_dict = model.inference((images + 1) * 0.5)
+ gaussians_ori, pred_context_pose_ori, depth_dict_ori = model.inference(
+ (images_ori + 1) * 0.5
+ )
+ depth_ori = depth_dict_ori["depth"][0][0].squeeze().cpu().numpy()
+ _save_depth_npy_and_viz(depth_ori, image_folder, "depth_ori")
+
+ # Camera parameters. AnySplat returns camera-to-world; we store world-to-camera.
+ pred_all_extrinsic = pred_context_pose["extrinsic"][0][0].inverse().cpu().numpy()
+ pred_all_intrinsic = pred_context_pose["intrinsic"][0][0].cpu().numpy()
+ print(f"Processing {os.path.basename(image_folder)}: converted intrinsics:")
+ print(
+ f" fx: {pred_all_intrinsic[0, 0] * W:.2f}, "
+ f"fy: {pred_all_intrinsic[1, 1] * H:.2f}"
+ )
+ print(
+ f" cx: {pred_all_intrinsic[0, 2] * W:.2f}, "
+ f"cy: {pred_all_intrinsic[1, 2] * H:.2f}"
+ )
+
+ # Scale normalised intrinsics to pixel units.
+ pred_all_intrinsic[0, :] = pred_all_intrinsic[0, :] * W
+ pred_all_intrinsic[1, :] = pred_all_intrinsic[1, :] * H
+
+ np.save(Path(image_folder) / "extrinsic.npy", pred_all_extrinsic)
+ np.save(Path(image_folder) / "intrinsic.npy", pred_all_intrinsic)
+
+ intrinsic = pred_all_intrinsic
+ extrinsic = pred_all_extrinsic
+ gaussian_xyz = gaussians.means[0].detach().cpu().numpy()
+ depth = depth_dict["depth"][0][0].squeeze().cpu().numpy()
+ _save_depth_npy_and_viz(depth, image_folder, "depth")
+
+ # Asset directory.
+ assets_folder = os.path.join(image_folder, "3d_assets")
+ os.makedirs(assets_folder, exist_ok=True)
+
+ # Export the raw 3DGS reconstruction.
+ export_ply(
+ gaussians.means[0],
+ gaussians.scales[0],
+ gaussians.rotations[0],
+ gaussians.harmonics[0],
+ gaussians.opacities[0],
+ Path(assets_folder) / "bg.ply",
+ )
+
+ if not args.align_table:
+ print(
+ "Table alignment disabled (--no-align-table); "
+ "skipping bg_aligned.ply export."
+ )
+ print(f"Done. Outputs saved under: {image_folder}")
+ return
+
+ # Re-render depth from the splat point cloud (used for plane fitting).
+ depth_point = render_depth_from_points(gaussian_xyz, intrinsic, extrinsic, H, W)
+
+ mask_path = Path(image_folder) / "bg_mask.png"
+ if not mask_path.exists():
+ print(f"Warning: bg_mask.png not found, skipping table alignment: {mask_path}")
+ return
+
+ mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE).astype(np.uint8)
+ mask = shrink_mask_erode(mask, ratio=BG_MASK_SHRINK_RATIO)
+
+ result = compute_table_geometry_ransac(
+ depth=depth_point,
+ mask=mask,
+ intrinsic=intrinsic,
+ extrinsic=extrinsic,
+ )
+ print(f"\n{os.path.basename(image_folder)} table geometry:")
+ print(f" length (m): {result['length']:.3f}")
+ print(f" width (m): {result['width']:.3f}")
+ print(f" normal: {result['normal']}")
+
+ # Align the splat point cloud to the table frame.
+ points_table_world = align_points_to_table(
+ gaussian_xyz,
+ result["R_align_world"],
+ result["t_align_world"],
+ )
+ points_table_world = points_table_world - np.median(points_table_world, axis=0)
+
+ # Use a robust quantile for scale so outliers don't dominate.
+ abs_points = np.abs(points_table_world)
+ ref_range = np.quantile(abs_points, SCALE_QUANTILE)
+ scale_factor = ref_range / SCALE_TARGET_RANGE
+ points_table_world = points_table_world / scale_factor
+ gaussians.scales[0] = gaussians.scales[0] / scale_factor
+
+ np.save(Path(image_folder) / "scale.npy", scale_factor)
+ print(f" scale factor: {scale_factor:.3f}")
+
+ # Swap X/Y, flip Z, then apply optional placement offsets (default 0,0).
+ x = points_table_world[:, 0].copy()
+ y = points_table_world[:, 1].copy()
+ points_table_world[:, 0] = y
+ points_table_world[:, 1] = x
+ points_table_world[:, 2] *= -1
+ points_table_world[:, 2] += args.z_offset
+ points_table_world[:, 0] += args.x_offset
+
+ export_ply(
+ points_table_world,
+ gaussians.scales[0],
+ gaussians.rotations[0],
+ gaussians.harmonics[0],
+ gaussians.opacities[0],
+ Path(assets_folder) / "bg_aligned.ply",
+ )
+
+ print(
+ f" Z range: min={points_table_world[:, 2].min():.3f}, "
+ f"max={points_table_world[:, 2].max():.3f}"
+ )
+ print(f"Done. Outputs saved under: {image_folder}")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description=(
+ "Reconstruct a 3D Gaussian model from a single image and emit the "
+ "associated camera intrinsics/extrinsics, depth maps, and an "
+ "optional table-aligned point cloud."
+ )
+ )
+ parser.add_argument(
+ "input_dir",
+ type=str,
+ help="Input directory or single file. Directories are searched recursively for clean_background.{png,jpg}.",
+ )
+ parser.add_argument(
+ "--model-id",
+ type=str,
+ default=DEFAULT_MODEL_ID,
+ help=f"HuggingFace model id to load (default: {DEFAULT_MODEL_ID}).",
+ )
+ parser.add_argument(
+ "--align-table",
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help=(
+ "Run RANSAC table alignment and export bg_aligned.ply. "
+ "Use --no-align-table to disable (only bg.ply will be emitted). "
+ "Default: enabled."
+ ),
+ )
+ parser.add_argument(
+ "--x-offset",
+ type=float,
+ default=DEFAULT_X_OFFSET,
+ help="X-axis offset (m) applied after table alignment. Default: 0 (origin).",
+ )
+ parser.add_argument(
+ "--z-offset",
+ type=float,
+ default=DEFAULT_Z_OFFSET,
+ help="Z-axis offset (m) applied after table alignment. Default: 0 (origin).",
+ )
+
+ args = parser.parse_args()
+
+ if os.path.isfile(args.input_dir):
+ input_dir = os.path.dirname(args.input_dir)
+ else:
+ input_dir = args.input_dir
+
+ print(f"Loading model: {args.model_id}")
+ model = AnySplat.from_pretrained(args.model_id)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+ model.eval()
+ for param in model.parameters():
+ param.requires_grad = False
+ print("Model loaded.")
+
+ clean_background_files = []
+ for root, _dirs, files in os.walk(input_dir):
+ for file in files:
+ if file.lower() in ("clean_background.png", "clean_background.jpg"):
+ clean_background_files.append(os.path.join(root, file))
+
+ print(f"Found {len(clean_background_files)} clean_background images.")
+
+ for idx, image_path in enumerate(clean_background_files, 1):
+ print(f"\nProcessing {idx}/{len(clean_background_files)}: {image_path}")
+ try:
+ process_single_image(image_path, model, device, args)
+ print(f"Successfully processed: {image_path}")
+ except Exception as e:
+ print(f"Error processing {image_path}: {e}")
+ import traceback
+ traceback.print_exc()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/pipeline/mesh2mjcf.py b/pipeline/mesh2mjcf.py
new file mode 100644
index 0000000..7f1795b
--- /dev/null
+++ b/pipeline/mesh2mjcf.py
@@ -0,0 +1,674 @@
+"""Convert a single .obj or .stl mesh into MuJoCo MJCF assets.
+
+This is a generic mesh-to-MJCF converter, derived from
+github.com/discoverse-dev/DISCOVERSE/scripts/mesh2mjcf.py but stripped of any
+DISCOVERSE-specific imports or scene wiring. It is designed to consume the
+per-object meshes that this pipeline emits under `<scene_dir>/3d_assets/<name>.obj`,
+but works on any standalone mesh file.
+
+Output layout (under --output, which defaults to the input file's parent —
+typically `scene_dir/3d_assets/` when consuming the v2 pipeline outputs):
+
+    <output_dir>/
+        <asset_name>/                 (per-asset folder, named after the obj stem)
+            <asset_name>.obj          (copy of the input mesh)
+            <name>.mtl                (if multi-material)
+            <texture files>           (referenced by the MTL)
+            part_0.obj part_1.obj ... (if --convex_decomposition)
+            mjcf/
+                <asset_name>.xml
+                <asset_name>_dependencies.xml
+
+Mesh paths inside the emitted XML are written as `<asset_name>/<file>`, so the
+consuming MuJoCo scene should set `meshdir` (and `texturedir`) to `<output_dir>`.
+
+Examples:
+
+ # Basic conversion (default RGBA, mass, inertia; no free joint; no decomp).
+ python pipeline/mesh2mjcf.py path/to/cup.obj
+
+ # Specify RGBA, mass, inertia.
+ python pipeline/mesh2mjcf.py path/to/cup.obj \\
+ --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005
+
+ # Free-floating object.
+ python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint
+
+ # Convex decomposition for accurate collisions.
+ python pipeline/mesh2mjcf.py path/to/cup.obj -cd
+
+ # Preview in MuJoCo viewer after conversion.
+ python pipeline/mesh2mjcf.py path/to/cup.obj --verbose
+
+Notes:
+  - Multi-material OBJ files are auto-detected (via the MTL file) and split
+    into one sub-mesh per material; each material yields a MuJoCo
+    `<material>`, with textures (`map_Kd`) copied alongside.
+ - Convex decomposition requires `pip install coacd trimesh`.
+ - Material splitting requires `pip install trimesh`.
+"""
+
+import argparse
+import logging
+import os
+import re
+import shutil
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+
+# ===== MTL handling =====
+
+# MTL fields relevant to MuJoCo.
+MTL_FIELDS = (
+ "Ka", # Ambient color
+ "Kd", # Diffuse color
+ "Ks", # Specular color
+ "d", # Transparency (alpha)
+ "Tr", # 1 - transparency
+ "Ns", # Shininess
+ "map_Kd", # Diffuse texture map
+)
+
+
+@dataclass
+class Material:
+ """Convenience container for MTL → MuJoCo material conversion."""
+
+ name: str
+ Ka: Optional[str] = None
+ Kd: Optional[str] = None
+ Ks: Optional[str] = None
+ d: Optional[str] = None
+ Tr: Optional[str] = None
+ Ns: Optional[str] = None
+ map_Kd: Optional[str] = None
+
+ @staticmethod
+ def from_string(lines: Sequence[str]) -> "Material":
+ attrs = {"name": lines[0].split(" ")[1].strip()}
+ for line in lines[1:]:
+ for attr in MTL_FIELDS:
+ if line.startswith(attr):
+ elems = line.split(" ")[1:]
+ elems = [elem for elem in elems if elem != ""]
+ attrs[attr] = " ".join(elems)
+ break
+ return Material(**attrs)
+
+ def mjcf_rgba(self) -> str:
+ Kd = self.Kd or "1.0 1.0 1.0"
+ if self.d is not None:
+ alpha = self.d
+ elif self.Tr is not None:
+ alpha = str(1.0 - float(self.Tr))
+ else:
+ alpha = "1.0"
+ return f"{Kd} {alpha}"
+
+ def mjcf_shininess(self) -> str:
+ if self.Ns is not None:
+ # Ns values are typically 0-1000; normalize to [0, 1].
+ ns_val = float(self.Ns) / 1_000
+ else:
+ ns_val = 0.5
+ return f"{ns_val}"
+
+ def mjcf_specular(self) -> str:
+ if self.Ks is not None:
+ # Average the specular RGB to a scalar.
+ ks_val = sum(map(float, self.Ks.split(" "))) / 3
+ else:
+ ks_val = 0.5
+ return f"{ks_val}"
+
+
+def parse_mtl_name(lines: Sequence[str]) -> Optional[str]:
+ """Return the .mtl filename referenced by an OBJ file's `mtllib` directive."""
+ mtl_regex = re.compile(r"^mtllib\s+(.+?\.mtl)(?:\s*#.*)?\s*\n?$")
+ for line in lines:
+ match = mtl_regex.match(line)
+ if match is not None:
+ return match.group(1)
+ return None
+
+
+def copy_obj_with_mtl(obj_source: Path, obj_target: Path) -> None:
+ """Copy an OBJ file, plus the MTL file it references (if any)."""
+ obj_target.parent.mkdir(parents=True, exist_ok=True)
+ shutil.copy2(obj_source, obj_target)
+
+ try:
+ with open(obj_source, "r") as f:
+ lines = f.readlines()
+ for line in lines:
+ if line.strip().startswith("mtllib "):
+ mtl_filename = line.strip().split()[1]
+ mtl_source = obj_source.parent / mtl_filename
+ mtl_target = obj_target.parent / mtl_filename
+ if mtl_source.exists():
+ shutil.copy2(mtl_source, mtl_target)
+ print(f"Copied MTL file: {mtl_source} -> {mtl_target}")
+ break
+ except Exception as e:
+ print(f"Warning: failed to check/copy MTL file for {obj_source}: {e}")
+
+
+def parse_mtl_file(mtl_path: Path) -> Dict[str, Material]:
+ """Parse an MTL file into a name → Material dict."""
+ materials: Dict[str, Material] = {}
+ if not mtl_path.exists():
+ return materials
+
+ with open(mtl_path, "r", encoding="utf-8") as f:
+ lines = f.readlines()
+
+ lines = [line for line in lines if not line.startswith("#")]
+ lines = [line for line in lines if line.strip()]
+ lines = [line.strip() for line in lines]
+
+ sub_mtls: List[List[str]] = []
+ for line in lines:
+ if line.startswith("newmtl"):
+ sub_mtls.append([])
+ if sub_mtls:
+ sub_mtls[-1].append(line)
+
+ for sub_mtl in sub_mtls:
+ if sub_mtl:
+ material = Material.from_string(sub_mtl)
+ materials[material.name] = material
+
+ return materials
+
+
+def split_obj_by_materials(
+ obj_path: Path, output_dir: Path
+) -> Tuple[Dict[str, Material], List[str]]:
+ """Split a multi-material OBJ into one sub-mesh per material.
+
+ Returns (materials, submesh_files). If the OBJ has zero or one materials,
+ submesh_files is empty and the OBJ is left as a single file.
+ """
+ materials: Dict[str, Material] = {}
+ submesh_files: List[str] = []
+
+ with open(obj_path, "r", encoding="utf-8") as f:
+ obj_lines = f.readlines()
+
+ mtl_name = parse_mtl_name(obj_lines)
+ if mtl_name:
+ mtl_path = obj_path.parent / mtl_name
+ materials = parse_mtl_file(mtl_path)
+
+ if len(materials) <= 1:
+ return materials, []
+
+ try:
+ import trimesh
+ except ImportError:
+ print("Warning: trimesh not installed; cannot split multi-material OBJ.")
+ return materials, []
+
+ try:
+ mesh = trimesh.load(
+ obj_path,
+ split_object=True,
+ group_material=True,
+ process=False,
+ maintain_order=False,
+ )
+
+ if isinstance(mesh, trimesh.base.Trimesh):
+ # Single mesh after grouping; nothing to split.
+ target_file = output_dir / f"{obj_path.stem}.obj"
+ shutil.copy(obj_path, target_file)
+ return materials, []
+
+ obj_stem = obj_path.stem
+ print(f"Splitting OBJ by material: {len(mesh.geometry)} sub-meshes")
+ for i, (material_name, geom) in enumerate(mesh.geometry.items()):
+ submesh_file = f"{obj_stem}_{i}.obj"
+ submesh_path = output_dir / submesh_file
+
+ geom.visual.material.name = material_name
+ geom.export(str(submesh_path), include_texture=True, header=None)
+ submesh_files.append(submesh_file)
+ print(f" saved sub-mesh: {submesh_file} (material: {material_name})")
+
+ # trimesh sometimes emits a stray `material.mtl` next to the export.
+ temp_mtl = output_dir / "material.mtl"
+ if temp_mtl.exists():
+ temp_mtl.unlink()
+
+ return materials, submesh_files
+ except Exception as e:
+ print(f"Warning: failed to split OBJ by material: {e}")
+ return materials, []
+
+
+# ===== XML builders =====
+
+def create_asset_xml(asset_name, convex_parts=None, materials=None, submesh_files=None):
+ """Build the `` element listing meshes/materials/textures."""
+ root = ET.Element("mujocoinclude")
+ asset = ET.SubElement(root, "asset")
+
+ if materials:
+ for material_name, material in materials.items():
+ material_elem = ET.SubElement(asset, "material")
+ material_elem.set("name", f"{asset_name}_{material_name}")
+ material_elem.set("rgba", material.mjcf_rgba())
+ material_elem.set("specular", material.mjcf_specular())
+ material_elem.set("shininess", material.mjcf_shininess())
+
+ if material.map_Kd:
+ texture_elem = ET.SubElement(asset, "texture")
+ texture_elem.set("type", "2d")
+ texture_elem.set("name", f"{asset_name}_{material_name}_texture")
+ texture_elem.set("file", f"{asset_name}/{material.map_Kd}")
+
+ material_elem.set("texture", f"{asset_name}_{material_name}_texture")
+ material_elem.attrib.pop("rgba", None)
+
+ # Main mesh (only when not split by material).
+ if not submesh_files:
+ mesh_elem = ET.SubElement(asset, "mesh")
+ mesh_elem.set("name", asset_name)
+ mesh_elem.set("file", f"{asset_name}/{asset_name}.obj")
+
+ # Per-material sub-meshes.
+ if submesh_files:
+ for submesh_file in submesh_files:
+ submesh_name = submesh_file.replace(".obj", "")
+ part_mesh = ET.SubElement(asset, "mesh")
+ part_mesh.set("name", submesh_name)
+ part_mesh.set("file", f"{asset_name}/{submesh_file}")
+
+ # Convex-decomposition parts.
+ if convex_parts:
+ for i in range(convex_parts):
+ part_mesh = ET.SubElement(asset, "mesh")
+ part_mesh.set("name", f"{asset_name}_part_{i}")
+ part_mesh.set("file", f"{asset_name}/part_{i}.obj")
+
+ return root
+
+
+def create_geom_xml(
+ asset_name,
+ mass,
+ diaginertia,
+ rgba,
+ free_joint=False,
+ convex_parts=None,
+ materials=None,
+ submesh_files=None,
+ output_dir=None,
+):
+ """Build the `` element with the body's geoms + inertial."""
+ root = ET.Element("mujocoinclude")
+
+ if free_joint:
+ joint_elem = ET.SubElement(root, "joint")
+ joint_elem.set("type", "free")
+
+ inertial_elem = ET.SubElement(root, "inertial")
+ inertial_elem.set("pos", "0 0 0")
+ inertial_elem.set("mass", str(mass))
+ inertial_elem.set(
+ "diaginertia", f"{diaginertia[0]} {diaginertia[1]} {diaginertia[2]}"
+ )
+
+ if submesh_files and materials:
+ # Multi-material: one geom per sub-mesh.
+ for submesh_file in submesh_files:
+ submesh_name = submesh_file.replace(".obj", "")
+ geom_elem = ET.SubElement(root, "geom")
+ geom_elem.set("type", "mesh")
+ geom_elem.set("mesh", submesh_name)
+ geom_elem.set("class", "obj_visual")
+
+ material_assigned = False
+ submesh_path = Path(output_dir) / asset_name / submesh_file
+ if submesh_path.exists():
+ try:
+ with open(submesh_path, "r", encoding="utf-8") as f:
+ submesh_lines = f.readlines()
+ for line in submesh_lines:
+ line = line.strip()
+ if line.startswith("usemtl "):
+ mtl_name = line.split()[1]
+ geom_elem.set("material", f"{asset_name}_{mtl_name}")
+ material_assigned = True
+ break
+ except Exception as e:
+ print(f"Warning: could not read sub-mesh {submesh_path}: {e}")
+
+ if not material_assigned:
+ geom_elem.set(
+ "rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}"
+ )
+
+ elif materials and len(materials) == 1:
+ # Single material with possible texture.
+ geom_elem = ET.SubElement(root, "geom")
+ geom_elem.set("type", "mesh")
+ geom_elem.set("mesh", asset_name)
+ geom_elem.set("class", "obj_visual")
+ material_name = next(iter(materials))
+ geom_elem.set("material", f"{asset_name}_{material_name}")
+
+ elif convex_parts:
+ # Visual geom (full mesh) + collision geoms (convex parts).
+ visual_geom = ET.SubElement(root, "geom")
+ visual_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}")
+ visual_geom.set("mesh", asset_name)
+ visual_geom.set("class", "obj_visual")
+
+ for i in range(convex_parts):
+ collision_geom = ET.SubElement(root, "geom")
+ collision_geom.set("type", "mesh")
+ collision_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}")
+ collision_geom.set("mesh", f"{asset_name}_part_{i}")
+
+ else:
+ # Simple solid-colour mesh geom.
+ geom_elem = ET.SubElement(root, "geom")
+ geom_elem.set("type", "mesh")
+ geom_elem.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}")
+ geom_elem.set("mesh", asset_name)
+
+ # When a material/sub-mesh path was taken AND convex decomposition is on,
+ # still emit invisible collision geoms.
+ if convex_parts and (submesh_files or (materials and len(materials) == 1)):
+ for i in range(convex_parts):
+ collision_geom = ET.SubElement(root, "geom")
+ collision_geom.set("type", "mesh")
+ collision_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} 0")
+ collision_geom.set("mesh", f"{asset_name}_part_{i}")
+
+ return root
+
+
+def save_xml_with_formatting(root, filepath):
+ """Indent and write an ElementTree XML file (Python 3.9+)."""
+ ET.indent(root, space=" ", level=0)
+ tree = ET.ElementTree(root)
+ tree.write(filepath, encoding="utf-8", xml_declaration=False)
+
+
+def create_preview_xml(asset_name):
+ """Build a minimal preview scene for `mujoco.viewer`."""
+ root = ET.Element("mujoco")
+ root.set("model", "temp_preview_env")
+
+ option = ET.SubElement(root, "option")
+ option.set("gravity", "0 0 -9.81")
+
+ compiler = ET.SubElement(root, "compiler")
+ compiler.set("meshdir", ".")
+ compiler.set("texturedir", ".")
+
+ include = ET.SubElement(root, "include")
+ include.set("file", f"{asset_name}/mjcf/{asset_name}_dependencies.xml")
+
+ default = ET.SubElement(root, "default")
+ obj_default = ET.SubElement(default, "default")
+ obj_default.set("class", "obj_visual")
+ geom_default = ET.SubElement(obj_default, "geom")
+ geom_default.set("group", "2")
+ geom_default.set("type", "mesh")
+ geom_default.set("contype", "0")
+ geom_default.set("conaffinity", "0")
+
+ worldbody = ET.SubElement(root, "worldbody")
+
+ floor_geom = ET.SubElement(worldbody, "geom")
+ floor_geom.set("name", "floor")
+ floor_geom.set("type", "plane")
+ floor_geom.set("size", "2 2 0.1")
+ floor_geom.set("rgba", ".8 .8 .8 1")
+
+ light = ET.SubElement(worldbody, "light")
+ light.set("pos", "0 0 3")
+ light.set("dir", "0 0 -1")
+
+ body = ET.SubElement(worldbody, "body")
+ body.set("name", asset_name)
+ body.set("pos", "0 0 0.5")
+
+ body_include = ET.SubElement(body, "include")
+ body_include.set("file", f"{asset_name}/mjcf/{asset_name}.xml")
+
+ return root
+
+
+# ===== Main =====
+
+def _build_argparser():
+ parser = argparse.ArgumentParser(
+ description="Convert a .obj or .stl mesh into MuJoCo MJCF assets."
+ )
+ parser.add_argument(
+ "input_file", type=str, help="Path to the input mesh (.obj or .stl)."
+ )
+ parser.add_argument(
+ "--rgba",
+ nargs=4,
+ type=float,
+ default=[0.5, 0.5, 0.5, 1],
+ help="Mesh RGBA colour. Default: [0.5, 0.5, 0.5, 1].",
+ )
+ parser.add_argument(
+ "--mass",
+ type=float,
+ default=0.001,
+ help="Mesh mass (kg). Default: 0.001.",
+ )
+ parser.add_argument(
+ "--diaginertia",
+ nargs=3,
+ type=float,
+ default=[0.00002, 0.00002, 0.00002],
+ help="Diagonal inertia tensor. Default: [2e-5, 2e-5, 2e-5].",
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ type=str,
+ default=None,
+ help=(
+ "Output assets root. Default: the input file's parent directory, "
+ "so that `scene_dir/3d_assets/foo.obj` writes to `scene_dir/`."
+ ),
+ )
+ parser.add_argument(
+ "--free_joint",
+ action="store_true",
+ help="Add a free joint so the body can move.",
+ )
+ parser.add_argument(
+ "-cd",
+ "--convex_decomposition",
+ action="store_true",
+ help=(
+ "Decompose the mesh into convex parts for accurate collision. "
+ "Requires `coacd` and `trimesh`."
+ ),
+ )
+ parser.add_argument(
+ "--scene",
+ action="store_true",
+ help="Use high-precision CoACD config (smaller threshold).",
+ )
+ parser.add_argument(
+ "--verbose",
+ action="store_true",
+ help="Open MuJoCo viewer with a preview scene after conversion.",
+ )
+ return parser
+
+
+def main():
+ args = _build_argparser().parse_args()
+
+ input_file = args.input_file
+ rgba = args.rgba
+ mass = args.mass
+ diaginertia = args.diaginertia
+ free_joint = args.free_joint
+ convex_de = args.convex_decomposition
+ verbose = args.verbose
+
+ if args.output is None:
+ output_assets_dir = str(Path(input_file).resolve().parent)
+ else:
+ output_assets_dir = args.output
+
+ if convex_de:
+ try:
+ import coacd # noqa: F401
+ import trimesh # noqa: F401
+ except ImportError:
+ print(
+ "Error: `coacd` and `trimesh` are required for "
+ "--convex_decomposition. Install with `pip install coacd trimesh`."
+ )
+ raise SystemExit(1)
+
+ if input_file.endswith(".obj"):
+ asset_name = os.path.basename(input_file)[: -len(".obj")]
+ elif input_file.endswith(".stl"):
+ asset_name = os.path.basename(input_file)[: -len(".stl")]
+ else:
+ raise SystemExit(
+ f"Error: {input_file} is not a supported mesh type. Use .obj or .stl."
+ )
+
+ # Per-asset folder lives directly under , with an `mjcf/` subfolder
+ # for the generated XML files. This way the whole asset (meshes + MTL +
+ # textures + convex parts + MJCF) is self-contained in one directory.
+ output_dir = os.path.join(output_assets_dir, asset_name)
+ mjcf_obj_dir = os.path.join(output_dir, "mjcf")
+ if os.path.exists(output_dir):
+ shutil.rmtree(output_dir)
+ os.makedirs(output_dir)
+ os.makedirs(mjcf_obj_dir, exist_ok=True)
+
+ # Copy the mesh (and MTL if relevant) into the per-asset folder.
+ if os.path.dirname(input_file) != output_dir:
+ if input_file.endswith(".obj"):
+ copy_obj_with_mtl(
+ Path(input_file), Path(output_dir) / Path(input_file).name
+ )
+ else:
+ shutil.copy(input_file, output_dir)
+
+ # Material splitting (OBJ only).
+ materials: Dict[str, Material] = {}
+ submesh_files: List[str] = []
+ if input_file.endswith(".obj"):
+ print("Checking OBJ for multiple materials...")
+ obj_path = Path(output_dir) / f"{asset_name}.obj"
+ materials, submesh_files = split_obj_by_materials(obj_path, Path(output_dir))
+
+ # Copy referenced texture files (single or multi-material case).
+ if materials:
+ input_parent = Path(input_file).parent
+ for _name, material in materials.items():
+ if material.map_Kd:
+ texture_src = input_parent / material.map_Kd
+ if texture_src.exists():
+ texture_dst = Path(output_dir) / material.map_Kd
+ shutil.copy(texture_src, texture_dst)
+ print(f"Copied texture: {material.map_Kd}")
+
+ if submesh_files:
+ print(f"Split into {len(submesh_files)} sub-meshes.")
+ elif len(materials) == 1:
+ print("Single material; no split needed.")
+ else:
+ print("No materials detected.")
+
+ convex_parts_count = 0
+ if convex_de:
+ import coacd
+ import trimesh
+
+ print(f"Running convex decomposition on {asset_name}...")
+ mesh = trimesh.load(input_file, force="mesh")
+ mesh_coacd = coacd.Mesh(mesh.vertices, mesh.faces)
+ coacd_config_scene = {
+ "threshold": 0.01,
+ "preprocess_resolution": 100,
+ }
+ coacd_config = coacd_config_scene if args.scene else {}
+ parts = coacd.run_coacd(mesh_coacd, **coacd_config)
+
+ for i, part in enumerate(parts):
+ part_filename = f"part_{i}.obj"
+ output_part_file = os.path.join(output_dir, part_filename)
+ part_mesh = trimesh.Trimesh(vertices=part[0], faces=part[1])
+ part_mesh.export(output_part_file)
+
+ convex_parts_count = len(parts)
+ print(f"{asset_name} decomposed into {convex_parts_count} convex parts.")
+
+ # Emit the asset dependency XML.
+ asset_xml = create_asset_xml(
+ asset_name,
+ convex_parts_count if convex_de else None,
+ materials if (submesh_files or len(materials) == 1) else None,
+ submesh_files if submesh_files else None,
+ )
+ asset_file_path = os.path.join(mjcf_obj_dir, f"{asset_name}_dependencies.xml")
+ save_xml_with_formatting(asset_xml, asset_file_path)
+
+ # Emit the body geom XML.
+ geom_xml = create_geom_xml(
+ asset_name,
+ mass,
+ diaginertia,
+ rgba,
+ free_joint,
+ convex_parts_count if convex_de else None,
+ materials if (submesh_files or len(materials) == 1) else None,
+ submesh_files if submesh_files else None,
+ output_assets_dir,
+ )
+ geom_file_path = os.path.join(mjcf_obj_dir, f"{asset_name}.xml")
+ save_xml_with_formatting(geom_xml, geom_file_path)
+
+ print(f"Converted {asset_name} to MJCF.")
+ print(f" meshes: {output_dir}")
+ print(f" dependencies: {asset_file_path}")
+ print(f" body geom: {geom_file_path}")
+ if submesh_files:
+ print(
+ f" material split: {len(submesh_files)} sub-meshes, "
+ f"{len(materials)} materials"
+ )
+
+ if verbose:
+ print("\nLaunching MuJoCo viewer...")
+ py_dir = shutil.which("python") or shutil.which("python3")
+ if not py_dir:
+ print("Error: no `python`/`python3` on PATH; cannot launch viewer.")
+ raise SystemExit(1)
+
+ tmp_world_mjcf = os.path.join(output_assets_dir, "_tmp_preview.xml")
+ preview_xml = create_preview_xml(asset_name)
+ save_xml_with_formatting(preview_xml, tmp_world_mjcf)
+
+ cmd_line = f"{py_dir} -m mujoco.viewer --mjcf {tmp_world_mjcf}"
+ print(f"Running: {cmd_line}")
+ os.system(cmd_line)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/pipeline/objects_generation.py b/pipeline/objects_generation.py
new file mode 100644
index 0000000..906c8c5
--- /dev/null
+++ b/pipeline/objects_generation.py
@@ -0,0 +1,455 @@
+import os
+os.environ["PYOPENGL_PLATFORM"] = "egl"
+import argparse
+import copy
+
+import numpy as np
+import torch
+import imageio
+from PIL import Image
+from scipy.spatial.transform import Rotation as R
+
+from pipeline.utils import (
+ clean_name,
+ load_image,
+ collect_mask_paths,
+ compute_fov_from_intrinsics,
+ mesh_rendering,
+ get_default_mesh_renderer,
+)
+# `inference` is exposed by submodule/Sam-3d-objects via PYTHONPATH; see the
+# top-level shell scripts.
+from inference import (
+ Inference,
+ make_scene,
+ render_gs_view,
+)
+
+
+# Coordinate-system transform applied to SAM-3D-Objects mesh outputs to bring
+# them into the world frame this pipeline operates in (rotates +Y -> +Z, etc.).
+_SAM3D_TO_WORLD = np.array(
+ [
+ [1, 0, 0, 0],
+ [0, 0, -1, 0],
+ [0, 1, 0, 0],
+ [0, 0, 0, 1],
+ ]
+)
+
+_DEFAULT_IMAGE_SIZE = (448, 448)
+
+
+def _flip_xy(arr):
+    """Flip the sign of columns 0 (X) and 1 (Y) of `arr` in place and return `arr`."""
+    for axis in (0, 1):
+        arr[:, axis] = -arr[:, axis]
+    return arr
+
+
+def _load_depth_with_fallback(image_dir, required_depth_path):
+    """Load `depth_ori.npy` (raw AnySplat output) from `image_dir`, else `required_depth_path`."""
+    raw_depth_path = os.path.join(image_dir, "depth_ori.npy")
+    use_raw = os.path.exists(raw_depth_path)
+    chosen_path = raw_depth_path if use_raw else required_depth_path
+    return np.load(chosen_path)
+
+
+def process_single_image(image_path, inference, args):
+    """Run per-mask inference for one image and export mesh (.obj) / Gaussian (.ply) assets."""
+    image_path = os.path.abspath(image_path)
+    image_dir = os.path.dirname(image_path)
+
+    # Optional scene scale factor produced by the AnySplat stage; 1.0 when absent.
+    scale_factor_path = os.path.join(image_dir, "scale.npy")
+    if os.path.exists(scale_factor_path):
+        scale_factor = float(np.asarray(np.load(scale_factor_path)).squeeze())
+    else:
+        scale_factor = 1.0
+    print(f"Scale factor: {scale_factor}")
+
+    pil_image = load_image(image_path)
+    image_bg = np.array(pil_image)
+
+    masks_dir = os.path.join(image_dir, "masks")
+    mask_paths = collect_mask_paths(masks_dir)
+
+    assets_dir = os.path.join(image_dir, "3d_assets")
+    pt_dir = os.path.join(image_dir, "pt")
+
+    if not mask_paths:
+        print(f"Warning: No mask images found in {masks_dir}")
+        print("Creating placeholder directories and continuing...")
+        os.makedirs(assets_dir, exist_ok=True)
+        os.makedirs(pt_dir, exist_ok=True)
+        return
+
+    os.makedirs(assets_dir, exist_ok=True)
+    os.makedirs(pt_dir, exist_ok=True)
+
+    required_files = {
+        "extrinsic": os.path.join(image_dir, "extrinsic.npy"),
+        "intrinsic": os.path.join(image_dir, "intrinsic.npy"),
+        "depth": os.path.join(image_dir, "depth.npy"),
+    }
+    missing_files = [name for name, p in required_files.items() if not os.path.exists(p)]
+    if missing_files:
+        print(f"Warning: Missing required files: {missing_files}")
+        print("These files should be generated by the AnySplat pipeline first.")
+        return
+
+    extrinsics = np.load(required_files["extrinsic"])
+    intrinsics = np.load(required_files["intrinsic"])
+    depth_anysplat = _load_depth_with_fallback(image_dir, required_files["depth"])
+
+    fx_pixels = intrinsics[0, 0]
+    fy_pixels = intrinsics[1, 1]
+
+    image_size = _DEFAULT_IMAGE_SIZE
+    _, fov_y = compute_fov_from_intrinsics(fx_pixels, fy_pixels, image_size, degrees=True)
+    mesh_renderer = get_default_mesh_renderer(width=image_size[1], height=image_size[0])
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    for i, mask_path in enumerate(mask_paths):
+        print(f"\n[{i+1}/{len(mask_paths)}] Processing mask: {mask_path}")
+
+        # ===== Load and binarize mask (any non-zero pixel counts as foreground) =====
+        mask_ = np.array(Image.open(mask_path).convert("L"))
+        mask = np.where(mask_ > 0, 1, 0).astype("uint8")
+        size_ori = np.sum(mask)
+
+        depth_fg = depth_anysplat[mask > 0]
+        if len(depth_fg) == 0:
+            print("Warning: Mask has no valid depth values, skipping image.")
+            return  # NOTE(review): also abandons the remaining masks; confirm this is intended.
+        mean_depth_ori = depth_fg.mean()
+        min_depth_ori = depth_fg.min()
+        max_depth_ori = depth_fg.max()
+        print(
+            f"Depth in mask region: mean={mean_depth_ori:.4f}, "
+            f"min={min_depth_ori:.4f}, max={max_depth_ori:.4f}"
+        )
+
+        mask_stem = clean_name(os.path.splitext(os.path.basename(mask_path))[0])
+        save_path = os.path.join(pt_dir, f"{mask_stem}.pt")
+
+        # ===== Run or load inference (cache is only written when --save-pt is set) =====
+        if os.path.exists(save_path):
+            print(f"Loading cached inference result: {save_path}")
+            out = torch.load(save_path, map_location=device, weights_only=False)  # NOTE(review): full unpickle; trusted caches only.
+        else:
+            print("Running inference on mask...")
+            out = inference(image_bg, mask, seed=args.seed)
+            if args.save_pt:
+                torch.save(out, save_path)
+                print(f"Saved inference result: {save_path}")
+
+        gs_origin = copy.deepcopy(out["gs"])
+
+        # ===== Optional intermediate GS preview before mesh alignment =====
+        if args.save_intermediate:
+            single_scene = make_scene(out)
+            xyz_cv = _flip_xy(single_scene.get_xyz.clone())
+            single_scene.from_xyz(xyz_cv)
+            image_gs = render_gs_view(
+                single_scene, extrinsics=extrinsics, fov_y=fov_y / 180 * np.pi
+            )
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_1_gs.png"), image_gs
+            )
+            single_scene.save_ply(
+                os.path.join(assets_dir, f"{mask_stem}_gs_with_inferenced_pose.ply")
+            )
+        else:
+            single_scene = None
+
+        # ===== Pose parameters from SAM-3D-Objects =====
+        rotation_output = out["rotation"].cpu().numpy()
+        translation_output = out["translation"].cpu().numpy()
+        scale_output = out["scale"].squeeze(0).cpu().numpy()
+
+        print(f"Rotation (quaternion): {rotation_output}")
+        print(f"Translation: {translation_output}")
+        print(f"Scale: {scale_output}")
+
+        if not out["glb"]:
+            # No mesh produced for this object; skip to cleanup.
+            if single_scene is not None:
+                del single_scene
+            del out
+            torch.cuda.empty_cache()
+            continue
+
+        mesh = out["glb"]
+        mesh.apply_transform(_SAM3D_TO_WORLD)
+        mesh_origin = copy.deepcopy(mesh)
+
+        # The pose transform below is only used to estimate object size from the
+        # current view; the exported asset (mesh_origin) stays at the origin.
+        quat = copy.deepcopy(rotation_output)
+        rot = R.from_quat(quat, scalar_first=True).as_matrix().squeeze(0)
+        inverse_rot = np.linalg.inv(rot)
+
+        scale = np.broadcast_to(np.asarray(scale_output, dtype=float), (3,)).copy()
+        scale_mat = np.diag(scale)
+
+        transform = np.eye(4)
+        transform[:3, :3] = inverse_rot @ scale_mat
+        transform[:3, 3] = copy.deepcopy(translation_output)
+        mesh.apply_transform(transform)
+        _flip_xy(mesh.vertices)
+
+        if args.save_intermediate:
+            mesh.export(os.path.join(assets_dir, f"{mask_stem}_mesh_with_inferenced_pose.obj"))
+
+        # ===== Render to recover scale by area + depth ratio =====
+        mesh_copy = copy.deepcopy(mesh)
+        color, depth = mesh_rendering(
+            mesh=mesh_copy,
+            extrinsics=extrinsics,
+            fov_y=fov_y / 180 * np.pi,
+            renderer=mesh_renderer,
+        )
+        if args.save_intermediate:
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_1_mesh.png"), color
+            )
+
+        valid_depth = depth[depth > 0]
+        if len(valid_depth) == 0:
+            if single_scene is not None:
+                del single_scene
+            del out
+            torch.cuda.empty_cache()
+            continue
+
+        mean_depth_sam3d = np.mean(valid_depth)
+        size_new = np.sum(depth > 0)
+        scale_factor_local = (
+            np.sqrt(size_ori / size_new) * (mean_depth_ori / mean_depth_sam3d)
+        )
+        mesh.apply_scale(scale_factor_local)
+
+        mesh_copy = mesh.copy()
+        color, depth = mesh_rendering(
+            mesh=mesh_copy,
+            extrinsics=extrinsics,
+            fov_y=fov_y / 180 * np.pi,
+            renderer=mesh_renderer,
+        )
+        valid_depth = depth[depth > 0]
+        mean_depth_sam3d_2 = np.mean(valid_depth)
+        z_shift_2 = mean_depth_ori - mean_depth_sam3d_2
+        mesh.vertices = mesh.vertices + np.array([0, 0, z_shift_2])
+
+        if args.save_intermediate:
+            transformed_mesh_path = os.path.join(assets_dir, f"{mask_stem}_mesh_final.obj")
+            mesh.export(transformed_mesh_path)
+            print(f"Saved transformed mesh: {transformed_mesh_path}")
+            color, _ = mesh_rendering(
+                mesh=mesh,
+                extrinsics=extrinsics,
+                fov_y=fov_y / 180 * np.pi,
+                renderer=mesh_renderer,
+            )
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_mesh.png"), color
+            )
+
+        # ===== Final export at origin (mesh + GS) =====
+        total_scale = float(scale_factor_local * scale_output[0]) / scale_factor
+        print(
+            f"Total scaling: {total_scale:.4f} "
+            f"(local_scale={scale_factor_local:.4f}, "
+            f"object_scale={scale_output[0]:.4f}, scene_scale={scale_factor})"
+        )
+
+        mesh_origin.apply_scale(total_scale)
+        resized_mesh_path = os.path.join(assets_dir, f"{mask_stem}.obj")
+        mesh_origin.export(resized_mesh_path)
+        print(f"Saved resized mesh for mujoco: {resized_mesh_path}")
+
+        final_mesh_mean_xyz = np.mean(mesh_origin.vertices, axis=0)
+        mean_xyz_path = os.path.join(assets_dir, f"{mask_stem}_keyframe.npy")
+        np.save(mean_xyz_path, final_mesh_mean_xyz)
+        print(
+            f"Final mesh mean XYZ: "
+            f"[{final_mesh_mean_xyz[0]:.6f}, {final_mesh_mean_xyz[1]:.6f}, "
+            f"{final_mesh_mean_xyz[2]:.6f}]"
+        )
+        print(f"Saved final mesh mean XYZ to: {mean_xyz_path}")
+
+        if args.save_intermediate:
+            # Apply the same scale/shift to the GS scene so the debug snapshot matches
+            # the mesh; scales are clamped against this scene's own rescaled kernel size.
+            xyz_cv = single_scene.get_xyz.clone() * scale_factor_local
+            single_scene.from_xyz(xyz_cv)
+
+            scale_t = single_scene.get_scaling * scale_factor_local
+            single_scene.mininum_kernel_size *= scale_factor_local
+            scale_t = torch.maximum(
+                scale_t,
+                torch.tensor(
+                    single_scene.mininum_kernel_size * 1.1,
+                    device=scale_t.device,
+                    dtype=scale_t.dtype,
+                ),
+            )
+            single_scene.from_scaling(scale_t)
+
+            xyz_cv = single_scene.get_xyz.clone()
+            xyz_cv[:, 2] = xyz_cv[:, 2] + z_shift_2
+            single_scene.from_xyz(xyz_cv)
+
+            single_ply_path = os.path.join(assets_dir, f"{mask_stem}_gs_final.ply")
+            single_scene.save_ply(single_ply_path)
+            print(f"Saved transformed Gaussian: {single_ply_path}")
+            image_gs = render_gs_view(
+                single_scene, extrinsics=extrinsics, fov_y=fov_y / 180 * np.pi
+            )
+            imageio.imwrite(
+                os.path.join(image_dir, f"{mask_stem}_gs.png"), image_gs
+            )
+
+        # Scale the original GS to mujoco units and save.
+        xyz = gs_origin.get_xyz * total_scale
+        gs_origin.from_xyz(xyz)
+
+        scale_t = gs_origin.get_scaling * total_scale
+        gs_origin.mininum_kernel_size *= total_scale
+        scale_t = torch.maximum(
+            scale_t,
+            torch.tensor(
+                gs_origin.mininum_kernel_size * 1.1,
+                device=scale_t.device,
+                dtype=scale_t.dtype,
+            ),
+        )
+        gs_origin.from_scaling(scale_t)
+
+        origin_ply_path = os.path.join(assets_dir, f"{mask_stem}.ply")
+        gs_origin.save_ply(origin_ply_path)
+        print(f"Saved resized Gaussian for mujoco: {origin_ply_path}")
+
+        if single_scene is not None:
+            del single_scene
+        del out
+        torch.cuda.empty_cache()
+
+        print(f"Completed processing mask: {mask_stem}")
+
+    print(f"\nAll masks processed for image: {image_path}")
+
+
+def main():
+    """Parse CLI options, load the model once, and process every matching image under --input-dir."""
+    import traceback
+
+    parser = argparse.ArgumentParser(
+        description=(
+            "Run SAM-3D-Objects multi-object inference, save outputs to .pt, "
+            "and reconstruct per-object Gaussian (.ply) and mesh (.obj) assets."
+        )
+    )
+    parser.add_argument(
+        "--project-root",
+        default="submodule/Sam-3d-objects",
+        type=str,
+        help="Root directory of the sam-3d-objects project.",
+    )
+    parser.add_argument(
+        "--input-dir",
+        required=True,
+        type=str,
+        help="Input directory containing image folders.",
+    )
+    parser.add_argument(
+        "--image-name",
+        default="input_image.png",
+        type=str,
+        help="Name of the image file to process in each folder.",
+    )
+    tag_help = (
+        "Checkpoint tag, corresponds to "
+        "submodule/Sam-3d-objects/checkpoints/{tag}/pipeline.yaml"
+    )
+    parser.add_argument("--tag", type=str, default="hf", help=tag_help)
+    parser.add_argument(
+        "--seed",
+        default=42,
+        type=int,
+        help="Random seed passed into Inference.__call__.",
+    )
+    # Boolean flags.
+    parser.add_argument(
+        "--save-pt",
+        action="store_true",
+        help="Save inference results to pt/*.pt. By default results are not saved.",
+    )
+    parser.add_argument(
+        "--save-intermediate",
+        action="store_true",
+        help="Save intermediate debug outputs (e.g. *_1_gs.png, *_1_mesh.png).",
+    )
+
+    args = parser.parse_args()
+
+    # A file path is accepted for --input-dir too; its parent directory is searched.
+    input_dir = os.path.abspath(args.input_dir)
+    if os.path.isfile(args.input_dir):
+        input_dir = os.path.dirname(input_dir)
+    project_root = os.path.abspath(args.project_root)
+
+    print(f"Project root (Sam-3d-objects): {project_root}")
+    print(f"Input directory: {input_dir}")
+    print(f"Looking for image files named: {args.image_name}")
+
+    config_path = os.path.join(project_root, "checkpoints", args.tag, "pipeline.yaml")
+    print(f"Loading model from config: {config_path}")
+    inference = Inference(config_path, compile=False)
+    print("Model loaded successfully")
+
+    image_files = [
+        os.path.join(dirpath, fname)
+        for dirpath, _subdirs, fnames in os.walk(input_dir)
+        for fname in fnames
+        if fname == args.image_name
+    ]
+    total = len(image_files)
+    print(f"Found {total} image files to process")
+
+    if total == 0:
+        print(f"No {args.image_name} files found in {input_dir}")
+        print("Directory structure:")
+        image_suffixes = (".png", ".jpg", ".jpeg")
+        for dirpath, _subdirs, fnames in os.walk(input_dir):
+            depth = dirpath.replace(input_dir, "").count(os.sep)
+            print(f"{' ' * 2 * depth}{os.path.basename(dirpath)}/")
+            child_indent = " " * 2 * (depth + 1)
+            for fname in fnames:
+                if fname.lower().endswith(image_suffixes):
+                    print(f"{child_indent}{fname}")
+        return
+
+    banner = "=" * 80
+    for idx, image_path in enumerate(image_files, 1):
+        print(f"\n{banner}")
+        print(f"Processing image {idx}/{total}")
+        print(f"Image path: {image_path}")
+        print(f"{banner}")
+
+        try:
+            process_single_image(image_path, inference, args)
+            print(f"Successfully processed: {image_path}")
+        except Exception as exc:
+            print(f"Error processing {image_path}: {exc}")
+            traceback.print_exc()
+
+    print(f"\n{banner}")
+    print(f"All processing completed. Processed {total} images.")
+    print(f"{banner}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/pipeline/reconstruct_from_pt.py b/pipeline/reconstruct_from_pt.py
deleted file mode 100644
index 929e426..0000000
--- a/pipeline/reconstruct_from_pt.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import os
-import glob
-import argparse
-
-import torch
-
-from inference import (
- make_scene,
- ready_gaussian_for_video_rendering,
- render_video,
- interactive_visualizer,
-)
-
-
-def main():
- parser = argparse.ArgumentParser(
- description="Load saved *.pt and reconstruct single & multi-object Gaussian .ply"
- )
- parser.add_argument(
- "--project-root",
- type=str,
- default="sam-3d-objects",
- help="Root directory of sam-3d-objects project.",
- )
- parser.add_argument(
- "--save-dir",
- type=str,
- default="sam-3d-objects/torch_save_pt",
- help="Directory containing *.pt files.",
- )
- parser.add_argument(
- "--image-path",
- type=str,
- default="sam3/assets/img.jpg",
- help="Original image path (used only to derive IMAGE_NAME).",
- )
- parser.add_argument(
- "--export-gif",
- action="store_true",
- help="If set, render GIFs for each object and the merged scene.",
- )
- args = parser.parse_args()
-
- project_root = args.project_root
- image_path = args.image_path
- image_name = os.path.basename(os.path.dirname(image_path))
-
- # 这里不再限定 object_*.pt,而是把 save-dir 下所有 .pt 都吃掉
- paths = sorted(glob.glob(os.path.join(args.save_dir, "*.pt")))
- if not paths:
- raise RuntimeError(f"No .pt found under {args.save_dir}")
-
- print(f"Found {len(paths)} .pt files:")
- for p in paths:
- print(" ", p)
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # 单物体输出目录
- single_gauss_dir = os.path.join(project_root, "gaussians", "single")
- os.makedirs(single_gauss_dir, exist_ok=True)
-
- # 合并场景要用到的 outputs
- outputs = []
-
- if args.export_gif:
- import imageio
-
- # =========================
- # 1️⃣ 遍历每个 .pt:导出单物体 PLY (+ 可选 GIF)
- # =========================
- for idx, p in enumerate(paths):
- print(f"[{idx+1}/{len(paths)}] loading {p}")
- out = torch.load(p, map_location=device)
- # 输出out 的dict键
- print(f" Output keys: {list(out.keys())}")
-
- outputs.append(out)
-
- # 只用 make_scene,不做 ready_gaussian_for_video_rendering
- single_scene = make_scene(out)
-
- stem = os.path.splitext(os.path.basename(p))[0]
- single_ply_path = os.path.join(single_gauss_dir, f"{stem}.ply")
- single_scene.save_ply(single_ply_path)
- print(f"🟢 Saved single-object PLY: {single_ply_path}")
-
- if args.export_gif:
- video = render_video(
- single_scene,
- r=1,
- fov=60,
- resolution=512,
- )["color"]
-
- single_gif_path = os.path.join(single_gauss_dir, f"{stem}.gif")
- imageio.mimsave(
- single_gif_path,
- video,
- format="GIF",
- duration=1000 / 30, # 30fps
- loop=0,
- )
- print(f"🎞️ Saved single-object GIF: {single_gif_path}")
-
- # 如果显存很紧张,可以在这里 del single_scene / video 等
- del single_scene
-
- print("✅ All single-object scenes exported.")
-
- # =========================
- # 2️⃣ 合并多对象场景:PLY (+ 可选 GIF)
- # =========================
- scene_gs = make_scene(*outputs)
- scene_gs = ready_gaussian_for_video_rendering(scene_gs)
-
- gauss_dir = os.path.join(project_root, "gaussians", "multi")
- os.makedirs(gauss_dir, exist_ok=True)
-
- ply_path = os.path.join(gauss_dir, f"{image_name}.ply")
- scene_gs.save_ply(ply_path)
- print(f"✅ Saved merged PLY: {ply_path}")
-
- if args.export_gif:
- video = render_video(
- scene_gs,
- r=1,
- fov=60,
- resolution=512,
- )["color"]
-
- gif_path = os.path.join(gauss_dir, f"{image_name}.gif")
- imageio.mimsave(
- gif_path,
- video,
- format="GIF",
- duration=1000 / 30, # 30fps
- loop=0,
- )
- print(f"✅ Saved merged GIF: {gif_path}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/pipeline/run_sam3_agent_full.py b/pipeline/run_sam3_agent_full.py
deleted file mode 100644
index c6b0290..0000000
--- a/pipeline/run_sam3_agent_full.py
+++ /dev/null
@@ -1,456 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-prompt + image -> SAM3 多物体分割 mask 的完整脚本:
-
-1. 用 Qwen3-VL-8B-Thinking 看图,生成若干条英文物体描述 prompt_list
-2. 对每条 prompt 调用 SAM3 分割:
- - 输出到 agent_output_multi/obj_i/*.json
- - json 里包含 pred_masks(RLE)、overlay 图路径等
-3. 将所有 obj_i/*.json 里的 pred_masks 解码为 PNG 二值 mask:
- - 保存到 agent_output_multi/masks/obj_i//mask_k.png
-
-之后,你的 run_sam3d_multi.py 里的 --mask-root
-可以直接指向 agent_output_multi/masks。
-"""
-
-import os
-import ast
-import json
-import argparse
-from functools import partial
-from typing import Optional
-
-import numpy as np
-import torch
-from PIL import Image
-import pycocotools.mask as mask_util
-
-import sam3
-from sam3 import build_sam3_image_model
-from sam3.model.sam3_image_processor import Sam3Processor
-from sam3.agent.client_llm import send_generate_request as send_generate_request_orig
-from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig
-
-
-# =========================
-# 0. 环境变量(可按需精简)
-# =========================
-
-
-
-# =========================
-# 1. LLM 配置(Qwen3-VL)
-# =========================
-
-LLM_CONFIGS = {
- # vLLM-served models
- "qwen3_vl_8b_thinking": {
- "provider": "vllm",
- # model 不再写死,在 build_llm_config 时通过参数传入
- "model": None,
- },
-}
-
-
-def build_llm_config(
- name: str = "qwen3_vl_8b_thinking",
- model_id: Optional[str] = None,
-):
- """
- 构建 LLM config:
- - name: 在 LLM_CONFIGS 里的 key
- - model_id: 要发给 vLLM 的模型名称(需与 --served-model-name 一致)
- """
- cfg = LLM_CONFIGS[name].copy()
- cfg["name"] = name
- cfg["api_key"] = "LOCAL_VLLM"
-
- if model_id is not None:
- cfg["model"] = model_id
- elif cfg.get("model") is None:
- raise ValueError(
- "LLM model id is not set. Please pass --llm-model-id to match vLLM --served-model-name."
- )
-
- if cfg["provider"] == "vllm":
- server_url = "http://127.0.0.1:8001/v1"
- else:
- server_url = cfg["base_url"]
-
- return cfg, server_url
-
-
-# =========================
-# 2. SAM3 模型构建
-# =========================
-
-def build_sam3_processor() -> Sam3Processor:
- sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..")
- bpe_path = f"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz"
- model = build_sam3_image_model(bpe_path=bpe_path)
- processor = Sam3Processor(model, confidence_threshold=0.5)
- return processor
-
-
-# =========================
-# 3. Qwen 生成场景 prompt_list
-# =========================
-
-def generate_scene_prompts_with_qwen(
- image_path: str,
- send_generate_request,
- llm_config: dict,
- max_prompts: int = 12,
- system_prompt_path: str = "examples/system_prompt_scene_prompts.txt",
-):
- """
- 1. 调 Qwen3-VL-8B-Thinking,看图生成可分割对象的英文短 prompt 列表。
- 2. 更鲁棒地解析 ...[...]... ,在缺少 closing tag 时也能工作。
- 3. 自动清洗掉 等无效内容。
- """
-
- # 1) 读取 system prompt
- if not os.path.exists(system_prompt_path):
- raise FileNotFoundError(f"system prompt file not found: {system_prompt_path}")
-
- with open(system_prompt_path, "r", encoding="utf-8") as f:
- system_prompt = f.read().strip()
-
- # 2) 构造 messages(带 image_url)
- image_path = os.path.abspath(image_path)
- image_url = f"file://{image_path}"
-
- messages = [
- {"role": "system", "content": system_prompt},
- {
- "role": "user",
- "content": [
- {
- "type": "text",
- "text": (
- "You are given the image above. "
- "Follow the instructions in the system prompt to analyze the scene, "
- "then output both ... and ... . "
- "Do NOT omit the block. The block must be a valid Python list of strings."
- ),
- },
- {"type": "image_url", "image_url": {"url": image_url}},
- ],
- },
- ]
-
- # 3) 调用 vLLM / Qwen
- resp = send_generate_request(messages=messages)
-
- # 4) 统一拿到 raw_text
- if isinstance(resp, str):
- raw_text = resp
- elif isinstance(resp, dict):
- try:
- raw_text = resp["choices"][0]["message"]["content"]
- except Exception:
- raw_text = str(resp)
- else:
- try:
- raw_text = resp.choices[0].message.content
- except Exception:
- raw_text = str(resp)
-
- raw_text = raw_text.strip()
-
- # ---------------------------
- # 5) 尝试从 中抽取“[...]”这段
- # ---------------------------
- list_block = raw_text
-
- # 先截掉 前面的分析内容
- if "" in raw_text:
- after_tag = raw_text.split("", 1)[1]
- list_block = after_tag
- # 如果有 closing tag,再截掉后面
- if " " in list_block:
- list_block = list_block.split(" ", 1)[0]
-
- # 从 list_block 中找第一个 '[' 和最后一个 ']',尽量拿到一个完整的 Python list 字符串
- inner = None
- start = list_block.find("[")
- end = list_block.rfind("]")
- if start != -1 and end != -1 and end > start:
- inner = list_block[start : end + 1].strip()
-
- # 如果还是没拿到,就 fallback:把整个 list_block 当作 inner
- if inner is None:
- inner = list_block.strip()
-
- # ---------------------------
- # 6) 解析 inner -> Python list[str]
- # ---------------------------
- prompt_list: list[str] = []
-
- # 优先 literal_eval
- try:
- data = ast.literal_eval(inner)
- if isinstance(data, list):
- prompt_list = [
- s.strip()
- for s in data
- if isinstance(s, str) and s.strip()
- ]
- else:
- raise ValueError("parsed object is not a list")
- except Exception:
- # fallback:行级解析(更严格一点,只收“看起来像短 prompt”的行)
- lines = [l.strip() for l in inner.splitlines() if l.strip()]
- tmp: list[str] = []
- for l in lines:
- # 跳过明显是 tag 或分析段落的行
- if l.startswith("<") and l.endswith(">"):
- continue
- if l in ("", " "):
- continue
-
- # 如果是形如 1. xxx / 2) xxx
- if l[0].isdigit():
- parts = l.split(maxsplit=1)
- if len(parts) == 2:
- candidate = parts[1].lstrip(".)").strip()
- else:
- candidate = l
- else:
- candidate = l
-
- # 简单过滤掉过长的整段分析(比如一个大段落 > 200 字符)
- if len(candidate) > 200:
- continue
-
- if candidate:
- tmp.append(candidate)
-
- prompt_list = tmp
-
- # ---------------------------
- # 7) 最后再清洗一遍 prompt_list
- # ---------------------------
- cleaned: list[str] = []
- for s in prompt_list:
- s = s.strip()
- if not s:
- continue
- # 丢掉残余的 tag / think
- if s.startswith("<") and s.endswith(">"):
- continue
- if s in ("", " "):
- continue
- cleaned.append(s)
-
- prompt_list = cleaned[:max_prompts]
- return raw_text, prompt_list
-
-
-# =========================
-# 4. JSON → PNG mask 工具
-# =========================
-
-def safe_name(name: str) -> str:
- """简单处理一下名字中的空格,避免路径问题。"""
- return name.replace(" ", "_")
-
-
-def decode_rle_mask(counts: str, h: int, w: int) -> np.ndarray:
- """将 SAM3/COCO RLE 字符串解码为 (h, w) 的 0/1 uint8 mask。"""
- rle = {"counts": counts.encode("utf-8"), "size": [h, w]}
- mask = mask_util.decode(rle) # (h, w, 1) 或 (h, w)
- if mask.ndim == 3:
- mask = mask[:, :, 0]
- return mask.astype(np.uint8)
-
-
-def convert_agent_json_to_masks(agent_root: str):
- """
- 遍历 agent_root 下的 obj_*/ 目录,
- 把所有 json 里的 pred_masks 解码为 PNG mask。
-
- 输出结构:
- agent_root/masks/obj_i//mask_k.png
- """
- agent_root = os.path.abspath(agent_root)
- mask_root = os.path.join(agent_root, "masks")
- os.makedirs(mask_root, exist_ok=True)
-
- print(f"[INFO] Converting JSON → PNG masks under: {agent_root}")
- print(f"[INFO] Masks will be saved to: {mask_root}")
-
- for obj_name in os.listdir(agent_root):
- obj_dir = os.path.join(agent_root, obj_name)
- if not os.path.isdir(obj_dir):
- continue
- if os.path.abspath(obj_dir) == os.path.abspath(mask_root):
- continue
-
- safe_obj_name = safe_name(obj_name)
- obj_mask_root = os.path.join(mask_root, safe_obj_name)
- os.makedirs(obj_mask_root, exist_ok=True)
-
- print(f"\n=== Scanning folder: {obj_dir} → {obj_mask_root} ===")
-
- for root, _, files in os.walk(obj_dir):
- for fname in files:
- if not fname.endswith(".json"):
- continue
-
- json_path = os.path.join(root, fname)
-
- try:
- with open(json_path, "r") as f:
- data = json.load(f)
- except Exception as e:
- print(f" [SKIP] Failed to load {json_path}: {e}")
- continue
-
- # 某些是 list(history log),直接跳过
- if not isinstance(data, dict):
- print(f" [SKIP] {json_path}: json is list, not mask dict")
- continue
-
- pred_masks = data.get("pred_masks")
- if not pred_masks:
- print(f" [SKIP] {json_path}: no pred_masks")
- continue
-
- h = data.get("orig_img_h")
- w = data.get("orig_img_w")
- if h is None or w is None:
- print(f" [SKIP] {json_path}: missing height/width")
- continue
-
- json_basename = os.path.splitext(os.path.basename(json_path))[0]
- safe_json_basename = safe_name(json_basename)
-
- out_dir = os.path.join(obj_mask_root, safe_json_basename)
- os.makedirs(out_dir, exist_ok=True)
-
- print(f" [OK] {json_path}: {len(pred_masks)} masks → {out_dir}")
-
- scores = data.get("pred_scores", [])
- for i, counts in enumerate(pred_masks):
- mask = decode_rle_mask(counts, h, w)
-
- mask_save_path = os.path.join(out_dir, f"mask_{i+1}.png")
- Image.fromarray(mask * 255).save(mask_save_path)
-
- score_str = f", score={scores[i]:.3f}" if i < len(scores) else ""
- print(f" saved mask_{i+1}.png{score_str}")
-
-
-# =========================
-# 5. 主流程:prompt + img -> mask
-# =========================
-
-def main():
- parser = argparse.ArgumentParser(
- description="Qwen3-VL + SAM3: prompt+image -> multi-object masks"
- )
- parser.add_argument(
- "--image-path",
- type=str,
- default="sam3/assets/img.jpg",
- help="输入图片路径",
- )
- parser.add_argument(
- "--output-root",
- type=str,
- default="sam3/agent_output_multi",
- help="SAM3 多物体输出根目录(内部会建 obj_1, obj_2, ...)",
- )
- parser.add_argument(
- "--system-prompt-path",
- type=str,
- default="sam3/examples/system_prompt_scene_prompts.txt",
- help="Qwen 用的 system prompt 文本路径",
- )
- parser.add_argument(
- "--max-prompts",
- type=int,
- default=12,
- help="最多保留多少个物体 prompt",
- )
- parser.add_argument(
- "--skip-first",
- action="store_true",
- help="是否丢弃 prompt_list 的第一个元素(如果它更像场景描述而不是具体物体)",
- )
- parser.add_argument(
- "--llm-model-id",
- type=str,
- default="sam3/models",
- help="发送给 LLM 服务的模型名称(需与 vLLM --served-model-name 一致)",
- )
-
- args = parser.parse_args()
-
-
- # 构建 LLM & SAM3
- llm_config, llm_server_url = build_llm_config(
- name="qwen3_vl_8b_thinking",
- model_id=args.llm_model_id,
- )
- processor = build_sam3_processor()
-
- send_generate_request = partial(
- send_generate_request_orig,
- server_url=llm_server_url,
- model=llm_config["model"],
- api_key=llm_config["api_key"],
- )
- call_sam_service = partial(call_sam_service_orig, sam3_processor=processor)
-
- image = os.path.abspath(args.image_path)
- output_root = os.path.abspath(args.output_root)
- os.makedirs(output_root, exist_ok=True)
-
- # 1) Qwen 生成场景 prompt_list
- print(f"[INFO] Generating prompts for image: {image}")
- raw_text, prompt_list = generate_scene_prompts_with_qwen(
- image_path=image,
- send_generate_request=send_generate_request,
- llm_config=llm_config,
- max_prompts=args.max_prompts,
- system_prompt_path=args.system_prompt_path,
- )
-
- print("\n====== 原始 Qwen 输出(raw_text,截断开头 800 字) ======")
- print(raw_text[:800])
- print("......\n")
-
- if args.skip_first and len(prompt_list) > 1:
- prompt_list = prompt_list[1:]
-
- print("====== 解析后的 prompt_list ======")
- for i, p in enumerate(prompt_list, start=1):
- print(f"{i}. {p}")
-
- # 2) 逐个 prompt 调用 SAM3,写入 json
- for i, prompt in enumerate(prompt_list, start=1):
- print(f"\n================ [Prompt {i}] {prompt} ================\n")
-
- this_output_dir = os.path.join(output_root, f"obj_{i}")
- os.makedirs(this_output_dir, exist_ok=True)
-
- json_path = call_sam_service(
- image_path=image,
- text_prompt=prompt,
- output_folder_path=this_output_dir,
- )
- print(f"[OK] SAM3 output json: {json_path}")
-
- # 3) 把所有 json 里的 pred_masks 解码为 PNG mask
- convert_agent_json_to_masks(output_root)
-
- print("\n✅ All done. Masks are under:")
- print(f" {os.path.join(output_root, 'masks')}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/pipeline/run_sam3d_multi.py b/pipeline/run_sam3d_multi.py
deleted file mode 100644
index 4ea9f00..0000000
--- a/pipeline/run_sam3d_multi.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import os
-import argparse
-
-import numpy as np
-import torch
-from PIL import Image
-
-from inference import Inference
-
-import re
-
-def clean_name(x: str):
- return re.sub(r'[^0-9a-zA-Z_]', '', x)
-
-
-
-
-def load_image(path: str) -> Image.Image:
- img = Image.open(path).convert("RGB")
- return img
-
-
-def collect_mask_paths(mask_root: str):
- """
- 递归收集 mask_root 下所有 png/jpg/jpeg 的路径。
- """
- all_mask_paths = []
- for root, _, files in os.walk(mask_root):
- for f in files:
- lf = f.lower()
- if lf.endswith(".png") or lf.endswith(".jpg") or lf.endswith(".jpeg"):
- all_mask_paths.append(os.path.join(root, f))
-
- all_mask_paths.sort()
- print(f"Found {len(all_mask_paths)} mask files under {mask_root}")
- return all_mask_paths
-
-
-def load_binary_mask(path: str):
- """
- 单个 mask 文件 → 二值 uint8 数组 (H, W), {0, 1}
- """
- m = np.array(Image.open(path).convert("L"))
- m = (m > 128).astype("uint8")
- return m
-
-
-def main():
- parser = argparse.ArgumentParser(
- description="Run SAM3D multi-object inference and save outputs to .pt"
- )
- parser.add_argument(
- "--image-path",
- type=str,
- default="sam3/assets/img.jpg",
- help="Input image path to lift to 3D.",
- )
- parser.add_argument(
- "--mask-root",
- type=str,
- default="sam3/agent_output_multi/masks",
- help="Directory containing mask PNG/JPGs.",
- )
- parser.add_argument(
- "--save-dir",
- type=str,
- default="sam-3d-objects/torch_save_pt",
- help="Where to save _.pt files.",
- )
- parser.add_argument(
- "--tag",
- type=str,
- default="hf",
- help="Checkpoint tag, corresponds to ../sam-3d-objects/checkpoints/{tag}/pipeline.yaml",
- )
- parser.add_argument(
- "--seed",
- type=int,
- default=42,
- help="Random seed passed into Inference.__call__.",
- )
- parser.add_argument(
- "--project-root",
- type=str,
- default=None,
- help=(
- "Root directory of sam-3d-objects repo. "
- "If not set, will be inferred as /../sam-3d-objects."
- ),
- )
- args = parser.parse_args()
-
-
-
- script_dir = os.path.dirname(os.path.abspath(__file__))
-
- if args.project_root is not None:
- # 如果用户通过命令行显式传入了 --project-root,就直接用它
- project_root = os.path.abspath(args.project_root)
- else:
- # 否则自动推断:假设当前脚本位于 sam3d_gs/pipeline/ 下,
- # sam-3-objects 位于 sam3d_gs/sam-3-objects
- project_root = os.path.abspath(os.path.join(script_dir, "..", "sam-3-objects"))
-
- print(f"Project root (sam-3-objects): {project_root}")
-
- config_path = os.path.join(project_root, "checkpoints", args.tag, "pipeline.yaml")
- print(f"Using config: {config_path}")
- inference = Inference(config_path, compile=False)
-
- pil_image = load_image(args.image_path)
- image = np.array(pil_image)
-
- mask_paths = collect_mask_paths(args.mask_root)
- if not mask_paths:
- raise RuntimeError(f"No mask images found under {args.mask_root}")
-
- os.makedirs(args.save_dir, exist_ok=True)
-
- for i, mask_path in enumerate(mask_paths):
- print(f"[{i+1}/{len(mask_paths)}] running inference on mask: {mask_path}")
-
- mask = load_binary_mask(mask_path)
-
- out = inference(image, mask, seed=args.seed)
-
- # 构造保存名字:父目录名 + "_" + mask 文件名(无扩展)
- parent_name_raw = os.path.basename(os.path.dirname(mask_path))
- parent_name = clean_name(parent_name_raw)
- mask_stem_raw = os.path.splitext(os.path.basename(mask_path))[0]
- mask_stem = clean_name(mask_stem_raw)
- save_name = f"{parent_name}_{mask_stem}.pt"
- save_path = os.path.join(args.save_dir, save_name)
-
- torch.save(out, save_path)
- print(f"✅ Saved: {save_path}")
-
- # 显式释放显存
- del out
- torch.cuda.empty_cache()
-
- print("✅ All objects processed and saved as .pt")
-
-
-if __name__ == "__main__":
- main()
diff --git a/pipeline/utils.py b/pipeline/utils.py
new file mode 100644
index 0000000..bf4c986
--- /dev/null
+++ b/pipeline/utils.py
@@ -0,0 +1,200 @@
+import re
+import os
+import atexit
+os.environ["PYOPENGL_PLATFORM"] = "egl"
+from PIL import Image
+import trimesh
+import pyrender
+import numpy as np
+import imageio
+
+
+_DEFAULT_MESH_RENDERERS = {}
+
+
+class MeshRenderContext:
+    """Offscreen pyrender setup that can be reused across many renders.
+
+    Owns an OffscreenRenderer, one grey non-metallic material and the matrix
+    that is right-multiplied onto caller extrinsics to form the camera pose.
+    Works as a context manager: leaving the ``with`` block calls close().
+    """
+
+    def __init__(
+        self,
+        width=448,
+        height=448,
+        add_axis=False,
+        debug_depth_path=None,
+        verbose=False,
+    ):
+        """Record the options and create the offscreen renderer.
+
+        width / height size the viewport. add_axis adds a trimesh axis marker
+        to every scene. debug_depth_path, when truthy, is overwritten with an
+        8-bit normalised depth image on each render. verbose prints stats.
+        """
+        self.width, self.height = width, height
+        self.add_axis = add_axis
+        self.debug_depth_path = debug_depth_path
+        self.verbose = verbose
+        self.renderer = pyrender.OffscreenRenderer(width, height)
+        self.material = pyrender.MetallicRoughnessMaterial(
+            baseColorFactor=[0.7, 0.7, 0.7, 1.0],
+            metallicFactor=0.0,
+            roughnessFactor=1.0,
+        )
+        # Identity with the Y and Z axes negated, stored as float32.
+        self.cv_to_gl = np.diag([1, -1, -1, 1]).astype(np.float32)
+
+    def close(self):
+        """Delete the offscreen renderer if one is held; repeat calls no-op."""
+        if self.renderer is None:
+            return
+        self.renderer.delete()
+        self.renderer = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def render(self, mesh, extrinsics, fov_y):
+        """Render mesh from one camera and return (color, depth).
+
+        The camera pose is ``extrinsics @ self.cv_to_gl``; the directional
+        light shares that pose. fov_y is forwarded unchanged as the camera's
+        yfov. A renderer released by close() is re-created on demand.
+        """
+        if self.renderer is None:
+            self.renderer = pyrender.OffscreenRenderer(self.width, self.height)
+        if self.verbose:
+            print(
+                f"vertices shape {mesh.vertices.shape} "
+                f"mesh vertices mean {np.mean(mesh.vertices, axis=0)}"
+            )
+        mesh_node = pyrender.Mesh.from_trimesh(
+            mesh, material=self.material, smooth=False
+        )
+        scene = pyrender.Scene()
+        scene.add(mesh_node)
+        view = pyrender.PerspectiveCamera(
+            yfov=fov_y, aspectRatio=self.width / self.height
+        )
+        pose = extrinsics @ self.cv_to_gl
+        scene.add(view, pose=pose)
+        if self.add_axis:
+            marker = trimesh.creation.axis(axis_length=0.5)
+            scene.add(pyrender.Mesh.from_trimesh(marker, smooth=False))
+        headlight = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0)
+        scene.add(headlight, pose=pose)
+
+        color, depth = self.renderer.render(scene)
+        if self.debug_depth_path:
+            # Stretch depth to 0..255; a constant depth map becomes all zeros.
+            lowest = depth.min()
+            span = depth.max() - lowest
+            if span > 0:
+                depth_u8 = ((depth - lowest) / span * 255).astype(np.uint8)
+            else:
+                depth_u8 = np.zeros_like(depth, dtype=np.uint8)
+            imageio.imwrite(self.debug_depth_path, depth_u8)
+        if self.verbose:
+            # Only pixels with depth > 0 count towards the "valid" mean.
+            hits = depth[depth > 0]
+            hit_mean = hits.mean() if hits.size > 0 else np.nan
+            print(
+                f"max depth {depth.max()}, min depth {depth.min()}, "
+                f"mean depth {depth.mean()}, valid mean depth {hit_mean}"
+            )
+        return color, depth
+
+
+def get_default_mesh_renderer(
+    width=448,
+    height=448,
+    add_axis=False,
+    debug_depth_path=None,
+    verbose=False,
+):
+    """Return the module-level MeshRenderContext cached for these options.
+
+    One context is created per distinct (width, height, add_axis,
+    debug_depth_path, verbose) combination and reused on later calls.
+    """
+    options = dict(width=width, height=height, add_axis=add_axis,
+                   debug_depth_path=debug_depth_path, verbose=verbose)
+    cache_key = tuple(options.values())
+    cached = _DEFAULT_MESH_RENDERERS.get(cache_key)
+    if cached is None:
+        cached = _DEFAULT_MESH_RENDERERS[cache_key] = MeshRenderContext(**options)
+    return cached
+
+
+@atexit.register
+def close_default_mesh_renderers():
+    """Close every cached default renderer, then empty the cache."""
+    # The decorator registers this function to run at interpreter exit.
+    for context in _DEFAULT_MESH_RENDERERS.values():
+        context.close()
+    _DEFAULT_MESH_RENDERERS.clear()
+
+
+def clean_name(x: str):  # drop everything but ASCII letters, digits, "_" and "-"
+    return re.compile(r'[^0-9a-zA-Z_-]').sub('', x)
+
+
+def load_image(path: str) -> Image.Image:
+    """Open the image at path and return it converted to RGB mode."""
+    return Image.open(path).convert("RGB")
+
+
+def collect_mask_paths(mask_root: str):
+    """Recursively collect all .png / .jpg / .jpeg paths under mask_root, sorted."""
+    suffixes = (".png", ".jpg", ".jpeg")  # matched against the lower-cased name
+    found = sorted(
+        os.path.join(folder, name)
+        for folder, _, names in os.walk(mask_root)
+        for name in names
+        if name.lower().endswith(suffixes)
+    )
+    # The count is reported on stdout before the list is returned.
+    print(f"Found {len(found)} mask files under {mask_root}")
+    return found
+
+
+def compute_fov_from_intrinsics(fx, fy, image_size, degrees=True):
+    """Compute horizontal / vertical FOV from pixel-unit fx, fy.
+
+    image_size is unpacked as (height, width). Returns (fov_x, fov_y), in
+    degrees by default or in radians when degrees is falsy.
+    """
+    height, width = image_size
+    fov_x = 2 * np.arctan(width / (2 * fx))
+    fov_y = 2 * np.arctan(height / (2 * fy))
+    if not degrees:
+        return fov_x, fov_y
+    return np.degrees(fov_x), np.degrees(fov_y)
+
+def mesh_rendering(
+    mesh,
+    extrinsics,
+    fov_y,
+    renderer=None,
+    width=448,
+    height=448,
+    add_axis=False,
+    debug_depth_path=None,
+    verbose=False,
+):
+    """Render mesh with renderer, or with the cached default context if None."""
+    # The size / axis / debug / verbose options only select that default context.
+    if renderer is not None:
+        return renderer.render(mesh, extrinsics, fov_y)
+    default_ctx = get_default_mesh_renderer(
+        width=width, height=height, add_axis=add_axis,
+        debug_depth_path=debug_depth_path, verbose=verbose,
+    )
+    return default_ctx.render(mesh, extrinsics, fov_y)
+
diff --git a/run_agent_with_vllm.sh b/run_agent_with_vllm.sh
deleted file mode 100644
index 3ed5925..0000000
--- a/run_agent_with_vllm.sh
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-############################################
-# 0. Resolve project root (directory of this script)
-############################################
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-############################################
-# 1. Global config (paths are relative to SCRIPT_DIR)
-############################################
-export HF_ENDPOINT="https://hf-mirror.com"
-
-export HF_HOME="${SCRIPT_DIR}/huggingface"
-export TRANSFORMERS_CACHE="${HF_HOME}"
-export HF_DATASETS_CACHE="${HF_HOME}"
-export HF_HUB_CACHE="${HF_HOME}"
-
-# Path to conda initialization script (usually absolute)
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
-
-# Conda env names
-VLLM_ENV="vllm"
-SAM3_ENV="sam3"
-
-# vLLM model directory (where Qwen3-VL-8B-Thinking will be downloaded)
-VLLM_MODEL_DIR="${SCRIPT_DIR}/models/qwen3_vl_8b_thinking"
-
-# Model name exposed by vLLM and used by the Python script (--llm-model-id)
-SERVED_MODEL_NAME="qwen3-vl-8b-thinking"
-
-# vLLM server port
-VLLM_PORT=8001
-
-# SAM3 agent script (Python entry)
-AGENT_SCRIPT="${SCRIPT_DIR}/pipeline/run_sam3_agent_full.py"
-
-# Input image
-IMAGE_PATH="${SCRIPT_DIR}/assets/img.jpg"
-
-# Output root directory
-OUTPUT_ROOT="${SCRIPT_DIR}/outputs/master_with_vllm"
-
-# System prompt file for Qwen
-SYSTEM_PROMPT_PATH="${SCRIPT_DIR}/assets/system_prompt_scene_prompts.txt"
-
-# vLLM log
-LOG_DIR="${SCRIPT_DIR}/logs"
-mkdir -p "${LOG_DIR}"
-VLLM_LOG="${LOG_DIR}/vllm_server.log"
-
-############################################
-# 2. Initialize conda
-############################################
-if [ -f "${CONDA_SH}" ]; then
- # Enable `conda activate`
- # shellcheck disable=SC1090
- source "${CONDA_SH}"
-else
- echo "ERROR: conda.sh not found at ${CONDA_SH}"
- exit 1
-fi
-
-############################################
-# 3. HuggingFace login (interactive, in vLLM env)
-############################################
-echo ">>> Activating conda env: ${VLLM_ENV}"
-conda activate "${VLLM_ENV}"
-
-echo ">>> Running 'hf auth login' (you may be prompted for a token)..."
-hf auth login
-echo ">>> HuggingFace login finished ✓"
-
-############################################
-# 4. Download Qwen3-VL-8B-Thinking if model dir is empty
-############################################
-if [ ! -d "${VLLM_MODEL_DIR}" ] || [ -z "$(ls -A "${VLLM_MODEL_DIR}" 2>/dev/null)" ]; then
- echo ">>> Model directory is empty: ${VLLM_MODEL_DIR}"
- echo ">>> Auto-downloading Qwen/Qwen3-VL-8B-Thinking ..."
-
- mkdir -p "${VLLM_MODEL_DIR}"
-
- if command -v huggingface-cli >/dev/null 2>&1; then
- huggingface-cli download \
- Qwen/Qwen3-VL-8B-Thinking \
- --local-dir "${VLLM_MODEL_DIR}" \
- --local-dir-use-symlinks False
- elif command -v hf >/dev/null 2>&1; then
- hf snapshot download Qwen/Qwen3-VL-8B-Thinking \
- --local-dir "${VLLM_MODEL_DIR}" \
- --local-dir-use-symlinks False
- else
- echo "ERROR: Neither 'huggingface-cli' nor 'hf' CLI is installed."
- echo "Please install with: pip install -U huggingface_hub"
- exit 1
- fi
-
- echo ">>> Model download complete!"
-else
- echo ">>> Model already exists at ${VLLM_MODEL_DIR}, skip download."
-fi
-
-############################################
-# 5. Start vLLM server (still in vLLM env)
-############################################
-echo ">>> Starting vLLM server on GPUs 6,7 ..."
-CUDA_VISIBLE_DEVICES=6,7 \
-vllm serve "${VLLM_MODEL_DIR}" \
- --tensor-parallel-size 2 \
- --dtype float16 \
- --gpu-memory-utilization 0.9 \
- --max-model-len 65536 \
- --max-num-seqs 4 \
- --port 8001 \
- --allowed-local-media-path / \
- --served-model-name "${SERVED_MODEL_NAME}" \
- > "${VLLM_LOG}" 2>&1 &
-
-VLLM_PID=$!
-echo ">>> vLLM server started. PID = ${VLLM_PID}"
-echo ">>> Logs: ${VLLM_LOG}"
-
-echo ">>> Waiting for vLLM server to become ready..."
-until curl -s "http://localhost:${VLLM_PORT}/v1/models" > /dev/null; do
- echo "vLLM not ready yet, waiting 2s..."
- sleep 2
-done
-echo ">>> vLLM server is ready!"
-
-############################################
-# 6. Run SAM3 agent (in sam3 env)
-############################################
-echo ">>> Activating SAM3 env: ${SAM3_ENV}"
-conda activate "${SAM3_ENV}"
-
-echo ">>> Running SAM3 agent with CUDA_VISIBLE_DEVICES=0 ..."
-CUDA_VISIBLE_DEVICES=0 \
-python "${AGENT_SCRIPT}" \
- --image-path "${IMAGE_PATH}" \
- --output-root "${OUTPUT_ROOT}" \
- --system-prompt-path "${SYSTEM_PROMPT_PATH}" \
- --llm-model-id "${SERVED_MODEL_NAME}" \
- --skip-first
-
-echo ">>> SAM3 agent finished."
-
-############################################
-# 7. Done (vLLM is still running)
-############################################
-echo ">>> All done. vLLM is still running with PID = ${VLLM_PID}"
-echo ">>> To stop it manually, run: kill ${VLLM_PID}"
diff --git a/run_docker.sh b/run_docker.sh
new file mode 100755
index 0000000..e48454e
--- /dev/null
+++ b/run_docker.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Launch sam3d-gs:latest with host checkpoints + data bind-mounted.
+#
+# Usage:
+# run_docker.sh [PROJECT_DIR] [HF_CACHE_DIR]
+#
+# PROJECT_DIR Path to the sam3d_gs repo on the host.
+# Defaults to the directory this script lives in.
+# HF_CACHE_DIR Path to host HuggingFace cache (so AnySplat and other
+# HF models are reused across container starts).
+# Defaults to ${HF_HOME:-$HOME/.cache/huggingface}.
+#
+# Environment overrides:
+# SAM3D_IMAGE Docker image to run. Default: sam3d-gs:latest
+# TORCH_HOME Host PyTorch hub cache (DINOv2 etc. land here).
+# Default: $HOME/.cache/torch
+
+set -euo pipefail
+
+# Resolve inputs: positional arguments override the environment, which
+# overrides the built-in defaults. The cache paths go through `realpath -m`
+# because those directories are only created further down.
+DEFAULT_REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO="$(realpath "${1:-${DEFAULT_REPO}}")"
+HF_CACHE="$(realpath -m "${2:-${HF_HOME:-${HOME}/.cache/huggingface}}")"
+TORCH_CACHE="$(realpath -m "${TORCH_HOME:-${HOME}/.cache/torch}")"
+IMAGE="${SAM3D_IMAGE:-sam3d-gs:latest}"
+CONTAINER_REPO="/opt/sam3d_gs"
+SAM3D_CKPT="submodule/Sam-3d-objects/checkpoints"
+INPAINT_CKPT="submodule/Prompt-Inpaint/checkpoints"
+
+# Sanity-check that PROJECT_DIR really looks like the sam3d_gs repo.
+for marker in submodule/Sam-3d-objects submodule/Prompt-Inpaint scripts/install_env.sh; do
+  [[ -e "${REPO}/${marker}" ]] && continue
+  echo "ERROR: ${REPO} does not look like a sam3d_gs checkout (missing ${marker})." >&2
+  echo "Pass the project root explicitly: $0 /path/to/sam3d_gs" >&2
+  exit 1
+done
+
+# Ensure host-side bind targets exist (Docker would otherwise create them as root).
+mkdir -p \
+  "${REPO}/${SAM3D_CKPT}" "${REPO}/${INPAINT_CKPT}" \
+  "${REPO}/data" "${REPO}/example" \
+  "${HF_CACHE}" "${TORCH_CACHE}"
+
+echo "==> repo: ${REPO}"
+echo "==> hf cache: ${HF_CACHE}"
+echo "==> torch cache: ${TORCH_CACHE}"
+echo "==> image: ${IMAGE}"
+
+# Bind mounts as host:container pairs, in the order they are passed to docker.
+MOUNTS=(
+  "${REPO}/${SAM3D_CKPT}:${CONTAINER_REPO}/${SAM3D_CKPT}"
+  "${REPO}/${INPAINT_CKPT}:${CONTAINER_REPO}/${INPAINT_CKPT}"
+  "${HF_CACHE}:/root/.cache/huggingface"
+  "${TORCH_CACHE}:/root/.cache/torch"
+  "${REPO}/data:${CONTAINER_REPO}/data"
+  "${REPO}/example:${CONTAINER_REPO}/example"
+)
+DOCKER_ARGS=(--rm -it --gpus all --shm-size=8g --network host)
+for mount in "${MOUNTS[@]}"; do DOCKER_ARGS+=(-v "${mount}"); done
+
+docker run "${DOCKER_ARGS[@]}" "${IMAGE}"
diff --git a/run_object_generation_pipeline.sh b/run_object_generation_pipeline.sh
new file mode 100755
index 0000000..aac03a5
--- /dev/null
+++ b/run_object_generation_pipeline.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Arguments: <path_img> is the input image (required); [path] is the working
+# directory handed to every stage and defaults to the image's directory.
+if [[ $# -lt 1 || $# -gt 2 ]]; then
+  echo "Usage: $0 <path_img> [path]" >&2
+  echo "Example: $0 data/new-desk/input_image.png" >&2
+  exit 1
+fi
+
+path_img="$1"
+path="${2-$(dirname "${path_img}")}"
+
+# Validate before canonicalising: under `set -e` a failing realpath would
+# end the script before these messages could be printed.
+if [[ ! -f "${path_img}" ]]; then
+  echo "Input image not found: ${path_img}" >&2
+  exit 1
+fi
+
+if [[ ! -d "${path}" ]]; then
+  echo "Input directory not found: ${path}" >&2
+  exit 1
+fi
+
+path_img="$(realpath "${path_img}")"
+path="$(realpath "${path}")"
+
+source "${SCRIPT_DIR}/.venv/bin/activate"
+SAM3D_REPO="${SCRIPT_DIR}/submodule/Sam-3d-objects"
+export PYTHONPATH="${SAM3D_REPO}/notebook:${SAM3D_REPO}:${PYTHONPATH:-}"
+
+echo "Python: $(which python)"
+echo "Image: ${path_img}"
+echo "Directory: ${path}"
+
+# Bootstrap gated HuggingFace weights when either file is missing. The
+# download needs a logged-in HuggingFace account that has accepted the
+# agreements for facebook/sam-3d-objects and facebook/sam3.
+SAM3D_PIPELINE_YAML="${SAM3D_REPO}/checkpoints/hf/pipeline.yaml"
+SAM3_WEIGHT="${SCRIPT_DIR}/submodule/Prompt-Inpaint/checkpoints/sam3.pt"
+if ! [[ -f "${SAM3D_PIPELINE_YAML}" && -f "${SAM3_WEIGHT}" ]]; then
+  echo "==> One or more gated checkpoints missing locally; running bootstrap..."
+  bash "${SCRIPT_DIR}/scripts/download_checkpoints.sh"
+fi
+
+# Stage 1 options are collected first; all three stages work on ${path}.
+INPAINT_DIR="${SCRIPT_DIR}/submodule/Prompt-Inpaint"
+INPAINT_ARGS=(--resize-output --save-individual-masks)
+INPAINT_ARGS+=(--config "${INPAINT_DIR}/configs/items.yml")
+INPAINT_ARGS+=(--image "${path_img}" --output-dir "${path}")
+echo "==> Step 1/3: Prompt-Inpaint"
+python "${INPAINT_DIR}/main.py" "${INPAINT_ARGS[@]}"
+
+echo "==> Step 2/3: AnySplat"
+python "${SCRIPT_DIR}/pipeline/background_reconstruction.py" "${path}"
+
+echo "==> Step 3/3: Object generation"
+python "${SCRIPT_DIR}/pipeline/objects_generation.py" --input-dir "${path}"
+
+echo "Done."
diff --git a/run_pipeline.sh b/run_pipeline.sh
deleted file mode 100644
index 547c65b..0000000
--- a/run_pipeline.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-bash run_agent_with_vllm.sh
-bash run_sam3d_from_masks.sh
\ No newline at end of file
diff --git a/run_sam3d_from_masks.sh b/run_sam3d_from_masks.sh
deleted file mode 100644
index 924d1f5..0000000
--- a/run_sam3d_from_masks.sh
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/usr/bin/env bash
-# 不要开 -u,会和 conda activate 脚本打架
-set -eo pipefail
-
-############################################
-# 0. Resolve project root (directory of this script)
-############################################
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# 防止 conda activate 的 binutils 脚本里引用未定义 ADDR2LINE
-export ADDR2LINE=addr2line
-
-############################################
-# 1. Global config (all paths relative to SCRIPT_DIR)
-############################################
-
-# GPU used for SAM3D reconstruction
-export CUDA_VISIBLE_DEVICES="0"
-
-# HF / Torch cache (和 run_agent_with_vllm.sh 共用一套)
-export HF_ENDPOINT="https://hf-mirror.com"
-export HF_HOME="${SCRIPT_DIR}/huggingface"
-export TRANSFORMERS_CACHE="${HF_HOME}"
-export HF_DATASETS_CACHE="${HF_HOME}"
-export HF_HUB_CACHE="${HF_HOME}"
-export HF_HUB_ENABLE_HF_TRANSFER=0
-
-export TORCH_HOME="${SCRIPT_DIR}/torch_hub"
-export TORCH_HUB="${SCRIPT_DIR}/torch_hub"
-
-# Conda init script (absolute)
-CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh"
-
-# Conda env for SAM3D
-SAM3D_ENV="sam3d-objects"
-
-# sam-3d-objects repo root
-PROJECT_ROOT="${SCRIPT_DIR}/sam-3d-objects"
-
-# Where sam-3-objects stores intermediate .pt
-PT_SAVE_DIR="${PROJECT_ROOT}/outputs/torch_save_pt"
-
-# Checkpoints / config paths
-CHECKPOINTS_DIR="${PROJECT_ROOT}/checkpoints"
-PIPELINE_YAML="${CHECKPOINTS_DIR}/hf/pipeline.yaml"
-
-# Python entry scripts (放在 sam3d_gs/pipeline 下)
-SAM3D_MULTI_SCRIPT="${SCRIPT_DIR}/pipeline/run_sam3d_multi.py"
-RECONSTRUCT_SCRIPT="${SCRIPT_DIR}/pipeline/reconstruct_from_pt.py"
-
-# Input image: 使用和 SAM3 agent 一样的图
-IMAGE_PATH="${SCRIPT_DIR}/assets/img.jpg"
-
-# 🔴 关键:mask-root = SAM3 agent 的 mask 输出目录
-# 如果你的 run_sam3_agent_full.py 把 mask 写在:
-# outputs/master_with_vllm/masks
-# 就用这一行:
-MASK_ROOT="${SCRIPT_DIR}/outputs/master_with_vllm/masks"
-# 如果暂时还用旧目录,比如 sam3/agent_output_multi/masks,可以改成:
-# MASK_ROOT="${SCRIPT_DIR}/sam3/agent_output_multi/masks"
-
-# Run configs
-TAG="hf"
-SEED=42
-EXPORT_GIF=1 # 1 = reconstruct 时加 --export-gif,0 = 不导出 GIF
-
-############################################
-# 2. Initialize conda
-############################################
-if [ -f "${CONDA_SH}" ]; then
- # shellcheck disable=SC1090
- source "${CONDA_SH}"
-else
- echo "ERROR: conda.sh not found at ${CONDA_SH}"
- exit 1
-fi
-
-echo ">>> Activating conda env: ${SAM3D_ENV}"
-conda activate "${SAM3D_ENV}"
-
-mkdir -p "${PT_SAVE_DIR}"
-
-############################################
-# 2.5. Ensure checkpoints/${TAG}/pipeline.yaml
-############################################
-if [ ! -f "${PIPELINE_YAML}" ]; then
- echo ">>> pipeline.yaml not found at: ${PIPELINE_YAML}"
- echo ">>> Downloading checkpoints from facebook/sam-3d-objects ..."
- echo ">>> (确保已运行 'hf auth login' 并在网页上接受模型协议)"
-
- # 关闭 hf_transfer(在镜像环境下容易出奇怪错误)
- export HF_HUB_ENABLE_HF_TRANSFER=0
-
- # 临时下载目录(避免直接弄脏 sam-3d-objects 根目录)
- TMP_DIR="${CHECKPOINTS_DIR}/.tmp_download_${TAG}"
- rm -rf "${TMP_DIR}"
- mkdir -p "${TMP_DIR}"
-
- # 1) 把远端的 checkpoints/** 全部下载到临时目录
- if command -v huggingface-cli >/dev/null 2>&1; then
- huggingface-cli download \
- facebook/sam-3d-objects \
- --local-dir "${TMP_DIR}" \
- --local-dir-use-symlinks False \
- --include "checkpoints/**"
- elif command -v hf >/dev/null 2>&1; then
- hf snapshot download \
- facebook/sam-3d-objects \
- --local-dir "${TMP_DIR}" \
- --local-dir-use-symlinks False \
- --include "checkpoints/**"
- else
- echo "ERROR: neither 'huggingface-cli' nor 'hf' CLI is installed."
- echo " Try: pip install -U huggingface_hub"
- rm -rf "${TMP_DIR}"
- exit 1
- fi
-
- # 2) 远端结构:TMP_DIR/checkpoints/...
- # 本地目标:CHECKPOINTS_DIR/TAG/...
- mkdir -p "${CHECKPOINTS_DIR}/${TAG}"
-
- if [ -d "${TMP_DIR}/checkpoints" ]; then
- echo ">>> Moving downloaded checkpoints into checkpoints/${TAG} ..."
- # 把 checkpoints/* 都移到 checkpoints/hf/
- mv "${TMP_DIR}/checkpoints/"* "${CHECKPOINTS_DIR}/${TAG}/"
- else
- echo "ERROR: Expected ${TMP_DIR}/checkpoints directory, but not found."
- rm -rf "${TMP_DIR}"
- exit 1
- fi
-
- # 清理临时目录
- rm -rf "${TMP_DIR}"
-
- echo ">>> Checkpoints downloaded → ${CHECKPOINTS_DIR}/${TAG}"
- echo ">>> Expected config at: ${PIPELINE_YAML}"
-else
- echo ">>> Found existing pipeline config: ${PIPELINE_YAML}"
-fi
-
-
-# 确保 sam-3-objects/notebook 在 PYTHONPATH 里,供 inference 等模块 import
-export PYTHONPATH="${PROJECT_ROOT}/notebook:${PYTHONPATH:-}"
-
-############################################
-# 3. Step 1 – run SAM3D multi-object & save .pt
-############################################
-echo "=== [SAM3D] Step 1: run multi-object reconstruction & save .pt ==="
-python "${SAM3D_MULTI_SCRIPT}" \
- --image-path "${IMAGE_PATH}" \
- --mask-root "${MASK_ROOT}" \
- --save-dir "${PT_SAVE_DIR}" \
- --tag "${TAG}" \
- --seed "${SEED}" \
- --project-root "${PROJECT_ROOT}"
-
-############################################
-# 4. Step 2 – reconstruct from .pt to .ply (and optional .gif)
-############################################
-echo "=== [SAM3D] Step 2: reconstruct from .pt to .ply ==="
-
-RECONSTRUCT_CMD=(
- python "${RECONSTRUCT_SCRIPT}"
- --project-root "${PROJECT_ROOT}"
- --save-dir "${PT_SAVE_DIR}"
- --image-path "${IMAGE_PATH}"
-)
-
-if [ "${EXPORT_GIF}" -eq 1 ]; then
- RECONSTRUCT_CMD+=(--export-gif)
-fi
-
-"${RECONSTRUCT_CMD[@]}"
-
-echo "✅ Pipeline finished. Check ${PROJECT_ROOT}/gaussians/multi 下的 .ply/.gif 文件"
diff --git a/sam-3d-objects b/sam-3d-objects
deleted file mode 160000
index cf06676..0000000
--- a/sam-3d-objects
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit cf066761706cd02b07e2fc6274570ec8cdafb683
diff --git a/sam3 b/sam3
deleted file mode 160000
index 2d1cbae..0000000
--- a/sam3
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 2d1cbaeac7b52ca64baf61e58973d0940ae843d0
diff --git a/scripts/download_checkpoints.sh b/scripts/download_checkpoints.sh
new file mode 100755
index 0000000..285b4e8
--- /dev/null
+++ b/scripts/download_checkpoints.sh
@@ -0,0 +1,226 @@
+#!/usr/bin/env bash
+# Bootstrap gated HuggingFace checkpoints needed by the pipeline.
+#
+# This script handles two models that require explicit local placement, plus one cache pre-fetch:
+#
+# 1. facebook/sam-3d-objects
+# The SAM-3D-Objects codepath expects a Hydra config tree at
+# submodule/Sam-3d-objects/checkpoints/<TAG>/pipeline.yaml
+# which is NOT fetched by `from_pretrained`.
+#
+# 2. facebook/sam3
+# Prompt-Inpaint's _resolve_checkpoint() will fall back to a HuggingFace
+# auto-download, but pulling the 3.3 GB sam3.pt into the local
+# `submodule/Prompt-Inpaint/checkpoints/` keeps the weights co-located
+# with the project and survives `~/.cache` cleanups.
+#
+# 3. lhjiang/anysplat
+# AnySplat.from_pretrained reads from the HuggingFace hub cache
+# (~/.cache/huggingface/hub/). Pre-fetching avoids a multi-GB download
+# on the first pipeline run inside an ephemeral container.
+#
+# The script is idempotent: existing target files are skipped unless --force.
+#
+# Usage:
+# bash scripts/download_checkpoints.sh [options]
+#
+# Options:
+# --tag TAG Sub-directory under submodule/Sam-3d-objects/checkpoints/
+# for the SAM-3D-Objects bundle. Default: hf
+# --skip-sam3d Do not download the SAM-3D-Objects bundle.
+# --skip-sam3 Do not download the SAM3 weight (sam3.pt).
+# --skip-anysplat Do not pre-fetch the AnySplat weights into the HF cache.
+# --force Re-download even if the target files already exist.
+# -h, --help Show this help.
+#
+# Environment overrides:
+# SAM3D_CHECKPOINT_TAG Same as --tag
+# SAM3D_MODEL_ID SAM-3D-Objects repo id (default: facebook/sam-3d-objects)
+# SAM3_MODEL_ID SAM3 repo id (default: facebook/sam3)
+# SAM3_WEIGHT_FILENAME SAM3 weight file name (default: sam3.pt)
+# ANYSPLAT_MODEL_ID AnySplat repo id (default: lhjiang/anysplat)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+TAG="${SAM3D_CHECKPOINT_TAG:-hf}"
+SAM3D_MODEL_ID="${SAM3D_MODEL_ID:-facebook/sam-3d-objects}"
+SAM3_MODEL_ID="${SAM3_MODEL_ID:-facebook/sam3}"
+SAM3_WEIGHT_FILENAME="${SAM3_WEIGHT_FILENAME:-sam3.pt}"
+ANYSPLAT_MODEL_ID="${ANYSPLAT_MODEL_ID:-lhjiang/anysplat}"
+SKIP_SAM3D=0
+SKIP_SAM3=0
+SKIP_ANYSPLAT=0
+FORCE=0
+
+usage() {  # Print header lines 2-42 of this file with the comment markers removed.
+  sed -n '2,42{s/^# //;s/^#$//;p;}' "${BASH_SOURCE[0]}"
+}
+
+# Parse command-line options. The boolean flags only set the SKIP_* / FORCE
+# switches; --tag replaces the TAG default taken from SAM3D_CHECKPOINT_TAG.
+# Unknown options print the help text and fail.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --tag)
+      # Without this guard a trailing `--tag` dies on "$2: unbound
+      # variable" (set -u) instead of explaining what is wrong.
+      if [[ $# -lt 2 ]]; then
+        echo "ERROR: --tag requires a value." >&2
+        usage
+        exit 1
+      fi
+      TAG="$2"
+      shift 2
+      ;;
+    # Boolean switches.
+    --skip-sam3d) SKIP_SAM3D=1; shift ;;
+    --skip-sam3) SKIP_SAM3=1; shift ;;
+    --skip-anysplat) SKIP_ANYSPLAT=1; shift ;;
+    --force) FORCE=1; shift ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      # Anything else is a mistake: report it, show the help, fail.
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+# Older huggingface_hub (e.g. the pinned 0.25.2) ships `huggingface-cli`, not `hf`.
+require_hf_cli() {
+  command -v hf >/dev/null 2>&1 && return 0
+  if command -v huggingface-cli >/dev/null 2>&1; then
+    hf() { huggingface-cli "$@"; }  # same `download` options as used here
+    return 0
+  fi
+  echo "ERROR: neither the 'hf' nor the 'huggingface-cli' command is installed." >&2
+  echo "       Install huggingface_hub, accept the model agreements, then log in." >&2
+  exit 1
+}
+
+# hf_transfer occasionally trips on mirrored networks; disable it for safety.
+export HF_HUB_ENABLE_HF_TRANSFER=0
+
+
+# Fetch the SAM-3D-Objects `checkpoints/` tree into
+# submodule/Sam-3d-objects/checkpoints/${TAG}. Returns early when
+# pipeline.yaml is already there and --force was not given.
+download_sam3d_objects() {
+  local ckpt_root="${PROJECT_ROOT}/submodule/Sam-3d-objects/checkpoints"
+  local dest="${ckpt_root}/${TAG}"
+  local config="${dest}/pipeline.yaml"
+  local tmp_dir="${ckpt_root}/.tmp_download_${TAG}"
+
+  if [[ "${FORCE}" -eq 0 && -f "${config}" ]]; then
+    echo "==> [sam-3d-objects] already present: ${config}"
+    return 0
+  fi
+
+  require_hf_cli
+  echo "==> [sam-3d-objects] downloading ${SAM3D_MODEL_ID} into ${dest}"
+
+  # Download into a scratch directory that is wiped first and removed again
+  # by an EXIT trap, which is meant to cover aborts such as Ctrl-C as well.
+  rm -rf "${tmp_dir}"
+  mkdir -p "${tmp_dir}"
+  trap 'rm -rf "${tmp_dir}"' EXIT
+  hf download "${SAM3D_MODEL_ID}" --local-dir "${tmp_dir}" --include "checkpoints/**"
+
+  if [[ ! -d "${tmp_dir}/checkpoints" ]]; then
+    echo "ERROR: expected ${tmp_dir}/checkpoints after download." >&2
+    exit 1
+  fi
+
+  # dotglob makes `*` match hidden entries too while the files are moved.
+  mkdir -p "${dest}"
+  shopt -s dotglob
+  mv "${tmp_dir}/checkpoints/"* "${dest}/"
+  shopt -u dotglob
+
+  if [[ ! -f "${config}" ]]; then
+    echo "ERROR: pipeline.yaml missing after move: ${config}" >&2
+    exit 1
+  fi
+
+  rm -rf "${tmp_dir}"
+  trap - EXIT
+
+  echo "==> [sam-3d-objects] done: ${dest}"
+}
+
+
+# Download the SAM3 weight file next to Prompt-Inpaint; --force re-downloads it.
+download_sam3() {
+  local target_dir="${PROJECT_ROOT}/submodule/Prompt-Inpaint/checkpoints"
+  local target_file="${target_dir}/${SAM3_WEIGHT_FILENAME}"
+  local force_flag=""
+  if [[ -f "${target_file}" && "${FORCE}" -eq 0 ]]; then
+    echo "==> [sam3] already present: ${target_file}"
+    return 0
+  fi
+
+  require_hf_cli
+  echo "==> [sam3] downloading ${SAM3_MODEL_ID}/${SAM3_WEIGHT_FILENAME} into ${target_dir}"
+  # --force must reach the downloader too, or the existing file may be kept.
+  if [[ "${FORCE}" -eq 1 ]]; then force_flag="--force-download"; fi
+  mkdir -p "${target_dir}"
+  hf download "${SAM3_MODEL_ID}" "${SAM3_WEIGHT_FILENAME}" \
+    --local-dir "${target_dir}" ${force_flag:+"${force_flag}"}
+  if [[ ! -f "${target_file}" ]]; then
+    echo "ERROR: ${target_file} missing after download." >&2
+    exit 1
+  fi
+  echo "==> [sam3] done: ${target_file}"
+}
+
+
+download_anysplat() {
+  # AnySplat.from_pretrained reads from the HuggingFace hub cache, so the
+  # files stay in the standard cache layout (no --local-dir):
+  #   <hub cache>/models--<org>--<name>/snapshots/<revision>/...
+  # Hub cache: HF_HUB_CACHE if set, else ${HF_HOME:-~/.cache/huggingface}/hub.
+  local hf_root="${HF_HOME:-${HOME}/.cache/huggingface}"
+  local hub_cache="${HF_HUB_CACHE:-${hf_root}/hub}"
+  local snapshots_dir="${hub_cache}/models--${ANYSPLAT_MODEL_ID//\//--}/snapshots"
+  local force_flag=""
+
+  if [[ "${FORCE}" -eq 0 && -d "${snapshots_dir}" &&
+        -n "$(ls -A "${snapshots_dir}" 2>/dev/null)" ]]; then
+    echo "==> [anysplat] already present in HF cache: ${snapshots_dir}"
+    return 0
+  fi
+
+  require_hf_cli
+  if [[ "${FORCE}" -eq 1 ]]; then force_flag="--force-download"; fi
+  echo "==> [anysplat] downloading ${ANYSPLAT_MODEL_ID} into HF cache (${hub_cache})"
+  hf download "${ANYSPLAT_MODEL_ID}" ${force_flag:+"${force_flag}"}
+  echo "==> [anysplat] done."
+}
+
+
+# Run each download step unless its --skip-* switch was given, in the fixed
+# order SAM-3D-Objects, SAM3, AnySplat.
+if [[ "${SKIP_SAM3D}" -ne 0 ]]; then
+  echo "==> [sam-3d-objects] skipped (--skip-sam3d)"
+else
+  download_sam3d_objects
+fi
+if [[ "${SKIP_SAM3}" -ne 0 ]]; then
+  echo "==> [sam3] skipped (--skip-sam3)"
+else
+  download_sam3
+fi
+if [[ "${SKIP_ANYSPLAT}" -ne 0 ]]; then
+  echo "==> [anysplat] skipped (--skip-anysplat)"
+else
+  download_anysplat
+fi
+
+echo "==> All requested checkpoints are in place."
diff --git a/scripts/install_env.sh b/scripts/install_env.sh
new file mode 100755
index 0000000..e2e699b
--- /dev/null
+++ b/scripts/install_env.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+PYTHON_VERSION="3.11"
+TORCH_VERSION="2.7.0"
+TORCHVISION_VERSION="0.22.0"
+TORCHAUDIO_VERSION="2.7.0"
+PYTORCH_INDEX_URL="https://download.pytorch.org/whl/cu128"
+KAOLIN_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html"
+
+INSTALL_TORCH=1
+UPDATE_SUBMODULES=1
+COMPILE_CUROPE=1
+
+usage() {  # Print the command-line help text to stdout.
+ cat <<'EOF'
+Usage: bash scripts/install_env.sh [options]
+
+Options:
+ --python VERSION Python version for uv venv. Default: 3.11
+ --skip-torch Do not install torch/torchvision/torchaudio.
+ --skip-submodules Do not run git submodule update --init --recursive.
+ --skip-curope Do NOT patch+compile AnySplat curope CUDA extension
+ (compiled by default; without it AnySplat falls back
+ to a slower PyTorch RoPE2D implementation).
+ -h, --help Show this help.
+
+Examples:
+ bash scripts/install_env.sh
+ bash scripts/install_env.sh --skip-torch
+ bash scripts/install_env.sh --skip-curope
+EOF
+}
+
+# Parse command-line options; each --skip-* flag clears the matching switch.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --python)
+      # A trailing `--python` would otherwise die on "$2: unbound variable"
+      # (set -u) without saying what is wrong.
+      if [[ $# -lt 2 ]]; then
+        echo "ERROR: --python requires a version." >&2
+        usage
+        exit 1
+      fi
+      PYTHON_VERSION="$2"
+      shift 2
+      ;;
+    --skip-torch) INSTALL_TORCH=0; shift ;;
+    --skip-submodules) UPDATE_SUBMODULES=0; shift ;;
+    --skip-curope) COMPILE_CUROPE=0; shift ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      # Anything else is rejected together with the help text.
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+cd "${PROJECT_ROOT}"
+echo "==> Project root: ${PROJECT_ROOT}"
+# uv drives every install below; bootstrap it through pip when missing.
+if ! command -v uv >/dev/null 2>&1; then
+  echo "==> uv not found. Installing uv with pip..."
+  python3 -m pip install -U uv
+fi
+
+if [[ "${UPDATE_SUBMODULES}" -eq 1 ]]; then
+  echo "==> Updating git submodules..."
+  git submodule update --init --recursive
+fi
+
+echo "==> Creating/updating .venv with Python ${PYTHON_VERSION}..."
+uv venv --python "${PYTHON_VERSION}" .venv
+# shellcheck disable=SC1091
+source "${PROJECT_ROOT}/.venv/bin/activate"
+
+# Expose the SAM-3D-Objects sources to Python and the Kaolin wheel index to pip.
+SAM3D_REPO="${PROJECT_ROOT}/submodule/Sam-3d-objects"
+export PYTHONPATH="${SAM3D_REPO}/notebook:${SAM3D_REPO}:${PYTHONPATH:-}"
+export PIP_FIND_LINKS="${KAOLIN_FIND_LINKS}"
+
+echo "==> Python: $(which python)"
+python --version
+
+# PyTorch goes in first, from its own CUDA wheel index, unless --skip-torch.
+if [[ "${INSTALL_TORCH}" -ne 1 ]]; then
+  echo "==> Skipping PyTorch install."
+else
+  echo "==> Installing PyTorch ${TORCH_VERSION} from ${PYTORCH_INDEX_URL}..."
+  uv pip install \
+    "torch==${TORCH_VERSION}" \
+    "torchvision==${TORCHVISION_VERSION}" \
+    "torchaudio==${TORCHAUDIO_VERSION}" \
+    --index-url "${PYTORCH_INDEX_URL}"
+fi
+
+echo "==> Installing AnySplat requirements..."
+uv pip install -r submodule/AnySplat/requirements.txt --no-build-isolation
+
+echo "==> Installing SAM-3D-Objects build helpers..."
+uv pip install hatch-requirements-txt editables wheel
+
+echo "==> Installing SAM-3D-Objects extras..."
+# Three editable passes over the same package: the p3d and inference extras
+# skip build isolation, and inference also searches the Kaolin wheel index.
+SAM3D_PKG="./submodule/Sam-3d-objects"
+uv pip install -e "${SAM3D_PKG}[dev]"
+uv pip install -e "${SAM3D_PKG}[p3d]" --no-build-isolation
+uv pip install -e "${SAM3D_PKG}[inference]" --no-build-isolation \
+  --find-links "${KAOLIN_FIND_LINKS}"
+
+echo "==> Installing project-level runtime dependencies..."
+# Do NOT use -U here: that would let uv upgrade transitive deps (notably
+# torch, via iopaint) and clobber the CUDA-pinned torch installed above.
+RUNTIME_DEPS=(
+  "transformers==4.48.3" "iopaint>=1.2.0"
+  "diffusers>=0.27.2" "numpy<2.0"
+  "opencv-python>=4.8.0" "pyyaml>=6.0"
+  "requests>=2.31.0" "tqdm>=4.66.0"
+  "setuptools" "einops"
+)
+uv pip install --index-strategy unsafe-best-match "${RUNTIME_DEPS[@]}"
+
+echo "==> Installing SAM3..."
+uv pip install --index-strategy unsafe-best-match \
+  "git+https://github.com/facebookresearch/sam3.git"
+
+# Optional mesh2mjcf extras (installed by default so `-cd` / `--verbose` Just
+# Work; `trimesh` is also used for multi-material OBJ splitting).
+echo "==> Installing mesh2mjcf extras (coacd, trimesh, mujoco)..."
+uv pip install --index-strategy unsafe-best-match \
+  "coacd" \
+  "trimesh" \
+  "mujoco"
+
+# Pin huggingface_hub to 0.25.2 as the very last install step, i.e. after
+# SAM3 and the mesh2mjcf extras above as well: diffusers 0.27.2 (and the
+# iopaint stack on top of it) still imports `cached_download` from
+# huggingface_hub, which was removed in hub >= 0.26. Any earlier install may
+# pull in a newer hub transitively, so we force-reinstall last (with --no-deps
+# so it can downgrade without uv complaining) and lock the exact version that
+# was empirically verified to work.
+#
+# Note: transformers above is pinned to ==4.48.3 (not >=) because transformers
+# 5.x imports `is_offline_mode` from huggingface_hub, which doesn't exist in
+# 0.25.2; a floor would let the resolver pick 5.x and break iopaint at runtime.
+echo "==> Pinning huggingface_hub==0.25.2 (force-reinstall, no-deps)..."
+uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \
+  "huggingface_hub==0.25.2"
+
+# Optional: patch AnySplat's curope kernels.cu, then build the extension in place.
+if [[ "${COMPILE_CUROPE}" -eq 1 ]]; then
+  CUROPE_DIR="${PROJECT_ROOT}/submodule/AnySplat/src/model/encoder/backbone/croco/curope"
+  KERNELS_CU="${CUROPE_DIR}/kernels.cu"
+  if [[ ! -f "${KERNELS_CU}" ]]; then
+    echo "ERROR: kernels.cu not found: ${KERNELS_CU}" >&2
+    exit 1
+  fi
+
+  echo "==> Patching AnySplat curope kernels.cu..."
+  python - "${KERNELS_CU}" <<'PY'
+import sys
+from pathlib import Path
+
+kernels = Path(sys.argv[1])
+before = kernels.read_text()
+# Swap the tokens.type() dispatch argument for tokens.scalar_type(); the file
+# is rewritten only when that exact text is present, so reruns change nothing.
+after = before.replace(
+    'AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {',
+    'AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {',
+)
+if after == before:
+    print(f"no patch needed for {kernels}")
+else:
+    kernels.write_text(after)
+    print(f"patched {kernels}")
+PY
+
+  echo "==> Building AnySplat curope extension..."
+  # The subshell keeps the directory change local to the build command.
+  (cd "${CUROPE_DIR}" && python setup.py build_ext --inplace)
+fi
+
+cat <<EOF  # unquoted delimiter: ${PROJECT_ROOT} expands, \${PYTHONPATH:-} stays literal
+
+==> Install finished.
+
+Next steps:
+ source .venv/bin/activate
+ export PYTHONPATH="${PROJECT_ROOT}/submodule/Sam-3d-objects/notebook:${PROJECT_ROOT}/submodule/Sam-3d-objects:\${PYTHONPATH:-}"
+
+If you use gated HuggingFace models, run:
+ huggingface-cli login
+EOF
diff --git a/submodule/AnySplat b/submodule/AnySplat
new file mode 160000
index 0000000..d29bc6a
--- /dev/null
+++ b/submodule/AnySplat
@@ -0,0 +1 @@
+Subproject commit d29bc6adf82c953f1fd337d8d0ba6259d906b2c9
diff --git a/submodule/Prompt-Inpaint b/submodule/Prompt-Inpaint
new file mode 160000
index 0000000..0dffc4b
--- /dev/null
+++ b/submodule/Prompt-Inpaint
@@ -0,0 +1 @@
+Subproject commit 0dffc4b50c33509d80135159b2b031d94e272e6e
diff --git a/submodule/Sam-3d-objects b/submodule/Sam-3d-objects
new file mode 160000
index 0000000..d4b6362
--- /dev/null
+++ b/submodule/Sam-3d-objects
@@ -0,0 +1 @@
+Subproject commit d4b63627dc2a7ae0a175be482942e6f32633ff55