diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b97641a --- /dev/null +++ b/.gitignore @@ -0,0 +1,195 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+.idea/ + +/datasets +/dataset_cache + +# Outputs +/outputs +/lightning_logs +/checkpoints + +.bashrc +/launcher_venv +/slurm_logs +*.torch +*.ckpt +table.tex +/baselines +/test/* + +wandb/ +output* +results* + +*.ply +*.mp4 +!assets/pipeline.jpg +!examples/video/*.mp4 + +src/loss/depth_anything/* + +.vscode/ +.gradio/ +note.txt +anysplat_ckpt* +input_images_* +tmp_scripts/ diff --git a/.gitmodules b/.gitmodules index 9a9af42..ed07a66 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,9 @@ -[submodule "sam-3d-objects"] - path = sam-3d-objects - url = https://github.com/facebookresearch/sam-3d-objects.git -[submodule "sam3"] - path = sam3 - url = https://github.com/facebookresearch/sam3.git +[submodule "Sam-3d-objects"] + path = submodule/Sam-3d-objects + url = https://github.com/Yuchi-Zhang-00/sam-3d-objects.git +[submodule "AnySplat"] + path = submodule/AnySplat + url = https://github.com/Yuchi-Zhang-00/AnySplat.git +[submodule "Prompt-Inpaint"] + path = submodule/Prompt-Inpaint + url = https://github.com/MrZoyo/Prompt-Inpaint.git diff --git a/README.md b/README.md index 424ff4a..bb765b4 100644 --- a/README.md +++ b/README.md @@ -4,257 +4,478 @@

-# **Unified Multi-Stage 2D→3D Perception Pipeline** +# **Unified 2D Single-Image → 3D Object Generation Pipeline** -## *vLLM × SAM3 × SAM-3D-Objects Integration* +## *Prompt-Inpaint × AnySplat × SAM-3D-Objects Integration* + +> This repo was originally forked from [xyys2003/sam3d_gs](https://github.com/xyys2003/sam3d_gs). ------ ## **Abstract** -This repository presents a unified and modular pipeline that couples large-scale vision–language reasoning, high-fidelity 2D segmentation, and multi-object 3D Gaussian splatting. It integrates three independent systems—**vLLM** (for Qwen3-VL inference), **SAM3** (for multi-object 2D segmentation), and **SAM-3D-Objects** (for 3D reconstruction from RGB + masks)—into a complete, end-to-end workflow. To ensure reproducibility, each module runs inside its own Conda environment. The pipeline supports both staged execution and a fully automated one-click execution, with built-in HuggingFace authentication, checkpoint management, and environment initialization. +This repository packages a single-image 2D → 3D object reconstruction pipeline by composing three open-source systems behind one entry script: ------- +- **Prompt-Inpaint** — text-prompted multi-object segmentation (built on SAM3) plus background inpainting, producing per-object masks and a clean background image. +- **AnySplat** — feed-forward 3D Gaussian Splatting from a single image, plus a RANSAC-based table-alignment pass that brings the scene into a Mujoco-friendly world frame. +- **SAM-3D-Objects** — per-object mesh and Gaussian reconstruction from RGB + mask. -# **1. Repository Setup** +The three components are wired together through scripts under `pipeline/` and a single uv-managed virtual environment, so the whole pipeline runs from one shell command. -``` -git clone --recursive https://github.com/xyys2003/sam3d_gs.git -cd sam3d_gs -``` +------ -If cloned without submodules: +# **1. Repository Layout** ``` -git submodule update --init --recursive +. 
+├── run_object_generation_pipeline.sh # one-shot entry: image → 3D assets +├── pipeline/ +│ ├── background_reconstruction.py # AnySplat + table RANSAC alignment +│ ├── objects_generation.py # SAM-3D-Objects multi-object reconstruction +│ ├── mesh2mjcf.py # optional: convert per-object .obj → MuJoCo MJCF +│ └── utils.py # shared rendering / IO helpers +└── submodule/ + ├── Prompt-Inpaint/ # SAM3 segmentation + inpainting + ├── AnySplat/ # single-image 3DGS reconstruction + └── Sam-3d-objects/ # per-object mesh / GS reconstruction ``` ------ -# **2. Conda Environments** +# **2. Setup** -| Environment | Purpose | Path | -| --------------- | ---------------------------------------- | ----------------- | -| `vllm` | Serve Qwen3-VL-8B-Thinking via vLLM | — | -| `sam3` | Multi-object segmentation (SAM3) | `sam3/` | -| `sam3d-objects` | RGB + masks → 3D Gaussian reconstruction | `sam-3d-objects/` | +The project runs inside a single `uv`-managed virtual environment (`.venv/`). The setup below targets RTX 50-series GPUs (CUDA 12.8, PyTorch 2.7) and is also verified to work on 3090 / 4090. ------- +> **Hardware**: an NVIDIA GPU with **≥ 24 GB VRAM** is recommended. The pipeline loads SAM3, AnySplat, and SAM-3D-Objects sequentially and the SAM-3D-Objects stage in particular is memory-hungry. -# **3. 
vLLM Environment (Qwen3-VL Server)** +## **2.1 Clone with submodules** -``` -conda create -n vllm python=3.10 -y -conda activate vllm +```bash +git clone --recursive https://github.com/Yuchi-Zhang-00/sam3d_gs.git +cd sam3d_gs ``` -Install PyTorch (CUDA 12.x): +If the submodules were not initialized at clone time: -``` -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \ - --index-url https://download.pytorch.org/whl/cu124 +```bash +git submodule update --init --recursive ``` -Install vLLM: +## **2.2 Install the Python environment** -``` -pip install vllm --extra-index-url https://download.pytorch.org/whl/cu124 -pip install transformers tiktoken sentencepiece xformers flashinfer-python -pip install huggingface_hub +The recommended path is the bundled one-command installer: + +```bash +bash scripts/install_env.sh ``` ------- +It creates `.venv`, installs PyTorch for CUDA 12.8, the submodule dependencies, and the project-level runtime dependencies. -# **4. SAM3 Environment** +If you would rather run each step yourself, see [`install.md`](install.md). It also documents the small SAM-3D-Objects requirements-file patches and the AnySplat `kernels.cu` fix used to build the CUDA RoPE2D kernel. 
-Reference implementation: - 🔗 https://github.com/facebookresearch/sam3 - 🔗 https://huggingface.co/facebook/sam3 +## **2.3 HuggingFace access** -``` -cd sam3 -conda create -n sam3 python=3.10 -y -conda activate sam3 -``` +The pipeline pulls three models from HuggingFace: -Install SAM3: +| Model | Used by | Access | +| --- | --- | --- | +| [`facebook/sam3`](https://huggingface.co/facebook/sam3) | Prompt-Inpaint (Stage 1) | **Gated** — request access on the model page | +| [`facebook/sam-3d-objects`](https://huggingface.co/facebook/sam-3d-objects) | SAM-3D-Objects (Stage 3) | **Gated** — request access on the model page | +| [`lhjiang/anysplat`](https://huggingface.co/lhjiang/anysplat) | AnySplat (Stage 2) | Public (MIT) | -``` -git clone https://github.com/facebookresearch/sam3.git -cd sam3 -pip install -e . +After accepting the agreements on the two gated pages, log in once: + +```bash +hf auth login ``` -Optional: +The two gated models need explicit local placement and are fetched by a +single bootstrap script (run once, after `hf auth login`): +```bash +bash scripts/download_checkpoints.sh ``` -pip install -e ".[notebooks]" -pip install -e ".[train,dev]" -``` + +| Model | Target | +| --- | --- | +| `facebook/sam-3d-objects` | `submodule/Sam-3d-objects/checkpoints/hf/` (Hydra config tree, not fetched by `from_pretrained`) | +| `facebook/sam3` | `submodule/Prompt-Inpaint/checkpoints/sam3.pt` (~3.3 GB; placed locally so it isn't lost when `~/.cache` is cleaned) | + +The script is idempotent and is also invoked automatically by +`run_object_generation_pipeline.sh` on first run. Use `--skip-sam3d`, +`--skip-sam3`, or `--force` to control individual stages. + +`lhjiang/anysplat` is also fetched by the same bootstrap script (into the +standard HuggingFace hub cache at `~/.cache/huggingface/hub/`). It is public +(MIT), so no `hf auth login` is required for this one — pre-fetching just +keeps the first Stage-2 run from doing a multi-GB download. 
Pass +`--skip-anysplat` if you'd rather have AnySplat pull it lazily on first run. ------ -# **5. SAM-3D-Objects Environment** +## **2.4 Docker image (alternative to §2.2)** -Reference implementation: - 🔗 https://github.com/facebookresearch/sam3d - 🔗 https://huggingface.co/facebook/sam-3d-objects +A pre-built image with the full environment (CUDA 12.8 base, the +uv-managed `.venv`, the compiled AnySplat curope CUDA extension, and all +PyPI deps) is published to Aliyun Container Registry: ``` -conda create -n sam_3d_body python=3.10 -y -conda activate sam_3d_body +crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 +crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:latest ``` -Install dependencies (excerpt): +Using the image skips §2.2 entirely; you still need a clone of this repo on +the host (the launcher and the host-side checkpoint directories) and HF +access for the two gated models (§2.3). -``` -pip install pytorch-lightning pyrender opencv-python yacs scikit-image einops timm dill pandas hydra-core ... -``` +### **Prerequisites** -Install Detectron2: +- Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + installed; an NVIDIA GPU with ≥ 24 GB VRAM +- A local clone of this repo (`git clone --recursive ...`, see §2.1) — used + both for the `run_docker.sh` launcher and as the bind-mount root for + checkpoints, data, and outputs +- One-time HuggingFace setup (§2.3) and a host-side run of + `bash scripts/download_checkpoints.sh`. Checkpoints live on the host and + are bind-mounted into the container, so this only runs once. 
-``` -pip install 'git+https://github.com/facebookresearch/detectron2.git@a1ce2f9' \ - --no-build-isolation --no-deps +### **Pull the image** + +```bash +docker pull crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 +docker tag crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 sam3d-gs:latest ``` -Optional: MoGe +The re-tag is optional. `run_docker.sh` defaults to `sam3d-gs:latest`; if +you'd rather not re-tag, prefix the launch with +`SAM3D_IMAGE=crpi-.../sam3d_gs:v0.1` instead. -``` -pip install git+https://github.com/microsoft/MoGe.git +### **Launch the container** + +```bash +./run_docker.sh # uses defaults +./run_docker.sh /path/to/sam3d_gs # explicit project dir +./run_docker.sh /path/to/sam3d_gs /mnt/hf_cache # custom HF cache root +SAM3D_IMAGE=sam3d-gs:v0.1 ./run_docker.sh # pick a specific tag +TORCH_HOME=/mnt/torch_cache ./run_docker.sh # custom torch hub cache ``` ------- +The launcher bind-mounts the relevant host paths into the container: -# **6. Required HuggingFace Access** +| Host path | Container path | Purpose | +| --- | --- | --- | +| `<project>/submodule/Sam-3d-objects/checkpoints` | same | SAM-3D-Objects weights (gated) | +| `<project>/submodule/Prompt-Inpaint/checkpoints` | same | SAM3 weight (gated) | +| `${HF_HOME:-$HOME/.cache/huggingface}` | `/root/.cache/huggingface` | AnySplat + other HF downloads | +| `${TORCH_HOME:-$HOME/.cache/torch}` | `/root/.cache/torch` | `torch.hub` cache (DINOv2 etc.) | +| `<project>/data` | `/opt/sam3d_gs/data` | scratch input/output dir | +| `<project>/example` | `/opt/sam3d_gs/example` | bundled demo input/output | -The pipeline requires access to the following models: +Pipeline outputs land in whichever scene directory you point the launcher +at — since `data/` and `example/` are bind-mounted, those outputs persist +on the host after the container exits. 
-- **SAM3** - 🔗 https://huggingface.co/facebook/sam3 -- **SAM-3D-Objects** - 🔗 https://huggingface.co/facebook/sam-3d-objects +### **Run the pipeline inside the container** -Log in after requesting access: +You land in `/opt/sam3d_gs/`. The image's `PATH` and `PYTHONPATH` already +point at the bundled `.venv`, so you can call `python` and run scripts +directly — **no `source .venv/bin/activate`**. -``` -hf auth login +```bash +# Bundled demo: +bash run_object_generation_pipeline.sh example/example.png + +# Your own image: +bash run_object_generation_pipeline.sh data/my_scene/input_image.png ``` ------- +Stage 1/2/3 each behave exactly as in §3–§4 below. -# **7. Running the Pipeline** +### **What's baked into the image** -Ensure the Conda activation path is correct: +- CUDA 12.8 devel base + Python 3.11 `.venv` with every PyPI dep +- Compiled AnySplat `curope` CUDA extension (sm_80 / 90 / 100 / 120) +- `coacd`, `trimesh`, `mujoco` (so `pipeline/mesh2mjcf.py` works out of the box) +- `sitecustomize.py` patching `torch.hub` to use the local cache without + pinging github first (avoids `RemoteDisconnected` on flaky networks once + the model is in `~/.cache/torch/hub`) +- A global `git insteadOf` rule routing `https://github.com/` through + `https://gh-proxy.com/https://github.com/`, so in-container `git clone` + works on networks where direct github access is unreliable -``` -CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh" -``` +### **What's NOT baked in** + +- The three model checkpoint sets (SAM3, SAM-3D-Objects, AnySplat). They + live on the host and are bind-mounted via the table above. Run + `scripts/download_checkpoints.sh` once on the host. +- Your input data. Drop it into `<project>/data/<scene>/` and reference + it as `data/<scene>/input_image.png` inside the container. 
+ +### **Caveats** + +- **Output files end up owned by `root` on the host.** The container runs + as root, so anything the pipeline writes into a bind-mounted directory + (`data/`, `example/`, the checkpoint dirs, etc.) shows up on the host + with uid 0. Two ways to deal with it: + + ```bash + # After the container exits, fix ownership on the host: + sudo chown -R $(id -u):$(id -g) data/ example/ + + # Or run the container as your host user from the start. + # This avoids the chown step but can break EGL / pyrender setup + # in some Sam-3d-objects code paths, so prefer the chown fix. + # (To try anyway: edit run_docker.sh and add `--user $(id -u):$(id -g)` + # to the `docker run` invocation.) + ``` + +- **The `gh-proxy.com` redirect is for users behind the GFW.** The image + bakes a `git config --global url.<proxy>.insteadOf https://github.com/` + rule so in-container `git clone` of github URLs survives flaky direct + access from mainland China. **Outside mainland China this hop is + unnecessary and may slow things down.** Disable it once per container + start: + + ```bash + git config --global --unset url."https://gh-proxy.com/https://github.com/".insteadOf + ``` + + (Or bake your own image variant with the rule removed if you'd rather + not run that every time.) ------ -## **Stage 1 — Qwen3-VL + SAM3 (2D Mask Generation)** +# **3. Quick Start** -``` -bash run_agent_with_vllm.sh -``` +> If you're using the Docker image (§2.4), start the container first with +> `./run_docker.sh` — every command in this section runs **inside** the +> container exactly as written. -Outputs: +Try the bundled demo image (the entry script activates `.venv` internally, so you don't need to do it yourself): +```bash +bash run_object_generation_pipeline.sh example/example.png ``` -outputs/master_with_vllm/masks/ + +By default, all outputs are written next to the input image (in this case, into `example/`). 
Pass an explicit output directory as the second argument if you want them elsewhere: + +```bash +bash run_object_generation_pipeline.sh example/example.png path/to/scene_dir ``` +The script runs three stages in sequence inside the single `.venv`: + +1. `submodule/Prompt-Inpaint/main.py` — segmentation + inpainting +2. `pipeline/background_reconstruction.py` — AnySplat reconstruction + table alignment +3. `pipeline/objects_generation.py` — per-object mesh + Gaussian export + ------ -## **Stage 2 — SAM-3D-Objects Reconstruction** +# **4. Pipeline Stages** -``` -bash run_sam3d_from_masks.sh +## **Stage 1 — Prompt-Inpaint (SAM3 segmentation + inpainting)** + +```bash +python submodule/Prompt-Inpaint/main.py \ + --resize-output \ + --save-individual-masks \ + --config submodule/Prompt-Inpaint/configs/items.yml \ + --image path/to/input_image.png \ + --output-dir path/to/scene_dir ``` -Outputs: +Outputs (under `scene_dir/`): -``` -sam-3d-objects/outputs/torch_save_pt/ -sam-3d-objects/gaussians/multi/ +- `input_image.png` — resized copy of the input +- `clean_background.png` — inpainted background with all foreground objects removed +- `bg_mask.png` — table / desktop mask used for plane fitting +- `masks/<name>.png` — per-object binary masks + +## **Stage 2 — AnySplat + table-aligned 3D Gaussians** + +```bash +python pipeline/background_reconstruction.py path/to/scene_dir ``` ------- +Behaviour: -## **Optional: One-Click Execution** +- Loads `clean_background.png` (and the matching `input_image.png`) inside each scene folder under the input directory. +- Runs AnySplat to recover camera intrinsics/extrinsics, depth, and a 3DGS reconstruction. +- Fits a RANSAC plane to `bg_mask.png`, derives an OBB via inner PCA, and builds a world-to-table transform. +- Re-emits the splat in a MuJoCo-friendly frame. 
+Useful flags: + +- `--model-id lhjiang/anysplat` — override the AnySplat HuggingFace model id +- `--align-table` / `--no-align-table` — toggle RANSAC table alignment + the `bg_aligned.ply` export (default: enabled). When disabled, only the raw `bg.ply` is written +- `--x-offset`, `--z-offset` — optional placement offsets (m) applied after alignment. Default: 0, so the aligned cloud sits at the origin + +Outputs (under `scene_dir/`): + +- `extrinsic.npy`, `intrinsic.npy` — camera parameters (world-to-camera; pixel-unit intrinsics) +- `depth.npy`, `depth_visual.png` — depth from the splat reconstruction +- `depth_ori.npy`, `depth_ori_visual.png` — depth from the original (non-inpainted) image +- `scale.npy` — scene-level scale factor +- `3d_assets/bg.ply` — raw 3DGS scene from AnySplat +- `3d_assets/bg_aligned.ply` — table-aligned 3DGS scene (only when `--align-table` is on, which is the default) + +## **Stage 3 — SAM-3D-Objects per-object reconstruction** + +```bash +python pipeline/objects_generation.py --input-dir path/to/scene_dir ``` -bash run_pipeline.sh -``` + +Useful flags: + +- `--project-root submodule/Sam-3d-objects` — checkpoint root +- `--tag hf` — checkpoint subdirectory (`submodule/Sam-3d-objects/checkpoints/<tag>/pipeline.yaml`) +- `--seed 42`, `--save-pt`, `--save-intermediate` + +For each mask, the stage runs SAM-3D-Objects inference, recovers the object's local scale by matching projected area + mean depth against the AnySplat depth map, and exports the asset at the origin. + +Outputs (under `scene_dir/3d_assets/`): + +- `<name>.obj` — per-object mesh sized for MuJoCo +- `<name>.ply` — per-object 3D Gaussians sized for MuJoCo +- `<name>_keyframe.npy` — mean XYZ of the final mesh +- (with `--save-intermediate`) debug renderings and the pose-applied versions ------ -# **8. Q&A** +# **5. 
Optional Tools** -## **Q1: Download error “Consistency check failed: file should be XXXX but has size YYYY”?** +## **`pipeline/mesh2mjcf.py` — mesh → MuJoCo MJCF converter** -Cause: corrupted model shards in the HuggingFace cache due to unstable network. +A standalone CLI that turns a single `.obj` or `.stl` mesh into MuJoCo MJCF +assets (a `<name>_dependencies.xml` + `<name>.xml` pair, plus a per-asset +mesh / texture directory). It is **not** wired into +`run_object_generation_pipeline.sh`; use it on demand once Stage 3 has +produced `<scene_dir>/3d_assets/<name>.obj`. -Fix: +By default, the output root is the parent directory of the input mesh, so +running it on `scene_dir/3d_assets/cup.obj` writes a self-contained per-asset +folder right next to the input: ``` -rm -rf sam-3d-objects/checkpoints/hf -rm -rf ~/.cache/huggingface/hub # optional -bash run_sam3d_from_masks.sh +scene_dir/3d_assets/ + cup.obj (original input, untouched) + cup/ (per-asset output folder, named after the obj stem) + cup.obj (copy of the input) + cup.mtl (if multi-material) + <textures> (referenced by the MTL) + part_0.obj part_1.obj ... (if -cd) + mjcf/ + cup.xml + cup_dependencies.xml ``` -Force fresh download: +Mesh paths inside the emitted XMLs are written as `<name>/<file>`, so the +consuming MuJoCo scene should set `meshdir` (and `texturedir`) to the output +root. Pass `-o/--output <dir>` to redirect. 
-``` -force_download=True -``` +### Required libraries + +Fresh installs via `scripts/install_env.sh` already include all three optional +packages (`coacd`, `trimesh`, `mujoco`), so the table below is only for +reference if you skip the bundled installer or build the environment +piecemeal: -## **Note on Coordinate System (PLY Output Orientation)** +| Feature | Library | Manual install | +| --- | --- | --- | +| Multi-material OBJ splitting (automatic when an MTL file is present) | `trimesh` | `uv pip install trimesh` | +| Convex decomposition (`-cd`) | `coacd`, `trimesh` | `uv pip install coacd trimesh` | +| Preview viewer (`--verbose`) | `mujoco` | `uv pip install mujoco` | -The 3D Gaussian `.ply` files exported by **SAM-3D-Objects** are expressed in the **camera coordinate system**, where: +### Usage -- **+Z axis** points **forward** from the camera -- **+X axis** points right -- **+Y axis** points downward (typical computer vision convention) +```bash +# Basic conversion (default colour / mass / inertia) +python pipeline/mesh2mjcf.py path/to/cup.obj -This means the reconstructed objects are aligned using **camera-forward Z-axis** rather than a world coordinate frame. 
+# Custom RGBA, mass, and diagonal inertia +python pipeline/mesh2mjcf.py path/to/cup.obj \ + --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005 -If you want to visualize or place the objects in a global **world coordinate system**, you must apply a **camera-to-world transformation**: -$$ -\mathbf{X}_{world} = \mathbf{R}_{c2w}\ \mathbf{X}_{camera} \ + \ \mathbf{t}_{c2w} -$$ -Where: +# Free-floating body + convex decomposition for accurate collisions +python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint -cd -- $\mathbf{R}_{c2w}$ is the rotation matrix from camera to world -- $\mathbf{t}_{c2w}$ is the translation vector -- $\mathbf{X}_{camera}$ is the Gaussian center in camera coordinates -- $\mathbf{X}_{world}$ is the desired world coordinate position +# Preview in mujoco.viewer after conversion +python pipeline/mesh2mjcf.py path/to/cup.obj --verbose + +# Batch over all per-object meshes in one scene +for obj in scene_dir/3d_assets/*.obj; do + python pipeline/mesh2mjcf.py "$obj" -cd +done +``` -After applying this transformation, the `.ply` will correctly align with your global scene, robotics simulator, or NeRF / COLMAP world frame. ------ -# **Citation** +# **6. FAQ** + +**Q: HuggingFace download fails with “Consistency check failed: file should be XXXX but has size YYYY”.** -### SAM3 +Corrupt shards in the HuggingFace cache. Clear and retry: +```bash +rm -rf submodule/Sam-3d-objects/checkpoints/hf +rm -rf ~/.cache/huggingface/hub # optional, more aggressive +bash run_object_generation_pipeline.sh path/to/input_image.png ``` -@article{kirillov2024sam3, - title={SAM 3: Segment Anything in Images and Videos}, - author={Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others}, - year={2024}, - url={https://github.com/facebookresearch/sam3} -} + +You can also force a fresh download by setting `force_download=True` when invoking the HuggingFace API. 
+ +**Q: AnySplat reports “cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead”.** + +The CUDA extension was not built. Apply the `kernels.cu` patch documented in [`install.md`](install.md) and run `python setup.py build_ext --inplace`. + +**Q: `ImportError: cannot import name 'cached_download' from 'huggingface_hub'` during Stage 1 (Prompt-Inpaint / iopaint).** + +`huggingface_hub` ≥ 0.26 removed `cached_download`, but `diffusers` 0.27.x (which is what `iopaint` pulls in) still imports it. Downgrade `huggingface_hub` to 0.25.2: + +```bash +source .venv/bin/activate +uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \ + "huggingface_hub==0.25.2" ``` -### SAM-3D-Objects +Fresh installs via `scripts/install_env.sh` already include this pin. + +**Q: `ImportError: cannot import name 'is_offline_mode' from 'huggingface_hub'` during Stage 1.** +Same symptom from the other direction: `transformers` 5.x imports `is_offline_mode` from `huggingface_hub`, which doesn't exist in 0.25.2. Pin transformers to 4.48.3: + +```bash +source .venv/bin/activate +uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \ + "transformers==4.48.3" ``` + +Fresh installs via `scripts/install_env.sh` already include this pin. 
+ +------ + +# **Citations** + +```bibtex +@article{kirillov2024sam3, + title = {SAM 3: Segment Anything in Images and Videos}, + author = {Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others}, + year = {2024}, + url = {https://github.com/facebookresearch/sam3} +} + @article{wu2024sam3dobjects, - title={SAM-3D-Objects: Segment Anything in 3D Using 2D Masks}, - author={Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others}, - year={2024}, - url={https://github.com/facebookresearch/sam3d} + title = {SAM-3D-Objects: Segment Anything in 3D Using 2D Masks}, + author = {Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others}, + year = {2024}, + url = {https://github.com/facebookresearch/sam-3d-objects} +} + +@article{jiang2024anysplat, + title = {AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views}, + author = {Jiang, Lihan and others}, + year = {2024}, + url = {https://github.com/OpenRobotLab/AnySplat} } ``` @@ -264,11 +485,9 @@ After applying this transformation, the `.ply` will correctly align with your gl This project is built upon and integrates: -- **SAM3** - GitHub: https://github.com/facebookresearch/sam3 - HuggingFace: https://huggingface.co/facebook/sam3 -- **SAM-3D-Objects** - GitHub: https://github.com/facebookresearch/sam3d - HuggingFace: https://huggingface.co/facebook/sam-3d-objects +- **SAM3** — [GitHub](https://github.com/facebookresearch/sam3) · [HuggingFace](https://huggingface.co/facebook/sam3) +- **SAM-3D-Objects** — [GitHub](https://github.com/facebookresearch/sam-3d-objects) · [HuggingFace](https://huggingface.co/facebook/sam-3d-objects) +- **AnySplat** — [HuggingFace](https://huggingface.co/lhjiang/anysplat) +- **Prompt-Inpaint** — [GitHub](https://github.com/MrZoyo/Prompt-Inpaint) -We sincerely thank the authors for making their research and implementations publicly available. \ No newline at end of file +We thank the authors for making their research and implementations publicly available. 
diff --git a/README_zh.md b/README_zh.md index 5ab1418..0a1d1a6 100644 --- a/README_zh.md +++ b/README_zh.md @@ -4,327 +4,481 @@

-# **统一的多阶段 2D→3D 感知流水线** +# **2D 单图 → 3D 物体生成流水线** -## *vLLM × SAM3 × SAM-3D-Objects 集成* +## *Prompt-Inpaint × AnySplat × SAM-3D-Objects 集成* + +> 本仓库最初 fork 自 [xyys2003/sam3d_gs](https://github.com/xyys2003/sam3d_gs)。 ------ ## **摘要** -本仓库构建了一个完整的 2D → 3D 感知流水线,将 **大模型视觉理解、2D 多物体分割、3D Gaussian Splatting 重建** 三者进行统一整合。流水线由: +本仓库将三个开源系统串联进单条流水线,使用一条命令即可完成单图 → 多物体 3D 资产的生成: -- **vLLM**:提供 Qwen3-VL-8B-Thinking 视觉语言大模型推理 -- **SAM3**:执行高质量多物体 2D 分割 -- **SAM-3D-Objects**:将 RGB + mask 提升为 3D 高斯点(Gaussian Splat) +- **Prompt-Inpaint**:基于 SAM3 的文本提示多物体分割 + 背景补全,产出有每个物体的 mask 与 clean background。 +- **AnySplat**:单图前馈式 3D Gaussian Splatting 重建;额外的 RANSAC 桌面对齐将场景对齐到坐标系原点。 +- **SAM-3D-Objects**:以 RGB + mask 为输入,重建单物体的 mesh 与 Gaussian。 -为确保可复现性,每个模块均独立运行在各自的 Conda 环境中。系统支持 **分阶段执行**(先 2D 分割、再 3D 重建),也支持 **一键式全流程运行**。 +三者通过 `pipeline/` 下的脚本以及一个由 `uv` 管理的单一虚拟环境串联起来,整条流水线由一个 shell 命令驱动。 ------ -# **1. 仓库克隆** +# **1. 仓库结构** ``` -git clone --recursive https://github.com/xyys2003/sam3d_gs.git -cd sam3d_gs +. +├── run_object_generation_pipeline.sh # 主入口:单图 → 3D 资产 +├── pipeline/ +│ ├── background_reconstruction.py # AnySplat + 桌面 RANSAC 对齐 +│ ├── objects_generation.py # SAM-3D-Objects 多物体重建 +│ ├── mesh2mjcf.py # 可选:把单物体 .obj 转成 MuJoCo MJCF +│ └── utils.py # 渲染 / IO 公共工具 +└── submodule/ + ├── Prompt-Inpaint/ # SAM3 分割 + 背景补全 + ├── AnySplat/ # 单图 3DGS 重建 + └── Sam-3d-objects/ # 单物体 mesh / GS 重建 ``` -如果你忘记使用 `--recursive` 克隆,可运行: +------ + +# **2. 环境安装** + +整个项目运行在单个由 `uv` 管理的虚拟环境 `.venv/` 中。下面的步骤面向 RTX 50 系 GPU(CUDA 12.8,PyTorch 2.7),同样在 3090 / 4090 上验证通过。 + +> **硬件**:推荐使用 **显存 ≥ 24 GB** 的 NVIDIA GPU。流水线会依次加载 SAM3、AnySplat、SAM-3D-Objects,其中 SAM-3D-Objects 阶段对显存最敏感。 +## **2.1 克隆仓库(含子模块)** + +```bash +git clone --recursive https://github.com/Yuchi-Zhang-00/sam3d_gs.git +cd sam3d_gs ``` + +如果克隆时忘了 `--recursive`: + +```bash git submodule update --init --recursive ``` ------- +## **2.2 安装 Python 环境** -# **2. 
Conda 环境说明** +推荐使用一键安装脚本: -本项目使用三个互相隔离的 Conda 环境,以避免依赖冲突。 +```bash +bash scripts/install_env.sh +``` -| 环境名称 | 功能用途 | 路径 | -| --------------- | ---------------------------------- | ----------------- | -| `vllm` | 运行 Qwen3-VL-8B-Thinking 推理服务 | — | -| `sam3` | 运行 SAM3 完成 2D 多物体分割 | `sam3/` | -| `sam3d-objects` | 从 RGB + Mask 生成 3D Gaussian | `sam-3d-objects/` | +脚本会创建 `.venv`、安装 CUDA 12.8 版 PyTorch、子模块依赖以及项目级运行时依赖。 ------- +如果想手动一步步执行,请查阅 [`install.md`](install.md)。该文档同时记录了 SAM-3D-Objects 的几处 requirements 文件 patch 和编译 AnySplat CUDA RoPE2D 内核所需的 `kernels.cu` 修改。 -# **3. vLLM 环境(Qwen3-VL 服务器)** +## **2.3 HuggingFace 权限申请** -### **3.1 创建环境** +流水线依赖以下三个 HuggingFace 模型: -``` -conda create -n vllm python=3.10 -y -conda activate vllm -``` +| 模型 | 使用方 | 访问 | +| --- | --- | --- | +| [`facebook/sam3`](https://huggingface.co/facebook/sam3) | Prompt-Inpaint(Stage 1) | **gated**,需在模型页面申请权限 | +| [`facebook/sam-3d-objects`](https://huggingface.co/facebook/sam-3d-objects) | SAM-3D-Objects(Stage 3) | **gated**,需在模型页面申请权限 | +| [`lhjiang/anysplat`](https://huggingface.co/lhjiang/anysplat) | AnySplat(Stage 2) | 公开(MIT) | -### **3.2 安装 PyTorch(CUDA 12.x)** +在两个 gated 模型页面接受协议后,登录一次: -``` -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \ - --index-url https://download.pytorch.org/whl/cu124 +```bash +hf auth login ``` -### **3.3 安装 vLLM 与相关依赖** +两个 gated 模型需要显式放置到本地,由一个 bootstrap 脚本一次性处理(登录后 +跑一次即可): -``` -pip install vllm --extra-index-url https://download.pytorch.org/whl/cu124 -pip install transformers tiktoken sentencepiece xformers flashinfer-python -pip install huggingface_hub +```bash +bash scripts/download_checkpoints.sh ``` -此配置已验证可稳定运行 **Qwen3-VL-8B-Thinking**。 +| 模型 | 落地位置 | +| --- | --- | +| `facebook/sam-3d-objects` | `submodule/Sam-3d-objects/checkpoints/hf/`(Hydra 配置树,不会被 `from_pretrained` 拉取) | +| `facebook/sam3` | `submodule/Prompt-Inpaint/checkpoints/sam3.pt`(约 3.3 GB;放到本地以免 `~/.cache` 清理后丢失) | ------- +该脚本是幂等的,且 `run_object_generation_pipeline.sh` 
在首次运行时也会 +自动调用它。可以通过 `--skip-sam3d`、`--skip-sam3` 或 `--force` 单独控制每 +一个 stage。 -# **4. SAM3 环境** +`lhjiang/anysplat` 也由同一个 bootstrap 脚本拉取(落到标准的 HuggingFace +hub 缓存 `~/.cache/huggingface/hub/` 下)。它是公开模型(MIT),**不需要 +`hf auth login`**;提前拉只是避免 Stage 2 首次运行时做几 GB 的下载。 +传 `--skip-anysplat` 可以跳过这一步、让 AnySplat 首次运行时再 lazy 下载。 + +------ -官方实现: - 🔗 https://github.com/facebookresearch/sam3 - 🔗 https://huggingface.co/facebook/sam3 +## **2.4 Docker 镜像(2.1–2.3 的替代方案)** -### **4.1 创建环境** +仓库提供了一份预构建镜像,包含完整环境(CUDA 12.8 基础镜像、uv 管理的 +`.venv`、编译好的 AnySplat curope CUDA 扩展、所有 PyPI 依赖),已发布到 +阿里云容器镜像服务: ``` -cd sam3 -conda create -n sam3 python=3.10 -y -conda activate sam3 +crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 +crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:latest ``` -### **4.2 安装 PyTorch(CUDA 12.x)** +用镜像可以完全跳过 §2.2;但宿主机仍然需要克隆本仓库(用于 +`run_docker.sh` 启动脚本和 checkpoint 的 bind-mount 目录),以及完成 +§2.3 的 HuggingFace 权限申请。 -``` -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \ - --index-url https://download.pytorch.org/whl/cu124 -``` +### **前置条件** -### **4.3 克隆并安装 SAM3** +- 已安装 Docker 和 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html); + 显存 ≥ 24 GB 的 NVIDIA GPU +- 宿主机上已经 clone 了本仓库(`git clone --recursive ...`,见 §2.1)—— + 用作 `run_docker.sh` 启动脚本所在位置,以及 checkpoint / 数据 / 输出的 + bind-mount 根目录 +- 完成 §2.3 的一次性 HuggingFace 设置,并在宿主机执行过 + `bash scripts/download_checkpoints.sh`。Checkpoint 留在宿主机、通过 + bind-mount 进容器,所以只需要下载一次。 -``` -git clone https://github.com/facebookresearch/sam3.git -cd sam3 -pip install -e . 
+### **拉取镜像** + +```bash +docker pull crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 +docker tag crpi-3nfi31esiwp28zns.cn-hangzhou.personal.cr.aliyuncs.com/open_projects_yuchi/sam3d_gs:v0.1 sam3d-gs:latest ``` -### **4.4 可选依赖(用于 Notebook 或训练)** +`docker tag` 这一步可选。`run_docker.sh` 默认使用 `sam3d-gs:latest`; +如果不想重 tag,可以在启动时加前缀 +`SAM3D_IMAGE=crpi-.../sam3d_gs:v0.1`。 -``` -pip install -e ".[notebooks]" -pip install -e ".[train,dev]" +### **启动容器** + +```bash +./run_docker.sh # 全默认(推荐) +./run_docker.sh /path/to/sam3d_gs # 显式传项目目录 +./run_docker.sh /path/to/sam3d_gs /mnt/hf_cache # 自定义 HF 缓存根 +SAM3D_IMAGE=sam3d-gs:v0.1 ./run_docker.sh # 指定镜像 tag +TORCH_HOME=/mnt/torch_cache ./run_docker.sh # 自定义 torch hub 缓存 ``` ------- +启动脚本会把宿主机的关键路径 bind-mount 进容器: -# **5. SAM-3D-Objects 环境** +| 宿主机路径 | 容器路径 | 用途 | +| --- | --- | --- | +| `/submodule/Sam-3d-objects/checkpoints` | 同名 | SAM-3D-Objects 权重(gated) | +| `/submodule/Prompt-Inpaint/checkpoints` | 同名 | SAM3 权重(gated) | +| `${HF_HOME:-$HOME/.cache/huggingface}` | `/root/.cache/huggingface` | AnySplat + 其它 HF 下载 | +| `${TORCH_HOME:-$HOME/.cache/torch}` | `/root/.cache/torch` | `torch.hub` 缓存(DINOv2 等) | +| `/data` | `/opt/sam3d_gs/data` | 输入 / 输出工作目录 | +| `/example` | `/opt/sam3d_gs/example` | 自带示例输入 / 输出 | -官方实现: - 🔗 https://github.com/facebookresearch/sam3d - 🔗 https://huggingface.co/facebook/sam-3d-objects +流水线的产物会写到你指定的 scene 目录里。因为 `data/` 和 `example/` +都是 bind-mount,容器退出后这些产物会留在宿主机上。 -### **5.1 创建环境** +### **在容器内运行流水线** -``` -conda create -n sam_3d_body python=3.10 -y -conda activate sam_3d_body -``` +进入容器后你会落到 `/opt/sam3d_gs/`。镜像里 `PATH` 和 `PYTHONPATH` +已经指向自带的 `.venv`,可以直接调用 `python` 和脚本,**不需要 +`source .venv/bin/activate`**。 -### **5.2 安装 PyTorch(CUDA 12.x)** +```bash +# 自带示例: +bash run_object_generation_pipeline.sh example/example.png -``` -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \ - --index-url https://download.pytorch.org/whl/cu124 +# 自己的图: +bash 
run_object_generation_pipeline.sh data/my_scene/input_image.png ``` -### **5.3 安装其他 Python 依赖** +Stage 1 / 2 / 3 的行为和下面 §3–§4 完全一致。 -``` -pip install pytorch-lightning pyrender opencv-python yacs scikit-image einops timm dill pandas rich \ - hydra-core hydra-submitit-launcher hydra-colorlog pyrootutils webdataset chump networkx==3.2.1 \ - roma joblib seaborn wandb appdirs appnope ffmpeg cython jsonlines pytest xtcocotools loguru \ - optree fvcore black pycocotools tensorboard huggingface_hub -``` +### **镜像里包含什么** -### **5.4 安装 Detectron2(SAM3D 依赖)** +- CUDA 12.8 devel 基础镜像 + Python 3.11 `.venv`,所有 PyPI 依赖 +- 已编译好的 AnySplat `curope` CUDA 扩展(sm_80 / 90 / 100 / 120) +- `coacd`、`trimesh`、`mujoco`(`pipeline/mesh2mjcf.py` 开箱可用) +- 一个 `sitecustomize.py`,monkey-patch `torch.hub`,使其在本地缓存 + 存在时跳过 github 的 branch ping —— 这样网络不稳时也不会再触发 + `RemoteDisconnected`(前提是 `~/.cache/torch/hub` 已有相应模型) +- 全局的 `git insteadOf` 规则,把 `https://github.com/` 重写到 + `https://gh-proxy.com/https://github.com/`,让容器内的 + `git clone` 在 github 不稳的网络上也能工作 -``` -pip install 'git+https://github.com/facebookresearch/detectron2.git@a1ce2f9' \ - --no-build-isolation --no-deps -``` +### **镜像里不包含什么** -### **5.5 可选安装:MoGe** - -``` -pip install git+https://github.com/microsoft/MoGe.git -``` +- 三套模型 checkpoint(SAM3 / SAM-3D-Objects / AnySplat)。它们留在 + 宿主机上、通过上面的 bind-mount 进容器。在宿主机执行一次 + `scripts/download_checkpoints.sh` 即可。 +- 你自己的输入数据。放到 `/data//` 下,容器里通过 + `data//input_image.png` 引用。 ------- +### **使用须知** -# **6. 
HuggingFace 权限申请** +- **流水线写出的文件在宿主机上属主是 `root`**。容器内是 root 用户跑的, + 所以写进 bind-mount 目录(`data/`、`example/`、checkpoint 目录等) + 的文件,在宿主机上看到的所有者是 uid 0。两种处理方式: -本项目依赖两个需要授权的模型: + ```bash + # 容器退出后,在宿主机改回当前用户: + sudo chown -R $(id -u):$(id -g) data/ example/ -- **SAM3** - 🔗 https://huggingface.co/facebook/sam3 -- **SAM-3D-Objects** - 🔗 https://huggingface.co/facebook/sam-3d-objects + # 或者从一开始就让容器用宿主机的 uid 跑。 + # 优点是不用 chown,缺点是 Sam-3d-objects 里某些 EGL / pyrender + # 代码路径在非 root 下可能跑不通,所以一般建议用上面的 chown 方案。 + # (想试的话: 编辑 run_docker.sh,给 docker run 加上 + # `--user $(id -u):$(id -g)`) + ``` -请在 HuggingFace 对应页面申请权限,并登录: +- **`gh-proxy.com` 这个重写是给国内用户准备的**。镜像里烤了一条 + `git config --global url..insteadOf https://github.com/` 规则, + 让容器里 `git clone` github 仓库在 GFW 网络下也能成功。**在境外网络 + 环境下这个跳转是多余的,可能反而拖慢速度**。每次进容器后执行一次即可 + 禁用: -``` -hf auth login -``` + ```bash + git config --global --unset url."https://gh-proxy.com/https://github.com/".insteadOf + ``` -脚本会自动使用你的 Token。 + (或者自己 commit 一个去掉这条规则的镜像变体,免得每次都跑。) ------ -# **7. 运行流程** +# **3. 快速开始** -运行脚本前,请设置你的 Conda 激活脚本路径: +> 如果你用的是 Docker 镜像(§2.4),先跑 `./run_docker.sh` 进容器; +> 本节后面所有命令都在**容器内**原样执行。 +先用仓库自带的示例图跑一遍即可(入口脚本会自动 `source .venv`,无需手动激活环境): + +```bash +bash run_object_generation_pipeline.sh example/example.png ``` -CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh" + +默认所有产物会写到输入图像所在目录(此例中即 `example/`)。若想显式指定输出目录,可以传第二个参数: + +```bash +bash run_object_generation_pipeline.sh example/example.png path/to/scene_dir ``` +脚本会在同一个 `.venv` 中按顺序执行三个 stage: + +1. `submodule/Prompt-Inpaint/main.py` — 分割 + 背景补全 +2. `pipeline/background_reconstruction.py` — AnySplat 重建 + 桌面对齐 +3. `pipeline/objects_generation.py` — 单物体 mesh / Gaussian 导出 + ------ -## **阶段 1:Qwen3-VL + SAM3 生成 2D Mask** +# **4. 
各 Stage 详解** -执行: +## **Stage 1 — Prompt-Inpaint(SAM3 分割 + 背景补全)** -``` -bash run_agent_with_vllm.sh +```bash +python submodule/Prompt-Inpaint/main.py \ + --resize-output \ + --save-individual-masks \ + --config submodule/Prompt-Inpaint/configs/items.yml \ + --image path/to/input_image.png \ + --output-dir path/to/scene_dir ``` -此脚本会: +输出(位于 `scene_dir/`): -1. 激活 `vllm` 环境 -2. 启动 vLLM 服务,加载 Qwen3-VL -3. 激活 `sam3` 环境 -4. 运行 `pipeline/run_sam3_agent_full.py` -5. 生成多物体 mask +- `input_image.png` — 输入图像的 resize 副本 +- `clean_background.png` — 去除所有前景物体后的补全背景 +- `bg_mask.png` — 用于平面拟合的桌面 mask +- `masks/<物体名>.png` — 每个物体的二值 mask -输出目录: +## **Stage 2 — AnySplat + 桌面对齐 3DGS** -``` -outputs/master_with_vllm/masks/ +```bash +python pipeline/background_reconstruction.py path/to/scene_dir ``` ------- +行为: -## **阶段 2:SAM-3D-Objects 重建 3D Gaussian** +- 递归读取输入目录下每个场景文件夹中的 `clean_background.png` 和配套的 `input_image.png`。 +- 运行 AnySplat 恢复相机内外参、深度、3DGS 重建结果。 +- 对 `bg_mask.png` 做 RANSAC 平面拟合,结合内部 PCA 得到 OBB,构建 world → table 变换。 +- 输出 Mujoco 坐标系下的对齐点云。 -执行: +常用参数: -``` -bash run_sam3d_from_masks.sh -``` +- `--model-id lhjiang/anysplat` — 覆盖 AnySplat 的 HuggingFace 模型 id +- `--align-table` / `--no-align-table` — 是否启用 RANSAC 桌面对齐并导出 `bg_aligned.ply`(默认启用)。关闭时只导出原始 `bg.ply` +- `--x-offset`、`--z-offset` — 对齐后可选的放置偏移(米)。默认 0,对齐后的点云落在原点 -此脚本会: +输出(位于 `scene_dir/`): -1. 激活 `sam3d-objects` 环境 -2. 确保 SAM-3D-Objects 的 checkpoint 下载完成 -3. 加载 RGB + masks -4. 生成每个物体的 `.pt` 文件 -5. 
重建并导出 3D Gaussian (`.ply`, `.gif`) +- `extrinsic.npy`、`intrinsic.npy` — 相机参数(world-to-camera;像素单位内参) +- `depth.npy`、`depth_visual.png` — 来自 splat 重建的深度 +- `depth_ori.npy`、`depth_ori_visual.png` — 来自原始(未补全)图像的深度 +- `scale.npy` — 场景级缩放因子 +- `3d_assets/bg.ply` — AnySplat 输出的原始 3DGS 场景 +- `3d_assets/bg_aligned.ply` — 桌面对齐后的 3DGS 场景(仅当 `--align-table` 启用时输出,默认启用) -输出目录: +## **Stage 3 — SAM-3D-Objects 单物体重建** -``` -sam-3d-objects/outputs/torch_save_pt/ -sam-3d-objects/gaussians/multi/ +```bash +python pipeline/objects_generation.py --input-dir path/to/scene_dir ``` ------- +常用参数: -## **可选:一键式全流程执行** +- `--project-root submodule/Sam-3d-objects` — checkpoint 根目录 +- `--tag hf` — checkpoint 子目录(`submodule/Sam-3d-objects/checkpoints//pipeline.yaml`) +- `--seed 42`、`--save-pt`、`--save-intermediate` -``` -bash run_pipeline.sh -``` +针对每一个 mask,该 stage 运行 SAM-3D-Objects 推理,通过对比投影面积与平均深度恢复物体局部尺寸,并把资产以原点姿态导出。 -该脚本会自动完成阶段 1 + 阶段 2。 +输出(位于 `scene_dir/3d_assets/`): + +- `<物体名>.obj` — Mujoco 单位的物体 mesh +- `<物体名>.ply` — Mujoco 单位的物体 3D Gaussian +- `<物体名>_keyframe.npy` — 最终 mesh 的平均 XYZ +- 当传入 `--save-intermediate` 时,额外导出调试用的渲染和带姿态的中间产物 ------ -# **Q&A** +# **5. 可选工具** + +## **`pipeline/mesh2mjcf.py` — mesh → MuJoCo MJCF 转换器** -## **Q1:下载模型时报 “Consistency check failed”?** +一个独立的命令行工具,把单个 `.obj` 或 `.stl` 文件转成 MuJoCo MJCF 资产 +(`_dependencies.xml` + `.xml` 两个 XML,以及一个 per-asset 的 +mesh / texture 目录)。它**没有**被串进 +`run_object_generation_pipeline.sh`;当 Stage 3 产出 +`/3d_assets/.obj` 之后按需调用即可。 -**原因:** 下载中断导致 HuggingFace 缓存中出现损坏的模型分片。 - **解决:删除损坏缓存并重新下载。** +默认输出根目录是输入 mesh 的父目录,所以对 +`scene_dir/3d_assets/cup.obj` 运行后会在输入旁边生成一个 per-asset 目录: ``` -rm -rf sam-3d-objects/checkpoints/hf -rm -rf ~/.cache/huggingface/hub # 可选 -bash run_sam3d_from_masks.sh +scene_dir/3d_assets/ + cup.obj (原输入,不变) + cup/ (以 obj 名命名的 per-asset 输出目录) + cup.obj (输入的拷贝) + cup.mtl (若多材质) + <纹理文件> (MTL 引用的贴图) + part_0.obj part_1.obj ... 
(若 -cd) + mjcf/ + cup.xml + cup_dependencies.xml ``` -若要强制重新下载,可使用: +emitted XML 中的 mesh 路径写作 `/`,所以消费方的 MuJoCo +scene 需要把 `meshdir`(和 `texturedir`)设为输出根目录。通过 +`-o/--output ` 可以重定向。 -``` -force_download=True -``` +### 所需依赖 + +走 `scripts/install_env.sh` 装环境的话,`coacd`、`trimesh`、`mujoco` 三个包 +默认就装好了。下表只在你跳过一键脚本、想手动按需装时作为参考: -## **关于坐标系说明(PLY 输出方向)** +| 功能 | 依赖库 | 手动安装命令 | +| --- | --- | --- | +| 多材质 OBJ 自动拆分(当存在 MTL 文件时触发) | `trimesh` | `uv pip install trimesh` | +| 凸分解(`-cd`) | `coacd`、`trimesh` | `uv pip install coacd trimesh` | +| 预览查看器(`--verbose`) | `mujoco` | `uv pip install mujoco` | -通过 **SAM-3D-Objects** 导出的 3D Gaussian `.ply` 文件默认处于 **相机坐标系** 下,其中: +### 用法 -- **+Z 轴** 为相机前向 -- **+X 轴** 指向右侧 -- **+Y 轴** 指向下方(典型计算机视觉坐标系) +```bash +# 基本用法(使用默认颜色 / 质量 / 惯性) +python pipeline/mesh2mjcf.py path/to/cup.obj -因此,重建的对象是以 **相机前向 Z 轴** 对齐的,而不是世界坐标系。 +# 自定义 RGBA、质量、对角惯性 +python pipeline/mesh2mjcf.py path/to/cup.obj \ + --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005 -如果需要将 `.ply` 放置到全局 **世界坐标系** 中(例如仿真器、机器人场景、NeRF / COLMAP world frame),必须执行一次 **相机 → 世界坐标系转换**: -$$ -\mathbf{X}_{world} = \mathbf{R}_{c2w}\ \mathbf{X}_{camera} \ + \ \mathbf{t}_{c2w} -$$ -其中: +# 自由关节 + 凸分解,得到更精确的碰撞几何 +python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint -cd -- $\mathbf{R}_{c2w}$:相机到世界的旋转矩阵 -- $\mathbf{t}_{c2w}$:相机到世界的平移向量 -- $\mathbf{X}_{camera}$:高斯中心的相机系坐标 -- $\mathbf{X}_{world}$:转换后的世界系坐标 +# 在 mujoco.viewer 中预览 +python pipeline/mesh2mjcf.py path/to/cup.obj --verbose + +# 一键批量转换某个场景下所有物体 +for obj in scene_dir/3d_assets/*.obj; do + python pipeline/mesh2mjcf.py "$obj" -cd +done +``` -完成转换后,你即可将 `.ply` 与全局场景或机器人环境正确对齐。 ------ -# **引用(Citation)** +# **6. 
常见问题** -### **SAM3** +**Q:HuggingFace 下载报 "Consistency check failed: file should be XXXX but has size YYYY"。** +HuggingFace 缓存中的 shard 损坏。清理后重试: + +```bash +rm -rf submodule/Sam-3d-objects/checkpoints/hf +rm -rf ~/.cache/huggingface/hub # 可选,更激进 +bash run_object_generation_pipeline.sh path/to/input_image.png ``` -@article{kirillov2024sam3, - title={SAM 3: Segment Anything in Images and Videos}, - author={Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others}, - year={2024}, - url={https://github.com/facebookresearch/sam3} -} + +也可以在调用 HuggingFace API 时通过 `force_download=True` 强制重新下载。 + +**Q:AnySplat 提示 "cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead"。** + +CUDA 扩展没编译。请按 [`install.md`](install.md) 里的说明修改 `kernels.cu`,再执行 `python setup.py build_ext --inplace`。 + +**Q:Stage 1 (Prompt-Inpaint / iopaint) 报 `ImportError: cannot import name 'cached_download' from 'huggingface_hub'`。** + +`huggingface_hub` ≥ 0.26 把 `cached_download` 删掉了,但 `iopaint` 依赖的 `diffusers` 0.27.x 还在 import 它。把 `huggingface_hub` 锁到 0.25.2: + +```bash +source .venv/bin/activate +uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \ + "huggingface_hub==0.25.2" ``` -### **SAM-3D-Objects** +新走 `scripts/install_env.sh` 的环境已经带上这个 pin。 + +**Q:Stage 1 报 `ImportError: cannot import name 'is_offline_mode' from 'huggingface_hub'`。** + +同一根问题的另一侧:`transformers` 5.x 会 import `huggingface_hub.is_offline_mode`,而 0.25.2 没有这个符号。把 transformers 锁到 4.48.3: +```bash +source .venv/bin/activate +uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \ + "transformers==4.48.3" ``` + +新走 `scripts/install_env.sh` 的环境已经带上这个 pin。 + +------ + +# **引用** + +```bibtex +@article{kirillov2024sam3, + title = {SAM 3: Segment Anything in Images and Videos}, + author = {Kirillov, Alexander and Ravi, Nikhila and Mao, Weiyao and others}, + year = {2024}, + url = {https://github.com/facebookresearch/sam3} +} + @article{wu2024sam3dobjects, - 
title={SAM-3D-Objects: Segment Anything in 3D Using 2D Masks}, - author={Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others}, - year={2024}, - url={https://github.com/facebookresearch/sam3d} + title = {SAM-3D-Objects: Segment Anything in 3D Using 2D Masks}, + author = {Wu, Yu and Mao, Weiyao and Kirillov, Alexander and others}, + year = {2024}, + url = {https://github.com/facebookresearch/sam-3d-objects} +} + +@article{jiang2024anysplat, + title = {AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views}, + author = {Jiang, Lihan and others}, + year = {2024}, + url = {https://github.com/OpenRobotLab/AnySplat} } ``` ------ -# **致谢(Acknowledgements)** +# **致谢** -本项目基于以下官方实现构建: +本项目基于并整合了以下工作: -- **SAM3** - GitHub: https://github.com/facebookresearch/sam3 - HuggingFace: https://huggingface.co/facebook/sam3 -- **SAM-3D-Objects** - GitHub: https://github.com/facebookresearch/sam3d - HuggingFace: https://huggingface.co/facebook/sam-3d-objects +- **SAM3** — [GitHub](https://github.com/facebookresearch/sam3) · [HuggingFace](https://huggingface.co/facebook/sam3) +- **SAM-3D-Objects** — [GitHub](https://github.com/facebookresearch/sam-3d-objects) · [HuggingFace](https://huggingface.co/facebook/sam-3d-objects) +- **AnySplat** — [HuggingFace](https://huggingface.co/lhjiang/anysplat) +- **Prompt-Inpaint** — [GitHub](https://github.com/MrZoyo/Prompt-Inpaint) -感谢原作者开放其卓越的研究成果与代码,使本流水线得以实现。 \ No newline at end of file +感谢原作者开放其研究成果与代码。 diff --git a/example/example.png b/example/example.png new file mode 100644 index 0000000..9caff67 Binary files /dev/null and b/example/example.png differ diff --git a/install.md b/install.md new file mode 100644 index 0000000..fa01a9c --- /dev/null +++ b/install.md @@ -0,0 +1,141 @@ +# Install on RTX 50-series GPUs (torch 2.7.0 + cu128, also works on 3090,4090) + +> **Don't want to build the environment locally?** A pre-built Docker +> image is published; see [README §2.4 "Docker image"](README.md#24-docker-image-alternative-to-2123) +> 
for the pull / launch flow. This document is only the native-install +> reference. + +One-command installer: + +``` +bash scripts/install_env.sh +``` + +This document is the manual step-by-step installation reference. Use it if you want to inspect or run each installation step yourself. + + +# Run the installation commands below + +``` +git submodule update --init --recursive + +uv venv --python 3.11 + +source .venv/bin/activate + +export PYTHONPATH="$(pwd)/submodule/Sam-3d-objects/notebook:$(pwd)/submodule/Sam-3d-objects:${PYTHONPATH:-}" + +uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128 + +# uv pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128 + +uv pip install -r submodule/AnySplat/requirements.txt --no-build-isolation + +export PIP_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html" + +uv pip install hatch-requirements-txt editables wheel + +uv pip install -e './submodule/Sam-3d-objects[dev]' + +uv pip install -e './submodule/Sam-3d-objects[p3d]' --no-build-isolation + +uv pip install -e "./submodule/Sam-3d-objects[inference]" --no-build-isolation --find-links https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html + +# Note: do NOT pass -U here. With -U, uv would upgrade transitive deps such +# as torch (via iopaint) and clobber the CUDA-pinned torch above. +# Also note transformers is pinned to ==4.48.3 (not >=): transformers 5.x +# imports `is_offline_mode` from huggingface_hub, which doesn't exist in +# 0.25.2, and would crash iopaint even with hub pinned below. 
+uv pip install --index-strategy unsafe-best-match \ + "transformers==4.48.3" \ + "iopaint>=1.2.0" \ + "diffusers>=0.27.2" \ + "numpy<2.0" \ + "opencv-python>=4.8.0" \ + "pyyaml>=6.0" \ + "requests>=2.31.0" \ + "tqdm>=4.66.0" \ + "setuptools" \ + "einops" + +# Pin huggingface_hub last, with --force-reinstall --no-deps so it can be +# downgraded past other packages' transitive `>=0.26` constraints. +# Reason: diffusers 0.27.2 (and the iopaint stack on top) still import +# `cached_download` from huggingface_hub, which was removed in hub >=0.26. +uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \ + "huggingface_hub==0.25.2" + +uv pip install --index-strategy unsafe-best-match "git+https://github.com/facebookresearch/sam3.git" +``` + +## SAM3 model access + +`facebook/sam3` is a gated model on HuggingFace. Request access on the model page first, then log in: +``` +huggingface-cli login +``` + + +## Fix the AnySplat warning: `Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead` +``` +cd submodule/AnySplat/src/model/encoder/backbone/croco/curope/ +``` +In `kernels.cu`, change: + +``` +AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] { +``` + +to: + +``` +AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] { +``` + +Then run: +``` +python setup.py build_ext --inplace +``` + + +## Extra dependencies for `pipeline/mesh2mjcf.py` + +`scripts/install_env.sh` already installs `coacd`, `trimesh`, and `mujoco` by +default, so `mesh2mjcf.py` works out of the box (including `-cd` and +`--verbose`). 
The commands below are only useful if you build the environment +piecemeal and want to add the individual packages on demand: + +``` +# Convex decomposition (-cd) +uv pip install coacd trimesh + +# Preview viewer (--verbose) +uv pip install mujoco +``` + + +# Completed modifications compared to the original repository: + +submodule/Sam-3d-objects/pyproject.toml: +``` +-PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu121" + +change to + ++PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu128" +``` +requirements.inference.txt: +``` +kaolin==0.17.0 change to kaolin==0.18.0 +``` +requirements.txt: +``` +nvidia-pyindex==1.0.9 change to # nvidia-pyindex==1.0.9 (comment it out) + +torchaudio==2.5.1+cu121 change to torchaudio, +xformers==0.0.28.post3 change to xformers (remove the pinned torchaudio and xformers versions) +``` +requirements.p3d.txt: +``` +flash_attn==2.8.3 change to flash_attn==2.7.3 +``` \ No newline at end of file diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pipeline/background_reconstruction.py b/pipeline/background_reconstruction.py new file mode 100644 index 0000000..16a5472 --- /dev/null +++ b/pipeline/background_reconstruction.py @@ -0,0 +1,376 @@ +"""Batch RANSAC-based table alignment + 3D Gaussian export on top of AnySplat. + +This is a cleaned-up rewrite of `submodule/AnySplat/inference_ransac_batch.py`. +The script now lives outside the AnySplat submodule, so it explicitly inserts +the AnySplat root onto `sys.path` to keep the original imports working. +""" + +import argparse +import os +import sys +from pathlib import Path + +import cv2 +import imageio +import numpy as np +import torch + +# ===== Make AnySplat's `src.*` and `utils.py` importable when running from the +# repository root (this file no longer lives inside submodule/AnySplat). 
+_ANYSPLAT_ROOT = Path(__file__).resolve().parent.parent / "submodule" / "AnySplat" +sys.path.insert(0, str(_ANYSPLAT_ROOT)) +sys.path.insert(0, str(_ANYSPLAT_ROOT.parent)) # mirrors original sys.path entry + +from src.misc.image_io import save_interpolated_video # noqa: E402, F401 +from src.model.ply_export import export_ply # noqa: E402 +from src.model.model.anysplat import AnySplat # noqa: E402 +from src.utils.image import process_image # noqa: E402 +from utils import ( # noqa: E402 + align_points_to_table, + depth_to_points, + fit_plane_ransac_safe_2, + plane_coordinate_system, + render_depth_from_points, + shrink_mask_erode, +) + + +# ===== RANSAC / inner-rectangle hyperparameters ===== +RANSAC_NUM_ITERS = 600 +RANSAC_DIST_THRESH = 0.005 # tabletops are usually very flat +RANSAC_SAMPLE_N = 40000 +INNER_PERCENTILE = (20, 80) # crop to the central 60% to avoid edges +MIN_INNER_POINTS = 50 + +# ===== Scene normalisation ===== +# Quantile of |xyz| used as the reference radius before rescaling, and the +# target radius the reference is mapped to. +SCALE_QUANTILE = 0.95 +SCALE_TARGET_RANGE = 0.6 + +# ===== Post-alignment scene placement ===== +# Offsets applied after table-alignment so the aligned cloud can be shifted +# from the origin if the downstream consumer needs it (e.g. to place it on a +# Mujoco table). Defaults are 0, meaning the aligned cloud sits at the origin. +DEFAULT_X_OFFSET = 0.0 +DEFAULT_Z_OFFSET = 0.0 + +# ===== Mask shrink before plane fitting ===== +BG_MASK_SHRINK_RATIO = 0.12 + +# ===== Default model id ===== +DEFAULT_MODEL_ID = "lhjiang/anysplat" + + +def compute_table_geometry_ransac(depth, mask, intrinsic, extrinsic): + """Fit a tabletop plane via RANSAC + inner PCA and build a world-aligned + transform that maps the original world frame onto a table-aligned frame. + """ + H, W = depth.shape + + # ===== 1. Intrinsics ===== + fx = intrinsic[0, 0] + fy = intrinsic[1, 1] + cx = intrinsic[0, 2] + cy = intrinsic[1, 2] + + # ===== 2. 
Depth -> camera-frame points ===== + points_cam = depth_to_points(depth, mask, fx, fy, cx, cy) + print("points_cam:", points_cam.shape) + + # ===== 3. RANSAC plane ===== + normal_cam, center_cam, inlier_idx = fit_plane_ransac_safe_2( + points_cam, + num_iters=RANSAC_NUM_ITERS, + dist_thresh=RANSAC_DIST_THRESH, + sample_N=RANSAC_SAMPLE_N, + ) + print(f"RANSAC normal: {normal_cam}") + + pts_plane = points_cam[inlier_idx] + + # ===== 4. Plane coordinate system ===== + u, v = plane_coordinate_system(normal_cam) + rel = pts_plane - center_cam + pts_2d = np.stack([rel @ u, rel @ v], axis=1) + + # ===== 5. Inner rectangle (crop edges) ===== + x, y = pts_2d[:, 0], pts_2d[:, 1] + x_min, x_max = np.percentile(x, list(INNER_PERCENTILE)) + y_min, y_max = np.percentile(y, list(INNER_PERCENTILE)) + inner = (x > x_min) & (x < x_max) & (y > y_min) & (y < y_max) + pts_inner = pts_2d[inner] + if pts_inner.shape[0] < MIN_INNER_POINTS: + raise RuntimeError("Too few inner RANSAC points") + + # ===== 6. PCA on the inner points ===== + mean_2d = pts_inner.mean(axis=0) + centered = pts_inner - mean_2d + _, _, Vt = np.linalg.svd(centered, full_matrices=False) + dir_long_2d = Vt[0] + + # ===== 7. 2D -> 3D ===== + dir_long_cam = dir_long_2d[0] * u + dir_long_2d[1] * v + dir_long_cam /= np.linalg.norm(dir_long_cam) + dir_short_cam = np.cross(normal_cam, dir_long_cam) + dir_short_cam /= np.linalg.norm(dir_short_cam) + + # ===== 8. World consistency (avoid axis flip) ===== + R_cw = extrinsic[:3, :3] + if (R_cw @ dir_long_cam)[0] < 0: + dir_long_cam = -dir_long_cam + dir_short_cam = -dir_short_cam + + # ===== 9. OBB extents ===== + proj = centered @ Vt[:2].T + min_xy, max_xy = proj.min(0), proj.max(0) + length = max_xy[0] - min_xy[0] + width = max_xy[1] - min_xy[1] + + center_plane_cam = center_cam + mean_2d[0] * u + mean_2d[1] * v + + # ===== 10. 
Build world->table alignment ===== + R_table_cam = np.stack([dir_long_cam, dir_short_cam, normal_cam], axis=1) + R_align_cam = R_table_cam.T + t_align_cam = -R_align_cam @ center_plane_cam + + R_align_world = R_align_cam @ R_cw + t_align_world = R_align_cam @ extrinsic[:3, 3] + t_align_cam + + print("RANSAC inlier ratio:", len(inlier_idx) / points_cam.shape[0]) + + return { + "length": float(length), + "width": float(width), + "normal": normal_cam, + "dir_long": dir_long_cam, + "dir_short": dir_short_cam, + "R_align_cam": R_align_cam, + "t_align_cam": t_align_cam, + "R_align_world": R_align_world, + "t_align_world": t_align_world, + } + + +def _save_depth_npy_and_viz(depth, image_folder, base_name): + """Save a raw depth array and a normalized 8-bit visualisation.""" + depth_path = Path(image_folder) / f"{base_name}.npy" + np.save(depth_path, depth) + viz = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8) + viz_path = Path(image_folder) / f"{base_name}_visual.png" + imageio.imwrite(viz_path, viz) + + +def process_single_image(image_path, model, device, args): + """Run AnySplat on one `clean_background.png` and export aligned assets.""" + image_folder = os.path.dirname(image_path) + image_ori_path = os.path.join(image_folder, "input_image.png") + + # Load images. + image = process_image(image_path) + image_ori = process_image(image_ori_path) + images_ori = torch.stack([image_ori], dim=0).unsqueeze(0).to(device) + images = torch.stack([image], dim=0).unsqueeze(0).to(device) + b, v, _, H, W = images.shape + + # Inference. + with torch.no_grad(): + gaussians, pred_context_pose, depth_dict = model.inference((images + 1) * 0.5) + gaussians_ori, pred_context_pose_ori, depth_dict_ori = model.inference( + (images_ori + 1) * 0.5 + ) + depth_ori = depth_dict_ori["depth"][0][0].squeeze().cpu().numpy() + _save_depth_npy_and_viz(depth_ori, image_folder, "depth_ori") + + # Camera parameters. AnySplat returns camera-to-world; we store world-to-camera. 
+ pred_all_extrinsic = pred_context_pose["extrinsic"][0][0].inverse().cpu().numpy() + pred_all_intrinsic = pred_context_pose["intrinsic"][0][0].cpu().numpy() + print(f"Processing {os.path.basename(image_folder)}: converted intrinsics:") + print( + f" fx: {pred_all_intrinsic[0, 0] * W:.2f}, " + f"fy: {pred_all_intrinsic[1, 1] * H:.2f}" + ) + print( + f" cx: {pred_all_intrinsic[0, 2] * W:.2f}, " + f"cy: {pred_all_intrinsic[1, 2] * H:.2f}" + ) + + # Scale normalised intrinsics to pixel units. + pred_all_intrinsic[0, :] = pred_all_intrinsic[0, :] * W + pred_all_intrinsic[1, :] = pred_all_intrinsic[1, :] * H + + np.save(Path(image_folder) / "extrinsic.npy", pred_all_extrinsic) + np.save(Path(image_folder) / "intrinsic.npy", pred_all_intrinsic) + + intrinsic = pred_all_intrinsic + extrinsic = pred_all_extrinsic + gaussian_xyz = gaussians.means[0].detach().cpu().numpy() + depth = depth_dict["depth"][0][0].squeeze().cpu().numpy() + _save_depth_npy_and_viz(depth, image_folder, "depth") + + # Asset directory. + assets_folder = os.path.join(image_folder, "3d_assets") + os.makedirs(assets_folder, exist_ok=True) + + # Export the raw 3DGS reconstruction. + export_ply( + gaussians.means[0], + gaussians.scales[0], + gaussians.rotations[0], + gaussians.harmonics[0], + gaussians.opacities[0], + Path(assets_folder) / "bg.ply", + ) + + if not args.align_table: + print( + "Table alignment disabled (--no-align-table); " + "skipping bg_aligned.ply export." + ) + print(f"Done. Outputs saved under: {image_folder}") + return + + # Re-render depth from the splat point cloud (used for plane fitting). 
+ depth_point = render_depth_from_points(gaussian_xyz, intrinsic, extrinsic, H, W) + + mask_path = Path(image_folder) / "bg_mask.png" + if not mask_path.exists(): + print(f"Warning: bg_mask.png not found, skipping table alignment: {mask_path}") + return + + mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE).astype(np.uint8) + mask = shrink_mask_erode(mask, ratio=BG_MASK_SHRINK_RATIO) + + result = compute_table_geometry_ransac( + depth=depth_point, + mask=mask, + intrinsic=intrinsic, + extrinsic=extrinsic, + ) + print(f"\n{os.path.basename(image_folder)} table geometry:") + print(f" length (m): {result['length']:.3f}") + print(f" width (m): {result['width']:.3f}") + print(f" normal: {result['normal']}") + + # Align the splat point cloud to the table frame. + points_table_world = align_points_to_table( + gaussian_xyz, + result["R_align_world"], + result["t_align_world"], + ) + points_table_world = points_table_world - np.median(points_table_world, axis=0) + + # Use a robust quantile for scale so outliers don't dominate. + abs_points = np.abs(points_table_world) + ref_range = np.quantile(abs_points, SCALE_QUANTILE) + scale_factor = ref_range / SCALE_TARGET_RANGE + points_table_world = points_table_world / scale_factor + gaussians.scales[0] = gaussians.scales[0] / scale_factor + + np.save(Path(image_folder) / "scale.npy", scale_factor) + print(f" scale factor: {scale_factor:.3f}") + + # Swap X/Y, flip Z, then apply optional placement offsets (default 0,0). 
+ x = points_table_world[:, 0].copy() + y = points_table_world[:, 1].copy() + points_table_world[:, 0] = y + points_table_world[:, 1] = x + points_table_world[:, 2] *= -1 + points_table_world[:, 2] += args.z_offset + points_table_world[:, 0] += args.x_offset + + export_ply( + points_table_world, + gaussians.scales[0], + gaussians.rotations[0], + gaussians.harmonics[0], + gaussians.opacities[0], + Path(assets_folder) / "bg_aligned.ply", + ) + + print( + f" Z range: min={points_table_world[:, 2].min():.3f}, " + f"max={points_table_world[:, 2].max():.3f}" + ) + print(f"Done. Outputs saved under: {image_folder}") + + +def main(): + parser = argparse.ArgumentParser( + description=( + "Reconstruct a 3D Gaussian model from a single image and emit the " + "associated camera intrinsics/extrinsics, depth maps, and an " + "optional table-aligned point cloud." + ) + ) + parser.add_argument( + "input_dir", + type=str, + help="Input directory or single file. Directories are searched recursively for clean_background.{png,jpg}.", + ) + parser.add_argument( + "--model-id", + type=str, + default=DEFAULT_MODEL_ID, + help=f"HuggingFace model id to load (default: {DEFAULT_MODEL_ID}).", + ) + parser.add_argument( + "--align-table", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "Run RANSAC table alignment and export bg_aligned.ply. " + "Use --no-align-table to disable (only bg.ply will be emitted). " + "Default: enabled." + ), + ) + parser.add_argument( + "--x-offset", + type=float, + default=DEFAULT_X_OFFSET, + help="X-axis offset (m) applied after table alignment. Default: 0 (origin).", + ) + parser.add_argument( + "--z-offset", + type=float, + default=DEFAULT_Z_OFFSET, + help="Z-axis offset (m) applied after table alignment. 
Default: 0 (origin).", + ) + + args = parser.parse_args() + + if os.path.isfile(args.input_dir): + input_dir = os.path.dirname(args.input_dir) + else: + input_dir = args.input_dir + + print(f"Loading model: {args.model_id}") + model = AnySplat.from_pretrained(args.model_id) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = model.to(device) + model.eval() + for param in model.parameters(): + param.requires_grad = False + print("Model loaded.") + + clean_background_files = [] + for root, _dirs, files in os.walk(input_dir): + for file in files: + if file.lower() in ("clean_background.png", "clean_background.jpg"): + clean_background_files.append(os.path.join(root, file)) + + print(f"Found {len(clean_background_files)} clean_background images.") + + for idx, image_path in enumerate(clean_background_files, 1): + print(f"\nProcessing {idx}/{len(clean_background_files)}: {image_path}") + try: + process_single_image(image_path, model, device, args) + print(f"Successfully processed: {image_path}") + except Exception as e: + print(f"Error processing {image_path}: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/pipeline/mesh2mjcf.py b/pipeline/mesh2mjcf.py new file mode 100644 index 0000000..7f1795b --- /dev/null +++ b/pipeline/mesh2mjcf.py @@ -0,0 +1,674 @@ +"""Convert a single .obj or .stl mesh into MuJoCo MJCF assets. + +This is a generic mesh-to-MJCF converter, derived from +github.com/discoverse-dev/DISCOVERSE/scripts/mesh2mjcf.py but stripped of any +DISCOVERSE-specific imports or scene wiring. It is designed to consume the +per-object meshes that this pipeline emits under `/3d_assets/.obj`, +but works on any standalone mesh file. 
+ +Output layout (under --output, which defaults to the input file's parent — +typically `scene_dir/3d_assets/` when consuming the v2 pipeline outputs): + + {output}/ + {name}/ (per-asset folder, named after the obj stem) + {name}.obj (copy of the input mesh) + {name}.mtl (if multi-material) + {texture files} (referenced by the MTL) + part_0.obj part_1.obj ... (if --convex_decomposition) + mjcf/ + {name}.xml + {name}_dependencies.xml + +Mesh paths inside the emitted XML are written as `{name}/{file}`, so the +consuming MuJoCo scene should set `meshdir` (and `texturedir`) to {output}. + +Examples: + + # Basic conversion (default RGBA, mass, inertia; no free joint; no decomp). + python pipeline/mesh2mjcf.py path/to/cup.obj + + # Specify RGBA, mass, inertia. + python pipeline/mesh2mjcf.py path/to/cup.obj \\ + --rgba 0.8 0.2 0.2 1.0 --mass 0.5 --diaginertia 0.01 0.01 0.005 + + # Free-floating object. + python pipeline/mesh2mjcf.py path/to/cup.obj --free_joint + + # Convex decomposition for accurate collisions. + python pipeline/mesh2mjcf.py path/to/cup.obj -cd + + # Preview in MuJoCo viewer after conversion. + python pipeline/mesh2mjcf.py path/to/cup.obj --verbose + +Notes: + - Multi-material OBJ files are auto-detected (via the MTL file) and split + into one sub-mesh per material; each material yields a MuJoCo + `material` element, with textures (`map_Kd`) copied alongside. + - Convex decomposition requires `pip install coacd trimesh`. + - Material splitting requires `pip install trimesh`. +""" + +import argparse +import logging +import os +import re +import shutil +import xml.etree.ElementTree as ET +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + +# ===== MTL handling ===== + +# MTL fields relevant to MuJoCo. 
MTL_FIELDS = (
    "Ka",  # Ambient color
    "Kd",  # Diffuse color
    "Ks",  # Specular color
    "d",  # Transparency (alpha)
    "Tr",  # 1 - transparency
    "Ns",  # Shininess
    "map_Kd",  # Diffuse texture map
)

# Matches an OBJ `mtllib` directive and captures the referenced .mtl filename
# (which may contain spaces); a trailing `# comment` is ignored.
_MTLLIB_RE = re.compile(r"^mtllib\s+(.+?\.mtl)(?:\s*#.*)?\s*\n?$")


@dataclass
class Material:
    """Convenience container for MTL → MuJoCo material conversion.

    Every field except ``name`` holds the raw, whitespace-normalised value
    string of the MTL statement with the same keyword, or ``None`` when the
    statement is absent.
    """

    name: str
    Ka: Optional[str] = None
    Kd: Optional[str] = None
    Ks: Optional[str] = None
    d: Optional[str] = None
    Tr: Optional[str] = None
    Ns: Optional[str] = None
    map_Kd: Optional[str] = None

    @staticmethod
    def from_string(lines: Sequence[str]) -> "Material":
        """Build a Material from the lines of one ``newmtl`` section.

        Args:
            lines: The ``newmtl <name>`` line followed by that material's
                statements.

        Raises:
            ValueError: If the ``newmtl`` line carries no material name.
        """
        header = lines[0].split()
        if len(header) < 2:
            raise ValueError(f"MTL 'newmtl' line has no material name: {lines[0]!r}")
        attrs = {"name": header[1]}
        for line in lines[1:]:
            tokens = line.split()
            # Compare the whole keyword: a prefix test would file `disp` or
            # `decal` statements under `d` and corrupt the alpha value.
            if tokens and tokens[0] in MTL_FIELDS:
                attrs[tokens[0]] = " ".join(tokens[1:])
        return Material(**attrs)

    def mjcf_rgba(self) -> str:
        """Return the ``rgba`` attribute string (diffuse colour plus alpha)."""
        Kd = self.Kd or "1.0 1.0 1.0"
        if self.d is not None:
            alpha = self.d
        elif self.Tr is not None:
            # Tr is the complement of d.
            alpha = str(1.0 - float(self.Tr))
        else:
            alpha = "1.0"
        return f"{Kd} {alpha}"

    def mjcf_shininess(self) -> str:
        """Return the ``shininess`` attribute string, clamped to [0, 1]."""
        if self.Ns is not None:
            # Ns values are typically 0-1000; normalize to [0, 1] and clamp
            # exporters that exceed that range.
            ns_val = min(max(float(self.Ns) / 1_000, 0.0), 1.0)
        else:
            ns_val = 0.5
        return f"{ns_val}"

    def mjcf_specular(self) -> str:
        """Return the ``specular`` attribute string (mean of the Ks RGB)."""
        if self.Ks is not None:
            # Average the specular RGB to a scalar; split() tolerates
            # repeated whitespace between components.
            ks_val = sum(map(float, self.Ks.split())) / 3
        else:
            ks_val = 0.5
        return f"{ks_val}"


def parse_mtl_name(lines: Sequence[str]) -> Optional[str]:
    """Return the .mtl filename referenced by an OBJ file's `mtllib` directive.

    Only the first matching line is used; ``None`` is returned when no line
    matches.
    """
    for line in lines:
        match = _MTLLIB_RE.match(line)
        if match is not None:
            return match.group(1)
    return None


def copy_obj_with_mtl(obj_source: Path, obj_target: Path) -> None:
    """Copy an OBJ file, plus the MTL file it references (if any).

    The OBJ copy itself must succeed (errors propagate). Locating and copying
    the MTL is best-effort: a failure there only prints a warning.
    """
    obj_target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(obj_source, obj_target)

    try:
        with open(obj_source, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                stripped = line.strip()
                if not stripped.startswith("mtllib "):
                    continue
                # Same parsing as parse_mtl_name (keeps filenames with
                # spaces); fall back to the first token for non-.mtl names.
                mtl_filename = parse_mtl_name([stripped]) or stripped.split()[1]
                mtl_source = obj_source.parent / mtl_filename
                mtl_target = obj_target.parent / mtl_filename
                if mtl_source.exists():
                    shutil.copy2(mtl_source, mtl_target)
                    print(f"Copied MTL file: {mtl_source} -> {mtl_target}")
                break  # only the first mtllib directive is honoured
    except (OSError, ValueError, IndexError) as e:
        print(f"Warning: failed to check/copy MTL file for {obj_source}: {e}")


def parse_mtl_file(mtl_path: Path) -> Dict[str, Material]:
    """Parse an MTL file into a name → Material dict.

    Returns an empty dict when the file does not exist. Comment and blank
    lines are ignored, as is anything before the first ``newmtl``. When a
    name is declared twice, the later definition wins.
    """
    materials: Dict[str, Material] = {}
    if not mtl_path.exists():
        return materials

    with open(mtl_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]
    lines = [line for line in lines if line and not line.startswith("#")]

    # Group statements into one list per `newmtl` section.
    sections: List[List[str]] = []
    for line in lines:
        if line.split()[0] == "newmtl":
            sections.append([])
        if sections:
            sections[-1].append(line)

    for section in sections:
        material = Material.from_string(section)
        materials[material.name] = material

    return materials
def split_obj_by_materials(
    obj_path: Path, output_dir: Path
) -> Tuple[Dict[str, Material], List[str]]:
    """Split a multi-material OBJ into one sub-mesh per material.

    Args:
        obj_path: OBJ file to inspect; its `mtllib` is resolved relative to
            the OBJ's own directory.
        output_dir: Directory that receives the exported sub-mesh files.

    Returns:
        (materials, submesh_files). If the OBJ has zero or one materials,
        submesh_files is empty and the OBJ is left as a single file. The list
        is also empty when trimesh is unavailable or the split fails; in both
        cases a warning is printed instead of raising.
    """
    materials: Dict[str, Material] = {}
    submesh_files: List[str] = []

    with open(obj_path, "r", encoding="utf-8") as f:
        obj_lines = f.readlines()

    mtl_name = parse_mtl_name(obj_lines)
    if mtl_name:
        materials = parse_mtl_file(obj_path.parent / mtl_name)

    if len(materials) <= 1:
        return materials, []

    try:
        import trimesh
    except ImportError:
        print("Warning: trimesh not installed; cannot split multi-material OBJ.")
        return materials, []

    try:
        mesh = trimesh.load(
            obj_path,
            split_object=True,
            group_material=True,
            process=False,
            maintain_order=False,
        )

        if isinstance(mesh, trimesh.base.Trimesh):
            # Single mesh after grouping; nothing to split.
            target_file = output_dir / f"{obj_path.stem}.obj"
            # The OBJ usually already sits in output_dir; copying a file onto
            # itself raises SameFileError, which used to surface below as a
            # spurious "failed to split" warning.
            if not (target_file.exists() and target_file.samefile(obj_path)):
                shutil.copy(obj_path, target_file)
            return materials, []

        obj_stem = obj_path.stem
        print(f"Splitting OBJ by material: {len(mesh.geometry)} sub-meshes")
        for i, (material_name, geom) in enumerate(mesh.geometry.items()):
            submesh_file = f"{obj_stem}_{i}.obj"
            submesh_path = output_dir / submesh_file

            # Name the material after the geometry key before exporting.
            geom.visual.material.name = material_name
            geom.export(str(submesh_path), include_texture=True, header=None)
            submesh_files.append(submesh_file)
            print(f" saved sub-mesh: {submesh_file} (material: {material_name})")

        # trimesh sometimes emits a stray `material.mtl` next to the export.
        temp_mtl = output_dir / "material.mtl"
        if temp_mtl.exists():
            temp_mtl.unlink()

        return materials, submesh_files
    except Exception as e:
        # Best-effort: any loader/exporter failure degrades to "no split".
        print(f"Warning: failed to split OBJ by material: {e}")
        return materials, []
def create_asset_xml(asset_name, convex_parts=None, materials=None, submesh_files=None):
    """Build the `mujocoinclude` tree whose `asset` child lists meshes, materials and textures.

    Args:
        asset_name: Prefix for element names and folder prefix for file paths.
        convex_parts: Number of convex parts (`part_0.obj` ...), or None/0.
        materials: Optional mapping of material name to an object exposing
            `mjcf_rgba()`, `mjcf_specular()`, `mjcf_shininess()` and `map_Kd`.
        submesh_files: Optional per-material sub-mesh filenames; when given,
            they replace the single main mesh entry.

    Returns:
        The root `mujocoinclude` element.
    """
    include_root = ET.Element("mujocoinclude")
    asset_node = ET.SubElement(include_root, "asset")

    for mat_name, mat in (materials or {}).items():
        attrs = {
            "name": f"{asset_name}_{mat_name}",
            "rgba": mat.mjcf_rgba(),
            "specular": mat.mjcf_specular(),
            "shininess": mat.mjcf_shininess(),
        }
        texture_name = f"{asset_name}_{mat_name}_texture"
        if mat.map_Kd:
            # A textured material carries a texture reference instead of rgba.
            del attrs["rgba"]
            attrs["texture"] = texture_name
        ET.SubElement(asset_node, "material", attrs)
        if mat.map_Kd:
            ET.SubElement(
                asset_node,
                "texture",
                {
                    "type": "2d",
                    "name": texture_name,
                    "file": f"{asset_name}/{mat.map_Kd}",
                },
            )

    if submesh_files:
        # One mesh entry per material sub-mesh.
        for filename in submesh_files:
            ET.SubElement(
                asset_node,
                "mesh",
                {
                    "name": filename.replace(".obj", ""),
                    "file": f"{asset_name}/{filename}",
                },
            )
    else:
        # The unsplit main mesh.
        ET.SubElement(
            asset_node,
            "mesh",
            {"name": asset_name, "file": f"{asset_name}/{asset_name}.obj"},
        )

    # Convex-decomposition collision parts.
    for index in range(convex_parts or 0):
        ET.SubElement(
            asset_node,
            "mesh",
            {
                "name": f"{asset_name}_part_{index}",
                "file": f"{asset_name}/part_{index}.obj",
            },
        )

    return include_root
def _read_usemtl(obj_path):
    """Return the first `usemtl` material name found in *obj_path*, or None."""
    if not obj_path.exists():
        return None
    try:
        with open(obj_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("usemtl "):
                    return line.split()[1]
    except (OSError, UnicodeDecodeError) as e:
        print(f"Warning: could not read sub-mesh {obj_path}: {e}")
    return None


def create_geom_xml(
    asset_name,
    mass,
    diaginertia,
    rgba,
    free_joint=False,
    convex_parts=None,
    materials=None,
    submesh_files=None,
    output_dir=None,
):
    """Build the `mujocoinclude` tree with the body's geoms + inertial.

    Args:
        asset_name: Mesh/material name prefix.
        mass: Body mass written to the `inertial` element.
        diaginertia: Three diagonal inertia values.
        rgba: Four colour components used when no material applies.
        free_joint: When true, a free joint is emitted first.
        convex_parts: Number of convex collision parts, or None/0.
        materials: Optional name → material mapping.
        submesh_files: Optional per-material sub-mesh filenames.
        output_dir: Assets root; sub-meshes are looked up under
            `output_dir/asset_name/` to find each one's `usemtl` name. When
            omitted, sub-mesh geoms fall back to `rgba`.

    Returns:
        The root `mujocoinclude` element.
    """
    root = ET.Element("mujocoinclude")
    rgba_str = f"{rgba[0]} {rgba[1]} {rgba[2]} {rgba[3]}"

    if free_joint:
        joint_elem = ET.SubElement(root, "joint")
        joint_elem.set("type", "free")

    inertial_elem = ET.SubElement(root, "inertial")
    inertial_elem.set("pos", "0 0 0")
    inertial_elem.set("mass", str(mass))
    inertial_elem.set(
        "diaginertia", f"{diaginertia[0]} {diaginertia[1]} {diaginertia[2]}"
    )

    if submesh_files and materials:
        # Multi-material: one geom per sub-mesh.
        for submesh_file in submesh_files:
            submesh_name = submesh_file.replace(".obj", "")
            geom_elem = ET.SubElement(root, "geom")
            geom_elem.set("type", "mesh")
            geom_elem.set("mesh", submesh_name)
            geom_elem.set("class", "obj_visual")

            mtl_name = None
            if output_dir is not None:
                mtl_name = _read_usemtl(
                    Path(output_dir) / asset_name / submesh_file
                )

            # Reference a material only if it is one of the known materials;
            # an unknown name would point at an undefined material.
            if mtl_name is not None and mtl_name in materials:
                geom_elem.set("material", f"{asset_name}_{mtl_name}")
            else:
                geom_elem.set("rgba", rgba_str)

    elif materials and len(materials) == 1:
        # Single material with possible texture.
        geom_elem = ET.SubElement(root, "geom")
        geom_elem.set("type", "mesh")
        geom_elem.set("mesh", asset_name)
        geom_elem.set("class", "obj_visual")
        material_name = next(iter(materials))
        geom_elem.set("material", f"{asset_name}_{material_name}")

    elif convex_parts:
        # Visual geom (full mesh) + collision geoms (convex parts).
        visual_geom = ET.SubElement(root, "geom")
        visual_geom.set("rgba", rgba_str)
        visual_geom.set("mesh", asset_name)
        visual_geom.set("class", "obj_visual")

        for i in range(convex_parts):
            collision_geom = ET.SubElement(root, "geom")
            collision_geom.set("type", "mesh")
            collision_geom.set("rgba", rgba_str)
            collision_geom.set("mesh", f"{asset_name}_part_{i}")

    else:
        # Simple solid-colour mesh geom.
        geom_elem = ET.SubElement(root, "geom")
        geom_elem.set("type", "mesh")
        geom_elem.set("rgba", rgba_str)
        geom_elem.set("mesh", asset_name)

    # When a material/sub-mesh path was taken AND convex decomposition is on,
    # still emit invisible collision geoms.
    if convex_parts and (submesh_files or (materials and len(materials) == 1)):
        for i in range(convex_parts):
            collision_geom = ET.SubElement(root, "geom")
            collision_geom.set("type", "mesh")
            collision_geom.set("rgba", f"{rgba[0]} {rgba[1]} {rgba[2]} 0")
            collision_geom.set("mesh", f"{asset_name}_part_{i}")

    return root
def save_xml_with_formatting(root, filepath):
    """Indent *root* in place and write it to *filepath* as UTF-8 (Python 3.9+).

    No XML declaration is written.
    """
    ET.indent(root, space=" ", level=0)
    ET.ElementTree(root).write(filepath, encoding="utf-8", xml_declaration=False)


def create_preview_xml(asset_name):
    """Build a minimal preview scene for `mujoco.viewer`.

    The scene includes the asset's dependency file, defines the `obj_visual`
    default class, and places the asset's body include 0.5 above a floor
    plane under a single light.
    """
    scene = ET.Element("mujoco", {"model": "temp_preview_env"})

    ET.SubElement(scene, "option", {"gravity": "0 0 -9.81"})
    ET.SubElement(scene, "compiler", {"meshdir": ".", "texturedir": "."})
    ET.SubElement(
        scene,
        "include",
        {"file": f"{asset_name}/mjcf/{asset_name}_dependencies.xml"},
    )

    # Default class used by the visual geoms of the generated body file.
    defaults = ET.SubElement(scene, "default")
    visual_class = ET.SubElement(defaults, "default", {"class": "obj_visual"})
    ET.SubElement(
        visual_class,
        "geom",
        {"group": "2", "type": "mesh", "contype": "0", "conaffinity": "0"},
    )

    world = ET.SubElement(scene, "worldbody")
    ET.SubElement(
        world,
        "geom",
        {"name": "floor", "type": "plane", "size": "2 2 0.1", "rgba": ".8 .8 .8 1"},
    )
    ET.SubElement(world, "light", {"pos": "0 0 3", "dir": "0 0 -1"})

    asset_body = ET.SubElement(world, "body", {"name": asset_name, "pos": "0 0 0.5"})
    ET.SubElement(
        asset_body, "include", {"file": f"{asset_name}/mjcf/{asset_name}.xml"}
    )

    return scene
_build_argparser(): + parser = argparse.ArgumentParser( + description="Convert a .obj or .stl mesh into MuJoCo MJCF assets." + ) + parser.add_argument( + "input_file", type=str, help="Path to the input mesh (.obj or .stl)." + ) + parser.add_argument( + "--rgba", + nargs=4, + type=float, + default=[0.5, 0.5, 0.5, 1], + help="Mesh RGBA colour. Default: [0.5, 0.5, 0.5, 1].", + ) + parser.add_argument( + "--mass", + type=float, + default=0.001, + help="Mesh mass (kg). Default: 0.001.", + ) + parser.add_argument( + "--diaginertia", + nargs=3, + type=float, + default=[0.00002, 0.00002, 0.00002], + help="Diagonal inertia tensor. Default: [2e-5, 2e-5, 2e-5].", + ) + parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help=( + "Output assets root. Default: the input file's parent directory, " + "so that `scene_dir/3d_assets/foo.obj` writes to `scene_dir/`." + ), + ) + parser.add_argument( + "--free_joint", + action="store_true", + help="Add a free joint so the body can move.", + ) + parser.add_argument( + "-cd", + "--convex_decomposition", + action="store_true", + help=( + "Decompose the mesh into convex parts for accurate collision. " + "Requires `coacd` and `trimesh`." 
def main():
    """Convert one mesh into a self-contained per-asset folder with MJCF files.

    Steps: parse arguments, create the per-asset folder under the output
    root, copy the mesh (plus MTL and textures), optionally split by material
    and run convex decomposition, write the dependency and body XML files,
    and optionally launch the MuJoCo viewer on a temporary preview scene.

    Raises:
        SystemExit: For a missing or unsupported input file, missing
            `coacd`/`trimesh` when decomposition is requested, or when no
            Python interpreter is found for the viewer.
    """
    import subprocess  # only needed for --verbose

    args = _build_argparser().parse_args()

    input_file = args.input_file
    rgba = args.rgba
    mass = args.mass
    diaginertia = args.diaginertia
    free_joint = args.free_joint
    convex_de = args.convex_decomposition
    verbose = args.verbose

    if args.output is None:
        output_assets_dir = str(Path(input_file).resolve().parent)
    else:
        output_assets_dir = args.output

    if convex_de:
        try:
            import coacd  # noqa: F401
            import trimesh  # noqa: F401
        except ImportError:
            print(
                "Error: `coacd` and `trimesh` are required for "
                "--convex_decomposition. Install with `pip install coacd trimesh`."
            )
            raise SystemExit(1)

    if input_file.endswith(".obj"):
        asset_name = os.path.basename(input_file)[: -len(".obj")]
    elif input_file.endswith(".stl"):
        asset_name = os.path.basename(input_file)[: -len(".stl")]
    else:
        raise SystemExit(
            f"Error: {input_file} is not a supported mesh type. Use .obj or .stl."
        )

    # Fail before touching the output folder if there is nothing to convert.
    if not os.path.isfile(input_file):
        raise SystemExit(f"Error: input file not found: {input_file}")

    # Per-asset folder lives directly under the output root, with an `mjcf/`
    # subfolder for the generated XML files. This way the whole asset (meshes
    # + MTL + textures + convex parts + MJCF) is self-contained in one
    # directory.
    output_dir = os.path.join(output_assets_dir, asset_name)
    mjcf_obj_dir = os.path.join(output_dir, "mjcf")

    input_path = Path(input_file).resolve()
    output_path = Path(output_dir).resolve()
    # Wiping the per-asset folder would delete the input mesh itself when the
    # mesh already lives inside that folder, so reuse the folder in that case.
    input_inside_output = output_path in input_path.parents
    if os.path.exists(output_dir):
        if input_inside_output:
            print(f"Note: input lives inside {output_dir}; reusing it without wiping.")
        else:
            shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(mjcf_obj_dir, exist_ok=True)

    # Copy the mesh (and MTL if relevant) into the per-asset folder.
    if input_path.parent != output_path:
        if input_file.endswith(".obj"):
            copy_obj_with_mtl(
                Path(input_file), Path(output_dir) / Path(input_file).name
            )
        else:
            shutil.copy(input_file, output_dir)

    # Material splitting (OBJ only).
    materials: Dict[str, Material] = {}
    submesh_files: List[str] = []
    if input_file.endswith(".obj"):
        print("Checking OBJ for multiple materials...")
        obj_path = Path(output_dir) / f"{asset_name}.obj"
        materials, submesh_files = split_obj_by_materials(obj_path, Path(output_dir))

        # Copy referenced texture files (single or multi-material case).
        if materials:
            input_parent = Path(input_file).parent
            for material in materials.values():
                if not material.map_Kd:
                    continue
                texture_src = input_parent / material.map_Kd
                if not texture_src.exists():
                    continue
                texture_dst = Path(output_dir) / material.map_Kd
                # map_Kd may point into a sub-folder (e.g. textures/a.png).
                texture_dst.parent.mkdir(parents=True, exist_ok=True)
                if texture_src.resolve() != texture_dst.resolve():
                    shutil.copy(texture_src, texture_dst)
                print(f"Copied texture: {material.map_Kd}")

        if submesh_files:
            print(f"Split into {len(submesh_files)} sub-meshes.")
        elif len(materials) == 1:
            print("Single material; no split needed.")
        elif materials:
            print(
                f"{len(materials)} materials found but the mesh was not split; "
                "using the plain RGBA colour."
            )
        else:
            print("No materials detected.")

    convex_parts_count = 0
    if convex_de:
        import coacd
        import trimesh

        print(f"Running convex decomposition on {asset_name}...")
        mesh = trimesh.load(input_file, force="mesh")
        mesh_coacd = coacd.Mesh(mesh.vertices, mesh.faces)
        coacd_config_scene = {
            "threshold": 0.01,
            "preprocess_resolution": 100,
        }
        coacd_config = coacd_config_scene if args.scene else {}
        parts = coacd.run_coacd(mesh_coacd, **coacd_config)

        for i, part in enumerate(parts):
            part_filename = f"part_{i}.obj"
            output_part_file = os.path.join(output_dir, part_filename)
            part_mesh = trimesh.Trimesh(vertices=part[0], faces=part[1])
            part_mesh.export(output_part_file)

        convex_parts_count = len(parts)
        print(f"{asset_name} decomposed into {convex_parts_count} convex parts.")

    # Materials are only emitted when they can be bound to a geom: either the
    # mesh was split per material, or there is exactly one material.
    convex_arg = convex_parts_count if convex_de else None
    materials_arg = materials if (submesh_files or len(materials) == 1) else None
    submesh_arg = submesh_files if submesh_files else None

    # Emit the asset dependency XML.
    asset_xml = create_asset_xml(asset_name, convex_arg, materials_arg, submesh_arg)
    asset_file_path = os.path.join(mjcf_obj_dir, f"{asset_name}_dependencies.xml")
    save_xml_with_formatting(asset_xml, asset_file_path)

    # Emit the body geom XML.
    geom_xml = create_geom_xml(
        asset_name,
        mass,
        diaginertia,
        rgba,
        free_joint,
        convex_arg,
        materials_arg,
        submesh_arg,
        output_assets_dir,
    )
    geom_file_path = os.path.join(mjcf_obj_dir, f"{asset_name}.xml")
    save_xml_with_formatting(geom_xml, geom_file_path)

    print(f"Converted {asset_name} to MJCF.")
    print(f" meshes: {output_dir}")
    print(f" dependencies: {asset_file_path}")
    print(f" body geom: {geom_file_path}")
    if submesh_files:
        print(
            f" material split: {len(submesh_files)} sub-meshes, "
            f"{len(materials)} materials"
        )

    if verbose:
        print("\nLaunching MuJoCo viewer...")
        py_dir = shutil.which("python") or shutil.which("python3")
        if not py_dir:
            print("Error: no `python`/`python3` on PATH; cannot launch viewer.")
            raise SystemExit(1)

        tmp_world_mjcf = os.path.join(output_assets_dir, "_tmp_preview.xml")
        preview_xml = create_preview_xml(asset_name)
        save_xml_with_formatting(preview_xml, tmp_world_mjcf)

        cmd_line = f"{py_dir} -m mujoco.viewer --mjcf {tmp_world_mjcf}"
        print(f"Running: {cmd_line}")
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through unchanged.
        subprocess.run(
            [py_dir, "-m", "mujoco.viewer", "--mjcf", tmp_world_mjcf], check=False
        )


if __name__ == "__main__":
    main()
Rotation as R + +from pipeline.utils import ( + clean_name, + load_image, + collect_mask_paths, + compute_fov_from_intrinsics, + mesh_rendering, + get_default_mesh_renderer, +) +# `inference` is exposed by submodule/Sam-3d-objects via PYTHONPATH; see the +# top-level shell scripts. +from inference import ( + Inference, + make_scene, + render_gs_view, +) + + +# Coordinate-system transform applied to SAM-3D-Objects mesh outputs to bring +# them into the world frame this pipeline operates in (rotates +Y -> +Z, etc.). +_SAM3D_TO_WORLD = np.array( + [ + [1, 0, 0, 0], + [0, 0, -1, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + ] +) + +_DEFAULT_IMAGE_SIZE = (448, 448) + + +def _flip_xy(arr): + """Negate the X and Y components in-place on an (N, 3+) array or tensor.""" + arr[:, 0] = -arr[:, 0] + arr[:, 1] = -arr[:, 1] + return arr + + +def _load_depth_with_fallback(image_dir, required_depth_path): + """Prefer `depth_ori.npy` (raw AnySplat output) over `depth.npy`.""" + depth_ori_path = os.path.join(image_dir, "depth_ori.npy") + if os.path.exists(depth_ori_path): + return np.load(depth_ori_path) + return np.load(required_depth_path) + + +def process_single_image(image_path, inference, args): + """Run multi-object inference and asset export for one input image.""" + image_path = os.path.abspath(image_path) + image_dir = os.path.dirname(image_path) + + # Optional scene scale factor produced by the AnySplat stage. 
def process_single_image(image_path, inference, args):
    """Run multi-object inference and asset export for one input image.

    For every mask found under ``<image folder>/masks`` this runs (or loads a
    cached) inference result, estimates a scale correction by rendering the
    posed mesh and comparing it with the mask area and the scene depth, and
    writes a resized mesh (``.obj``), its mean vertex position
    (``_keyframe.npy``) and a resized Gaussian (``.ply``) to ``3d_assets/``.

    Args:
        image_path: Path of the input image; all other inputs and outputs are
            located relative to its directory.
        inference: Callable invoked as ``inference(image, mask, seed=...)``;
            the result is indexed with the keys ``gs``, ``rotation``,
            ``translation``, ``scale`` and ``glb``.
        args: Parsed CLI namespace; ``seed``, ``save_pt`` and
            ``save_intermediate`` are read.

    Returns:
        None. The function returns early (after printing a warning) when no
        masks exist, when required camera/depth files are missing, or when a
        mask selects no depth pixels.
    """
    image_path = os.path.abspath(image_path)
    image_dir = os.path.dirname(image_path)

    # Optional scene scale factor produced by the AnySplat stage.
    scale_factor_path = os.path.join(image_dir, "scale.npy")
    if os.path.exists(scale_factor_path):
        scale_factor = float(np.asarray(np.load(scale_factor_path)).squeeze())
    else:
        scale_factor = 1.0
    print(f"Scale factor: {scale_factor}")

    pil_image = load_image(image_path)
    image_bg = np.array(pil_image)

    masks_dir = os.path.join(image_dir, "masks")
    mask_paths = collect_mask_paths(masks_dir)

    assets_dir = os.path.join(image_dir, "3d_assets")
    pt_dir = os.path.join(image_dir, "pt")

    if not mask_paths:
        # Still create the output folders so they exist even without objects.
        print(f"Warning: No mask images found in {masks_dir}")
        print("Creating placeholder directories and continuing...")
        os.makedirs(assets_dir, exist_ok=True)
        os.makedirs(pt_dir, exist_ok=True)
        return

    os.makedirs(assets_dir, exist_ok=True)
    os.makedirs(pt_dir, exist_ok=True)

    required_files = {
        "extrinsic": os.path.join(image_dir, "extrinsic.npy"),
        "intrinsic": os.path.join(image_dir, "intrinsic.npy"),
        "depth": os.path.join(image_dir, "depth.npy"),
    }
    missing_files = [name for name, p in required_files.items() if not os.path.exists(p)]
    if missing_files:
        print(f"Warning: Missing required files: {missing_files}")
        print("These files should be generated by the AnySplat pipeline first.")
        return

    extrinsics = np.load(required_files["extrinsic"])
    intrinsics = np.load(required_files["intrinsic"])
    depth_anysplat = _load_depth_with_fallback(image_dir, required_files["depth"])

    fx_pixels = intrinsics[0, 0]
    fy_pixels = intrinsics[1, 1]

    # NOTE(review): rendering uses the fixed _DEFAULT_IMAGE_SIZE rather than
    # the loaded image's size — presumably it matches the depth/mask
    # resolution; confirm against the AnySplat stage.
    image_size = _DEFAULT_IMAGE_SIZE
    _, fov_y = compute_fov_from_intrinsics(fx_pixels, fy_pixels, image_size, degrees=True)
    mesh_renderer = get_default_mesh_renderer(width=image_size[1], height=image_size[0])

    device = "cuda" if torch.cuda.is_available() else "cpu"

    for i, mask_path in enumerate(mask_paths):
        print(f"\n[{i+1}/{len(mask_paths)}] Processing mask: {mask_path}")

        # ===== Load and binarize mask =====
        mask_ = np.array(Image.open(mask_path).convert("L"))
        mask = np.where(mask_ > 0, 1, 0).astype("uint8")
        # Mask area in pixels; compared with the rendered area further down.
        size_ori = np.sum(mask)

        # Boolean indexing assumes the mask and the depth map share a shape.
        depth_fg = depth_anysplat[mask > 0]
        if len(depth_fg) == 0:
            # NOTE(review): this returns from the whole image, so any
            # remaining masks are not processed — confirm this is intended
            # (the other per-object skips below use `continue`).
            print("Warning: Mask has no valid depth values, skipping image.")
            return
        mean_depth_ori = depth_fg.mean()
        min_depth_ori = depth_fg.min()
        max_depth_ori = depth_fg.max()
        print(
            f"Depth in mask region: mean={mean_depth_ori:.4f}, "
            f"min={min_depth_ori:.4f}, max={max_depth_ori:.4f}"
        )

        mask_stem = clean_name(os.path.splitext(os.path.basename(mask_path))[0])
        save_path = os.path.join(pt_dir, f"{mask_stem}.pt")

        # ===== Run or load inference =====
        if os.path.exists(save_path):
            print(f"Loading cached inference result: {save_path}")
            # weights_only=False unpickles arbitrary objects: only load .pt
            # files that this pipeline wrote itself.
            out = torch.load(save_path, map_location=device, weights_only=False)
        else:
            print("Running inference on mask...")
            out = inference(image_bg, mask, seed=args.seed)
            if args.save_pt:
                torch.save(out, save_path)
                print(f"Saved inference result: {save_path}")

        # Untouched copy of the Gaussian; rescaled and exported at the end.
        gs_origin = copy.deepcopy(out["gs"])

        # ===== Optional intermediate GS preview before mesh alignment =====
        if args.save_intermediate:
            single_scene = make_scene(out)
            xyz_cv = _flip_xy(single_scene.get_xyz.clone())
            single_scene.from_xyz(xyz_cv)
            image_gs = render_gs_view(
                single_scene, extrinsics=extrinsics, fov_y=fov_y / 180 * np.pi
            )
            imageio.imwrite(
                os.path.join(image_dir, f"{mask_stem}_1_gs.png"), image_gs
            )
            single_scene.save_ply(
                os.path.join(assets_dir, f"{mask_stem}_gs_with_inferenced_pose.ply")
            )
        else:
            single_scene = None

        # ===== Pose parameters from SAM-3D-Objects =====
        rotation_output = out["rotation"].cpu().numpy()
        translation_output = out["translation"].cpu().numpy()
        scale_output = out["scale"].squeeze(0).cpu().numpy()

        print(f"Rotation (quaternion): {rotation_output}")
        print(f"Translation: {translation_output}")
        print(f"Scale: {scale_output}")

        if not out["glb"]:
            # No mesh produced for this object; skip to cleanup.
            if single_scene is not None:
                del single_scene
            del out
            torch.cuda.empty_cache()
            continue

        mesh = out["glb"]
        mesh.apply_transform(_SAM3D_TO_WORLD)
        # Origin-frame copy: this is what gets exported for mujoco.
        mesh_origin = copy.deepcopy(mesh)

        # The pose transform below is only used to estimate object size from the
        # current view; the exported asset stays at the origin.
        quat = copy.deepcopy(rotation_output)
        # scalar_first=True: the quaternion is read in (w, x, y, z) order.
        rot = R.from_quat(quat, scalar_first=True).as_matrix().squeeze(0)
        inverse_rot = np.linalg.inv(rot)

        # Accept either a scalar or a per-axis scale.
        scale = np.broadcast_to(np.asarray(scale_output, dtype=float), (3,)).copy()
        scale_mat = np.diag(scale)

        transform = np.eye(4)
        transform[:3, :3] = inverse_rot @ scale_mat
        transform[:3, 3] = copy.deepcopy(translation_output)
        mesh.apply_transform(transform)
        _flip_xy(mesh.vertices)

        if args.save_intermediate:
            mesh.export(os.path.join(assets_dir, f"{mask_stem}_mesh_with_inferenced_pose.obj"))

        # ===== Render to recover scale by area + depth ratio =====
        mesh_copy = copy.deepcopy(mesh)
        color, depth = mesh_rendering(
            mesh=mesh_copy,
            extrinsics=extrinsics,
            fov_y=fov_y / 180 * np.pi,
            renderer=mesh_renderer,
        )
        if args.save_intermediate:
            imageio.imwrite(
                os.path.join(image_dir, f"{mask_stem}_1_mesh.png"), color
            )

        valid_depth = depth[depth > 0]
        if len(valid_depth) == 0:
            # Nothing was rendered, so no scale can be estimated.
            if single_scene is not None:
                del single_scene
            del out
            torch.cuda.empty_cache()
            continue

        mean_depth_sam3d = np.mean(valid_depth)
        size_new = np.sum(depth > 0)
        # Linear size ratio (square root of the pixel-area ratio) times the
        # ratio of mean depths.
        scale_factor_local = (
            np.sqrt(size_ori / size_new) * (mean_depth_ori / mean_depth_sam3d)
        )
        mesh.apply_scale(scale_factor_local)

        # Second render: measure the remaining depth offset after scaling.
        mesh_copy = mesh.copy()
        color, depth = mesh_rendering(
            mesh=mesh_copy,
            extrinsics=extrinsics,
            fov_y=fov_y / 180 * np.pi,
            renderer=mesh_renderer,
        )
        valid_depth = depth[depth > 0]
        # NOTE(review): unlike the first render, an empty valid_depth is not
        # guarded here; np.mean of an empty array is nan.
        mean_depth_sam3d_2 = np.mean(valid_depth)
        z_shift_2 = mean_depth_ori - mean_depth_sam3d_2
        mesh.vertices = mesh.vertices + np.array([0, 0, z_shift_2])

        if args.save_intermediate:
            transformed_mesh_path = os.path.join(assets_dir, f"{mask_stem}_mesh_final.obj")
            mesh.export(transformed_mesh_path)
            print(f"Saved transformed mesh: {transformed_mesh_path}")
            color, _ = mesh_rendering(
                mesh=mesh,
                extrinsics=extrinsics,
                fov_y=fov_y / 180 * np.pi,
                renderer=mesh_renderer,
            )
            imageio.imwrite(
                os.path.join(image_dir, f"{mask_stem}_mesh.png"), color
            )

        # ===== Final export at origin (mesh + GS) =====
        # Only the first scale component is used for the exported asset.
        total_scale = float(scale_factor_local * scale_output[0]) / scale_factor
        print(
            f"Total scaling: {total_scale:.4f} "
            f"(local_scale={scale_factor_local:.4f}, "
            f"object_scale={scale_output[0]:.4f}, scene_scale={scale_factor})"
        )

        mesh_origin.apply_scale(total_scale)
        resized_mesh_path = os.path.join(assets_dir, f"{mask_stem}.obj")
        mesh_origin.export(resized_mesh_path)
        print(f"Saved resized mesh for mujoco: {resized_mesh_path}")

        final_mesh_mean_xyz = np.mean(mesh_origin.vertices, axis=0)
        mean_xyz_path = os.path.join(assets_dir, f"{mask_stem}_keyframe.npy")
        np.save(mean_xyz_path, final_mesh_mean_xyz)
        print(
            f"Final mesh mean XYZ: "
            f"[{final_mesh_mean_xyz[0]:.6f}, {final_mesh_mean_xyz[1]:.6f}, "
            f"{final_mesh_mean_xyz[2]:.6f}]"
        )
        print(f"Saved final mesh mean XYZ to: {mean_xyz_path}")

        if args.save_intermediate:
            # Apply the same transform to the GS scene so the debug snapshot
            # matches the mesh.
            xyz_cv = single_scene.get_xyz.clone() * scale_factor_local
            single_scene.from_xyz(xyz_cv)

            scale_t = single_scene.get_scaling * scale_factor_local
            single_scene.mininum_kernel_size *= scale_factor_local
            # NOTE(review): the floor below uses gs_origin's kernel size, not
            # the single_scene value updated on the previous line — confirm
            # which one is intended.
            scale_t = torch.maximum(
                scale_t,
                torch.tensor(
                    gs_origin.mininum_kernel_size * 1.1,
                    device=scale_t.device,
                    dtype=scale_t.dtype,
                ),
            )
            single_scene.from_scaling(scale_t)

            xyz_cv = single_scene.get_xyz.clone()
            xyz_cv[:, 2] = xyz_cv[:, 2] + z_shift_2
            single_scene.from_xyz(xyz_cv)

            single_ply_path = os.path.join(assets_dir, f"{mask_stem}_gs_final.ply")
            single_scene.save_ply(single_ply_path)
            print(f"Saved transformed Gaussian: {single_ply_path}")
            image_gs = render_gs_view(
                single_scene, extrinsics=extrinsics, fov_y=fov_y / 180 * np.pi
            )
            imageio.imwrite(
                os.path.join(image_dir, f"{mask_stem}_gs.png"), image_gs
            )

        # Scale the original GS to mujoco units and save.
        xyz = gs_origin.get_xyz * total_scale
        gs_origin.from_xyz(xyz)

        scale_t = gs_origin.get_scaling * total_scale
        gs_origin.mininum_kernel_size *= total_scale
        # Keep every Gaussian scale at or above 1.1x the (rescaled) minimum
        # kernel size.
        scale_t = torch.maximum(
            scale_t,
            torch.tensor(
                gs_origin.mininum_kernel_size * 1.1,
                device=scale_t.device,
                dtype=scale_t.dtype,
            ),
        )
        gs_origin.from_scaling(scale_t)

        origin_ply_path = os.path.join(assets_dir, f"{mask_stem}.ply")
        gs_origin.save_ply(origin_ply_path)
        print(f"Saved resized Gaussian for mujoco: {origin_ply_path}")

        # Drop per-object references before the next mask to limit GPU memory.
        if single_scene is not None:
            del single_scene
        del out
        torch.cuda.empty_cache()

        print(f"Completed processing mask: {mask_stem}")

    print(f"\nAll masks processed for image: {image_path}")
def main():
    """CLI entry point: run per-object reconstruction for every matching image.

    Loads the inference pipeline once from
    ``<project-root>/checkpoints/<tag>/pipeline.yaml``, walks the input
    directory for files named ``--image-name`` and processes each one. When
    no image is found, the image files present in the tree are listed to
    help diagnose the layout. A failure on one image is printed with its
    traceback and does not stop the remaining images.
    """
    import traceback

    parser = argparse.ArgumentParser(
        description=(
            "Run SAM-3D-Objects multi-object inference, save outputs to .pt, "
            "and reconstruct per-object Gaussian (.ply) and mesh (.obj) assets."
        )
    )
    parser.add_argument(
        "--project-root",
        type=str,
        default="submodule/Sam-3d-objects",
        help="Root directory of the sam-3d-objects project.",
    )
    parser.add_argument(
        "--input-dir",
        type=str,
        required=True,
        help="Input directory containing image folders.",
    )
    parser.add_argument(
        "--image-name",
        type=str,
        default="input_image.png",
        help="Name of the image file to process in each folder.",
    )
    parser.add_argument(
        "--tag",
        type=str,
        default="hf",
        help=(
            "Checkpoint tag, corresponds to "
            "submodule/Sam-3d-objects/checkpoints/{tag}/pipeline.yaml"
        ),
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed passed into Inference.__call__.",
    )
    parser.add_argument(
        "--save-pt",
        action="store_true",
        help="Save inference results to pt/*.pt. By default results are not saved.",
    )
    parser.add_argument(
        "--save-intermediate",
        action="store_true",
        help="Save intermediate debug outputs (e.g. *_1_gs.png, *_1_mesh.png).",
    )

    args = parser.parse_args()

    # A file argument means "search the folder that contains it".
    resolved_input = os.path.abspath(args.input_dir)
    if os.path.isfile(args.input_dir):
        input_dir = os.path.dirname(resolved_input)
    else:
        input_dir = resolved_input

    project_root = os.path.abspath(args.project_root)

    print(f"Project root (Sam-3d-objects): {project_root}")
    print(f"Input directory: {input_dir}")
    print(f"Looking for image files named: {args.image_name}")

    config_path = os.path.join(project_root, "checkpoints", args.tag, "pipeline.yaml")
    print(f"Loading model from config: {config_path}")
    inference = Inference(config_path, compile=False)
    print("Model loaded successfully")

    image_files = [
        os.path.join(folder, filename)
        for folder, _dirs, filenames in os.walk(input_dir)
        for filename in filenames
        if filename == args.image_name
    ]

    print(f"Found {len(image_files)} image files to process")

    if not image_files:
        # Show which images do exist, indented by folder depth.
        print(f"No {args.image_name} files found in {input_dir}")
        print("Directory structure:")
        image_suffixes = (".png", ".jpg", ".jpeg")
        for folder, _dirs, filenames in os.walk(input_dir):
            depth = folder.replace(input_dir, "").count(os.sep)
            folder_indent = " " * 2 * depth
            print(f"{folder_indent}{os.path.basename(folder)}/")
            file_indent = " " * 2 * (depth + 1)
            for filename in filenames:
                if filename.lower().endswith(image_suffixes):
                    print(f"{file_indent}{filename}")
        return

    banner = "=" * 80
    total = len(image_files)
    for position, image_path in enumerate(image_files, 1):
        print(f"\n{banner}")
        print(f"Processing image {position}/{total}")
        print(f"Image path: {image_path}")
        print(f"{banner}")

        try:
            process_single_image(image_path, inference, args)
            print(f"Successfully processed: {image_path}")
        except Exception as e:
            # Batch boundary: report and continue with the next image.
            print(f"Error processing {image_path}: {e}")
            traceback.print_exc()

    print(f"\n{banner}")
    print(f"All processing completed. Processed {total} images.")
    print(f"{banner}")


if __name__ == "__main__":
    main()
args = parser.parse_args() - - project_root = args.project_root - image_path = args.image_path - image_name = os.path.basename(os.path.dirname(image_path)) - - # 这里不再限定 object_*.pt,而是把 save-dir 下所有 .pt 都吃掉 - paths = sorted(glob.glob(os.path.join(args.save_dir, "*.pt"))) - if not paths: - raise RuntimeError(f"No .pt found under {args.save_dir}") - - print(f"Found {len(paths)} .pt files:") - for p in paths: - print(" ", p) - - device = "cuda" if torch.cuda.is_available() else "cpu" - - # 单物体输出目录 - single_gauss_dir = os.path.join(project_root, "gaussians", "single") - os.makedirs(single_gauss_dir, exist_ok=True) - - # 合并场景要用到的 outputs - outputs = [] - - if args.export_gif: - import imageio - - # ========================= - # 1️⃣ 遍历每个 .pt:导出单物体 PLY (+ 可选 GIF) - # ========================= - for idx, p in enumerate(paths): - print(f"[{idx+1}/{len(paths)}] loading {p}") - out = torch.load(p, map_location=device) - # 输出out 的dict键 - print(f" Output keys: {list(out.keys())}") - - outputs.append(out) - - # 只用 make_scene,不做 ready_gaussian_for_video_rendering - single_scene = make_scene(out) - - stem = os.path.splitext(os.path.basename(p))[0] - single_ply_path = os.path.join(single_gauss_dir, f"{stem}.ply") - single_scene.save_ply(single_ply_path) - print(f"🟢 Saved single-object PLY: {single_ply_path}") - - if args.export_gif: - video = render_video( - single_scene, - r=1, - fov=60, - resolution=512, - )["color"] - - single_gif_path = os.path.join(single_gauss_dir, f"{stem}.gif") - imageio.mimsave( - single_gif_path, - video, - format="GIF", - duration=1000 / 30, # 30fps - loop=0, - ) - print(f"🎞️ Saved single-object GIF: {single_gif_path}") - - # 如果显存很紧张,可以在这里 del single_scene / video 等 - del single_scene - - print("✅ All single-object scenes exported.") - - # ========================= - # 2️⃣ 合并多对象场景:PLY (+ 可选 GIF) - # ========================= - scene_gs = make_scene(*outputs) - scene_gs = ready_gaussian_for_video_rendering(scene_gs) - - gauss_dir = 
os.path.join(project_root, "gaussians", "multi") - os.makedirs(gauss_dir, exist_ok=True) - - ply_path = os.path.join(gauss_dir, f"{image_name}.ply") - scene_gs.save_ply(ply_path) - print(f"✅ Saved merged PLY: {ply_path}") - - if args.export_gif: - video = render_video( - scene_gs, - r=1, - fov=60, - resolution=512, - )["color"] - - gif_path = os.path.join(gauss_dir, f"{image_name}.gif") - imageio.mimsave( - gif_path, - video, - format="GIF", - duration=1000 / 30, # 30fps - loop=0, - ) - print(f"✅ Saved merged GIF: {gif_path}") - - -if __name__ == "__main__": - main() diff --git a/pipeline/run_sam3_agent_full.py b/pipeline/run_sam3_agent_full.py deleted file mode 100644 index c6b0290..0000000 --- a/pipeline/run_sam3_agent_full.py +++ /dev/null @@ -1,456 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -prompt + image -> SAM3 多物体分割 mask 的完整脚本: - -1. 用 Qwen3-VL-8B-Thinking 看图,生成若干条英文物体描述 prompt_list -2. 对每条 prompt 调用 SAM3 分割: - - 输出到 agent_output_multi/obj_i/*.json - - json 里包含 pred_masks(RLE)、overlay 图路径等 -3. 将所有 obj_i/*.json 里的 pred_masks 解码为 PNG 二值 mask: - - 保存到 agent_output_multi/masks/obj_i//mask_k.png - -之后,你的 run_sam3d_multi.py 里的 --mask-root -可以直接指向 agent_output_multi/masks。 -""" - -import os -import ast -import json -import argparse -from functools import partial -from typing import Optional - -import numpy as np -import torch -from PIL import Image -import pycocotools.mask as mask_util - -import sam3 -from sam3 import build_sam3_image_model -from sam3.model.sam3_image_processor import Sam3Processor -from sam3.agent.client_llm import send_generate_request as send_generate_request_orig -from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig - - -# ========================= -# 0. 环境变量(可按需精简) -# ========================= - - - -# ========================= -# 1. 
LLM 配置(Qwen3-VL) -# ========================= - -LLM_CONFIGS = { - # vLLM-served models - "qwen3_vl_8b_thinking": { - "provider": "vllm", - # model 不再写死,在 build_llm_config 时通过参数传入 - "model": None, - }, -} - - -def build_llm_config( - name: str = "qwen3_vl_8b_thinking", - model_id: Optional[str] = None, -): - """ - 构建 LLM config: - - name: 在 LLM_CONFIGS 里的 key - - model_id: 要发给 vLLM 的模型名称(需与 --served-model-name 一致) - """ - cfg = LLM_CONFIGS[name].copy() - cfg["name"] = name - cfg["api_key"] = "LOCAL_VLLM" - - if model_id is not None: - cfg["model"] = model_id - elif cfg.get("model") is None: - raise ValueError( - "LLM model id is not set. Please pass --llm-model-id to match vLLM --served-model-name." - ) - - if cfg["provider"] == "vllm": - server_url = "http://127.0.0.1:8001/v1" - else: - server_url = cfg["base_url"] - - return cfg, server_url - - -# ========================= -# 2. SAM3 模型构建 -# ========================= - -def build_sam3_processor() -> Sam3Processor: - sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..") - bpe_path = f"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz" - model = build_sam3_image_model(bpe_path=bpe_path) - processor = Sam3Processor(model, confidence_threshold=0.5) - return processor - - -# ========================= -# 3. Qwen 生成场景 prompt_list -# ========================= - -def generate_scene_prompts_with_qwen( - image_path: str, - send_generate_request, - llm_config: dict, - max_prompts: int = 12, - system_prompt_path: str = "examples/system_prompt_scene_prompts.txt", -): - """ - 1. 调 Qwen3-VL-8B-Thinking,看图生成可分割对象的英文短 prompt 列表。 - 2. 更鲁棒地解析 ...[...]...,在缺少 closing tag 时也能工作。 - 3. 
自动清洗掉 等无效内容。 - """ - - # 1) 读取 system prompt - if not os.path.exists(system_prompt_path): - raise FileNotFoundError(f"system prompt file not found: {system_prompt_path}") - - with open(system_prompt_path, "r", encoding="utf-8") as f: - system_prompt = f.read().strip() - - # 2) 构造 messages(带 image_url) - image_path = os.path.abspath(image_path) - image_url = f"file://{image_path}" - - messages = [ - {"role": "system", "content": system_prompt}, - { - "role": "user", - "content": [ - { - "type": "text", - "text": ( - "You are given the image above. " - "Follow the instructions in the system prompt to analyze the scene, " - "then output both ... and .... " - "Do NOT omit the block. The block must be a valid Python list of strings." - ), - }, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - }, - ] - - # 3) 调用 vLLM / Qwen - resp = send_generate_request(messages=messages) - - # 4) 统一拿到 raw_text - if isinstance(resp, str): - raw_text = resp - elif isinstance(resp, dict): - try: - raw_text = resp["choices"][0]["message"]["content"] - except Exception: - raw_text = str(resp) - else: - try: - raw_text = resp.choices[0].message.content - except Exception: - raw_text = str(resp) - - raw_text = raw_text.strip() - - # --------------------------- - # 5) 尝试从 中抽取“[...]”这段 - # --------------------------- - list_block = raw_text - - # 先截掉 前面的分析内容 - if "" in raw_text: - after_tag = raw_text.split("", 1)[1] - list_block = after_tag - # 如果有 closing tag,再截掉后面 - if "" in list_block: - list_block = list_block.split("", 1)[0] - - # 从 list_block 中找第一个 '[' 和最后一个 ']',尽量拿到一个完整的 Python list 字符串 - inner = None - start = list_block.find("[") - end = list_block.rfind("]") - if start != -1 and end != -1 and end > start: - inner = list_block[start : end + 1].strip() - - # 如果还是没拿到,就 fallback:把整个 list_block 当作 inner - if inner is None: - inner = list_block.strip() - - # --------------------------- - # 6) 解析 inner -> Python list[str] - # --------------------------- - prompt_list: 
list[str] = [] - - # 优先 literal_eval - try: - data = ast.literal_eval(inner) - if isinstance(data, list): - prompt_list = [ - s.strip() - for s in data - if isinstance(s, str) and s.strip() - ] - else: - raise ValueError("parsed object is not a list") - except Exception: - # fallback:行级解析(更严格一点,只收“看起来像短 prompt”的行) - lines = [l.strip() for l in inner.splitlines() if l.strip()] - tmp: list[str] = [] - for l in lines: - # 跳过明显是 tag 或分析段落的行 - if l.startswith("<") and l.endswith(">"): - continue - if l in ("", ""): - continue - - # 如果是形如 1. xxx / 2) xxx - if l[0].isdigit(): - parts = l.split(maxsplit=1) - if len(parts) == 2: - candidate = parts[1].lstrip(".)").strip() - else: - candidate = l - else: - candidate = l - - # 简单过滤掉过长的整段分析(比如一个大段落 > 200 字符) - if len(candidate) > 200: - continue - - if candidate: - tmp.append(candidate) - - prompt_list = tmp - - # --------------------------- - # 7) 最后再清洗一遍 prompt_list - # --------------------------- - cleaned: list[str] = [] - for s in prompt_list: - s = s.strip() - if not s: - continue - # 丢掉残余的 tag / think - if s.startswith("<") and s.endswith(">"): - continue - if s in ("", ""): - continue - cleaned.append(s) - - prompt_list = cleaned[:max_prompts] - return raw_text, prompt_list - - -# ========================= -# 4. 
JSON → PNG mask 工具 -# ========================= - -def safe_name(name: str) -> str: - """简单处理一下名字中的空格,避免路径问题。""" - return name.replace(" ", "_") - - -def decode_rle_mask(counts: str, h: int, w: int) -> np.ndarray: - """将 SAM3/COCO RLE 字符串解码为 (h, w) 的 0/1 uint8 mask。""" - rle = {"counts": counts.encode("utf-8"), "size": [h, w]} - mask = mask_util.decode(rle) # (h, w, 1) 或 (h, w) - if mask.ndim == 3: - mask = mask[:, :, 0] - return mask.astype(np.uint8) - - -def convert_agent_json_to_masks(agent_root: str): - """ - 遍历 agent_root 下的 obj_*/ 目录, - 把所有 json 里的 pred_masks 解码为 PNG mask。 - - 输出结构: - agent_root/masks/obj_i//mask_k.png - """ - agent_root = os.path.abspath(agent_root) - mask_root = os.path.join(agent_root, "masks") - os.makedirs(mask_root, exist_ok=True) - - print(f"[INFO] Converting JSON → PNG masks under: {agent_root}") - print(f"[INFO] Masks will be saved to: {mask_root}") - - for obj_name in os.listdir(agent_root): - obj_dir = os.path.join(agent_root, obj_name) - if not os.path.isdir(obj_dir): - continue - if os.path.abspath(obj_dir) == os.path.abspath(mask_root): - continue - - safe_obj_name = safe_name(obj_name) - obj_mask_root = os.path.join(mask_root, safe_obj_name) - os.makedirs(obj_mask_root, exist_ok=True) - - print(f"\n=== Scanning folder: {obj_dir} → {obj_mask_root} ===") - - for root, _, files in os.walk(obj_dir): - for fname in files: - if not fname.endswith(".json"): - continue - - json_path = os.path.join(root, fname) - - try: - with open(json_path, "r") as f: - data = json.load(f) - except Exception as e: - print(f" [SKIP] Failed to load {json_path}: {e}") - continue - - # 某些是 list(history log),直接跳过 - if not isinstance(data, dict): - print(f" [SKIP] {json_path}: json is list, not mask dict") - continue - - pred_masks = data.get("pred_masks") - if not pred_masks: - print(f" [SKIP] {json_path}: no pred_masks") - continue - - h = data.get("orig_img_h") - w = data.get("orig_img_w") - if h is None or w is None: - print(f" [SKIP] {json_path}: 
missing height/width") - continue - - json_basename = os.path.splitext(os.path.basename(json_path))[0] - safe_json_basename = safe_name(json_basename) - - out_dir = os.path.join(obj_mask_root, safe_json_basename) - os.makedirs(out_dir, exist_ok=True) - - print(f" [OK] {json_path}: {len(pred_masks)} masks → {out_dir}") - - scores = data.get("pred_scores", []) - for i, counts in enumerate(pred_masks): - mask = decode_rle_mask(counts, h, w) - - mask_save_path = os.path.join(out_dir, f"mask_{i+1}.png") - Image.fromarray(mask * 255).save(mask_save_path) - - score_str = f", score={scores[i]:.3f}" if i < len(scores) else "" - print(f" saved mask_{i+1}.png{score_str}") - - -# ========================= -# 5. 主流程:prompt + img -> mask -# ========================= - -def main(): - parser = argparse.ArgumentParser( - description="Qwen3-VL + SAM3: prompt+image -> multi-object masks" - ) - parser.add_argument( - "--image-path", - type=str, - default="sam3/assets/img.jpg", - help="输入图片路径", - ) - parser.add_argument( - "--output-root", - type=str, - default="sam3/agent_output_multi", - help="SAM3 多物体输出根目录(内部会建 obj_1, obj_2, ...)", - ) - parser.add_argument( - "--system-prompt-path", - type=str, - default="sam3/examples/system_prompt_scene_prompts.txt", - help="Qwen 用的 system prompt 文本路径", - ) - parser.add_argument( - "--max-prompts", - type=int, - default=12, - help="最多保留多少个物体 prompt", - ) - parser.add_argument( - "--skip-first", - action="store_true", - help="是否丢弃 prompt_list 的第一个元素(如果它更像场景描述而不是具体物体)", - ) - parser.add_argument( - "--llm-model-id", - type=str, - default="sam3/models", - help="发送给 LLM 服务的模型名称(需与 vLLM --served-model-name 一致)", - ) - - args = parser.parse_args() - - - # 构建 LLM & SAM3 - llm_config, llm_server_url = build_llm_config( - name="qwen3_vl_8b_thinking", - model_id=args.llm_model_id, - ) - processor = build_sam3_processor() - - send_generate_request = partial( - send_generate_request_orig, - server_url=llm_server_url, - model=llm_config["model"], - 
api_key=llm_config["api_key"], - ) - call_sam_service = partial(call_sam_service_orig, sam3_processor=processor) - - image = os.path.abspath(args.image_path) - output_root = os.path.abspath(args.output_root) - os.makedirs(output_root, exist_ok=True) - - # 1) Qwen 生成场景 prompt_list - print(f"[INFO] Generating prompts for image: {image}") - raw_text, prompt_list = generate_scene_prompts_with_qwen( - image_path=image, - send_generate_request=send_generate_request, - llm_config=llm_config, - max_prompts=args.max_prompts, - system_prompt_path=args.system_prompt_path, - ) - - print("\n====== 原始 Qwen 输出(raw_text,截断开头 800 字) ======") - print(raw_text[:800]) - print("......\n") - - if args.skip_first and len(prompt_list) > 1: - prompt_list = prompt_list[1:] - - print("====== 解析后的 prompt_list ======") - for i, p in enumerate(prompt_list, start=1): - print(f"{i}. {p}") - - # 2) 逐个 prompt 调用 SAM3,写入 json - for i, prompt in enumerate(prompt_list, start=1): - print(f"\n================ [Prompt {i}] {prompt} ================\n") - - this_output_dir = os.path.join(output_root, f"obj_{i}") - os.makedirs(this_output_dir, exist_ok=True) - - json_path = call_sam_service( - image_path=image, - text_prompt=prompt, - output_folder_path=this_output_dir, - ) - print(f"[OK] SAM3 output json: {json_path}") - - # 3) 把所有 json 里的 pred_masks 解码为 PNG mask - convert_agent_json_to_masks(output_root) - - print("\n✅ All done. 
Masks are under:") - print(f" {os.path.join(output_root, 'masks')}") - - -if __name__ == "__main__": - main() diff --git a/pipeline/run_sam3d_multi.py b/pipeline/run_sam3d_multi.py deleted file mode 100644 index 4ea9f00..0000000 --- a/pipeline/run_sam3d_multi.py +++ /dev/null @@ -1,146 +0,0 @@ -import os -import argparse - -import numpy as np -import torch -from PIL import Image - -from inference import Inference - -import re - -def clean_name(x: str): - return re.sub(r'[^0-9a-zA-Z_]', '', x) - - - - -def load_image(path: str) -> Image.Image: - img = Image.open(path).convert("RGB") - return img - - -def collect_mask_paths(mask_root: str): - """ - 递归收集 mask_root 下所有 png/jpg/jpeg 的路径。 - """ - all_mask_paths = [] - for root, _, files in os.walk(mask_root): - for f in files: - lf = f.lower() - if lf.endswith(".png") or lf.endswith(".jpg") or lf.endswith(".jpeg"): - all_mask_paths.append(os.path.join(root, f)) - - all_mask_paths.sort() - print(f"Found {len(all_mask_paths)} mask files under {mask_root}") - return all_mask_paths - - -def load_binary_mask(path: str): - """ - 单个 mask 文件 → 二值 uint8 数组 (H, W), {0, 1} - """ - m = np.array(Image.open(path).convert("L")) - m = (m > 128).astype("uint8") - return m - - -def main(): - parser = argparse.ArgumentParser( - description="Run SAM3D multi-object inference and save outputs to .pt" - ) - parser.add_argument( - "--image-path", - type=str, - default="sam3/assets/img.jpg", - help="Input image path to lift to 3D.", - ) - parser.add_argument( - "--mask-root", - type=str, - default="sam3/agent_output_multi/masks", - help="Directory containing mask PNG/JPGs.", - ) - parser.add_argument( - "--save-dir", - type=str, - default="sam-3d-objects/torch_save_pt", - help="Where to save _.pt files.", - ) - parser.add_argument( - "--tag", - type=str, - default="hf", - help="Checkpoint tag, corresponds to ../sam-3d-objects/checkpoints/{tag}/pipeline.yaml", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed 
passed into Inference.__call__.", - ) - parser.add_argument( - "--project-root", - type=str, - default=None, - help=( - "Root directory of sam-3d-objects repo. " - "If not set, will be inferred as /../sam-3d-objects." - ), - ) - args = parser.parse_args() - - - - script_dir = os.path.dirname(os.path.abspath(__file__)) - - if args.project_root is not None: - # 如果用户通过命令行显式传入了 --project-root,就直接用它 - project_root = os.path.abspath(args.project_root) - else: - # 否则自动推断:假设当前脚本位于 sam3d_gs/pipeline/ 下, - # sam-3-objects 位于 sam3d_gs/sam-3-objects - project_root = os.path.abspath(os.path.join(script_dir, "..", "sam-3-objects")) - - print(f"Project root (sam-3-objects): {project_root}") - - config_path = os.path.join(project_root, "checkpoints", args.tag, "pipeline.yaml") - print(f"Using config: {config_path}") - inference = Inference(config_path, compile=False) - - pil_image = load_image(args.image_path) - image = np.array(pil_image) - - mask_paths = collect_mask_paths(args.mask_root) - if not mask_paths: - raise RuntimeError(f"No mask images found under {args.mask_root}") - - os.makedirs(args.save_dir, exist_ok=True) - - for i, mask_path in enumerate(mask_paths): - print(f"[{i+1}/{len(mask_paths)}] running inference on mask: {mask_path}") - - mask = load_binary_mask(mask_path) - - out = inference(image, mask, seed=args.seed) - - # 构造保存名字:父目录名 + "_" + mask 文件名(无扩展) - parent_name_raw = os.path.basename(os.path.dirname(mask_path)) - parent_name = clean_name(parent_name_raw) - mask_stem_raw = os.path.splitext(os.path.basename(mask_path))[0] - mask_stem = clean_name(mask_stem_raw) - save_name = f"{parent_name}_{mask_stem}.pt" - save_path = os.path.join(args.save_dir, save_name) - - torch.save(out, save_path) - print(f"✅ Saved: {save_path}") - - # 显式释放显存 - del out - torch.cuda.empty_cache() - - print("✅ All objects processed and saved as .pt") - - -if __name__ == "__main__": - main() diff --git a/pipeline/utils.py b/pipeline/utils.py new file mode 100644 index 0000000..bf4c986 
--- /dev/null +++ b/pipeline/utils.py @@ -0,0 +1,200 @@ +import re +import os +import atexit +os.environ["PYOPENGL_PLATFORM"] = "egl" +from PIL import Image +import trimesh +import pyrender +import numpy as np +import imageio + + +_DEFAULT_MESH_RENDERERS = {} + + +class MeshRenderContext: + def __init__( + self, + width=448, + height=448, + add_axis=False, + debug_depth_path=None, + verbose=False, + ): + self.width = width + self.height = height + self.add_axis = add_axis + self.debug_depth_path = debug_depth_path + self.verbose = verbose + self.renderer = pyrender.OffscreenRenderer(width, height) + self.material = pyrender.MetallicRoughnessMaterial( + baseColorFactor=[0.7, 0.7, 0.7, 1.0], + metallicFactor=0.0, + roughnessFactor=1.0, + ) + self.cv_to_gl = np.array( + [ + [1, 0, 0, 0], + [0, -1, 0, 0], + [0, 0, -1, 0], + [0, 0, 0, 1], + ], + dtype=np.float32, + ) + + def close(self): + if self.renderer is not None: + self.renderer.delete() + self.renderer = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def render(self, mesh, extrinsics, fov_y): + if self.renderer is None: + self.renderer = pyrender.OffscreenRenderer(self.width, self.height) + + if self.verbose: + print( + f"vertices shape {mesh.vertices.shape} " + f"mesh vertices mean {np.mean(mesh.vertices, axis=0)}" + ) + + render_mesh = pyrender.Mesh.from_trimesh( + mesh, + material=self.material, + smooth=False, + ) + + scene = pyrender.Scene() + scene.add(render_mesh) + + camera = pyrender.PerspectiveCamera( + yfov=fov_y, + aspectRatio=self.width / self.height, + ) + + camera_pose = extrinsics @ self.cv_to_gl + scene.add(camera, pose=camera_pose) + + if self.add_axis: + axis = trimesh.creation.axis(axis_length=0.5) + scene.add(pyrender.Mesh.from_trimesh(axis, smooth=False)) + + light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0) + scene.add(light, pose=camera_pose) + + color, depth = self.renderer.render(scene) + + if 
self.debug_depth_path: + depth_min = depth.min() + depth_range = depth.max() - depth_min + if depth_range > 0: + depth_normalized = ( + (depth - depth_min) / depth_range * 255 + ).astype(np.uint8) + else: + depth_normalized = np.zeros_like(depth, dtype=np.uint8) + imageio.imwrite(self.debug_depth_path, depth_normalized) + + if self.verbose: + valid_depth = depth[depth > 0] + valid_mean = valid_depth.mean() if valid_depth.size > 0 else np.nan + print( + f"max depth {depth.max()}, min depth {depth.min()}, " + f"mean depth {depth.mean()}, valid mean depth {valid_mean}" + ) + + return color, depth + + +def get_default_mesh_renderer( + width=448, + height=448, + add_axis=False, + debug_depth_path=None, + verbose=False, +): + key = (width, height, add_axis, debug_depth_path, verbose) + renderer = _DEFAULT_MESH_RENDERERS.get(key) + if renderer is None: + renderer = MeshRenderContext( + width=width, + height=height, + add_axis=add_axis, + debug_depth_path=debug_depth_path, + verbose=verbose, + ) + _DEFAULT_MESH_RENDERERS[key] = renderer + return renderer + + +def close_default_mesh_renderers(): + for renderer in _DEFAULT_MESH_RENDERERS.values(): + renderer.close() + _DEFAULT_MESH_RENDERERS.clear() + + +atexit.register(close_default_mesh_renderers) + + +def clean_name(x: str): + return re.sub(r'[^0-9a-zA-Z_-]', '', x) + + +def load_image(path: str) -> Image.Image: + img = Image.open(path).convert("RGB") + return img + + +def collect_mask_paths(mask_root: str): + """Recursively collect all .png / .jpg / .jpeg paths under mask_root.""" + all_mask_paths = [] + for root, _, files in os.walk(mask_root): + for f in files: + lf = f.lower() + if lf.endswith(".png") or lf.endswith(".jpg") or lf.endswith(".jpeg"): + all_mask_paths.append(os.path.join(root, f)) + + all_mask_paths.sort() + print(f"Found {len(all_mask_paths)} mask files under {mask_root}") + return all_mask_paths + + +def compute_fov_from_intrinsics(fx, fy, image_size, degrees=True): + """Compute horizontal / vertical 
FOV from pixel-unit fx, fy.""" + height, width = image_size + + fov_y = 2 * np.arctan(height / (2 * fy)) + fov_x = 2 * np.arctan(width / (2 * fx)) + + if degrees: + fov_y = np.degrees(fov_y) + fov_x = np.degrees(fov_x) + + return fov_x, fov_y + +def mesh_rendering( + mesh, + extrinsics, + fov_y, + renderer=None, + width=448, + height=448, + add_axis=False, + debug_depth_path=None, + verbose=False, +): + if renderer is None: + renderer = get_default_mesh_renderer( + width=width, + height=height, + add_axis=add_axis, + debug_depth_path=debug_depth_path, + verbose=verbose, + ) + return renderer.render(mesh, extrinsics, fov_y) + diff --git a/run_agent_with_vllm.sh b/run_agent_with_vllm.sh deleted file mode 100644 index 3ed5925..0000000 --- a/run_agent_with_vllm.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env bash -set -e - -############################################ -# 0. Resolve project root (directory of this script) -############################################ -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -############################################ -# 1. 
Global config (paths are relative to SCRIPT_DIR) -############################################ -export HF_ENDPOINT="https://hf-mirror.com" - -export HF_HOME="${SCRIPT_DIR}/huggingface" -export TRANSFORMERS_CACHE="${HF_HOME}" -export HF_DATASETS_CACHE="${HF_HOME}" -export HF_HUB_CACHE="${HF_HOME}" - -# Path to conda initialization script (usually absolute) -CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh" - -# Conda env names -VLLM_ENV="vllm" -SAM3_ENV="sam3" - -# vLLM model directory (where Qwen3-VL-8B-Thinking will be downloaded) -VLLM_MODEL_DIR="${SCRIPT_DIR}/models/qwen3_vl_8b_thinking" - -# Model name exposed by vLLM and used by the Python script (--llm-model-id) -SERVED_MODEL_NAME="qwen3-vl-8b-thinking" - -# vLLM server port -VLLM_PORT=8001 - -# SAM3 agent script (Python entry) -AGENT_SCRIPT="${SCRIPT_DIR}/pipeline/run_sam3_agent_full.py" - -# Input image -IMAGE_PATH="${SCRIPT_DIR}/assets/img.jpg" - -# Output root directory -OUTPUT_ROOT="${SCRIPT_DIR}/outputs/master_with_vllm" - -# System prompt file for Qwen -SYSTEM_PROMPT_PATH="${SCRIPT_DIR}/assets/system_prompt_scene_prompts.txt" - -# vLLM log -LOG_DIR="${SCRIPT_DIR}/logs" -mkdir -p "${LOG_DIR}" -VLLM_LOG="${LOG_DIR}/vllm_server.log" - -############################################ -# 2. Initialize conda -############################################ -if [ -f "${CONDA_SH}" ]; then - # Enable `conda activate` - # shellcheck disable=SC1090 - source "${CONDA_SH}" -else - echo "ERROR: conda.sh not found at ${CONDA_SH}" - exit 1 -fi - -############################################ -# 3. HuggingFace login (interactive, in vLLM env) -############################################ -echo ">>> Activating conda env: ${VLLM_ENV}" -conda activate "${VLLM_ENV}" - -echo ">>> Running 'hf auth login' (you may be prompted for a token)..." -hf auth login -echo ">>> HuggingFace login finished ✓" - -############################################ -# 4. 
Download Qwen3-VL-8B-Thinking if model dir is empty -############################################ -if [ ! -d "${VLLM_MODEL_DIR}" ] || [ -z "$(ls -A "${VLLM_MODEL_DIR}" 2>/dev/null)" ]; then - echo ">>> Model directory is empty: ${VLLM_MODEL_DIR}" - echo ">>> Auto-downloading Qwen/Qwen3-VL-8B-Thinking ..." - - mkdir -p "${VLLM_MODEL_DIR}" - - if command -v huggingface-cli >/dev/null 2>&1; then - huggingface-cli download \ - Qwen/Qwen3-VL-8B-Thinking \ - --local-dir "${VLLM_MODEL_DIR}" \ - --local-dir-use-symlinks False - elif command -v hf >/dev/null 2>&1; then - hf snapshot download Qwen/Qwen3-VL-8B-Thinking \ - --local-dir "${VLLM_MODEL_DIR}" \ - --local-dir-use-symlinks False - else - echo "ERROR: Neither 'huggingface-cli' nor 'hf' CLI is installed." - echo "Please install with: pip install -U huggingface_hub" - exit 1 - fi - - echo ">>> Model download complete!" -else - echo ">>> Model already exists at ${VLLM_MODEL_DIR}, skip download." -fi - -############################################ -# 5. Start vLLM server (still in vLLM env) -############################################ -echo ">>> Starting vLLM server on GPUs 6,7 ..." -CUDA_VISIBLE_DEVICES=6,7 \ -vllm serve "${VLLM_MODEL_DIR}" \ - --tensor-parallel-size 2 \ - --dtype float16 \ - --gpu-memory-utilization 0.9 \ - --max-model-len 65536 \ - --max-num-seqs 4 \ - --port 8001 \ - --allowed-local-media-path / \ - --served-model-name "${SERVED_MODEL_NAME}" \ - > "${VLLM_LOG}" 2>&1 & - -VLLM_PID=$! -echo ">>> vLLM server started. PID = ${VLLM_PID}" -echo ">>> Logs: ${VLLM_LOG}" - -echo ">>> Waiting for vLLM server to become ready..." -until curl -s "http://localhost:${VLLM_PORT}/v1/models" > /dev/null; do - echo "vLLM not ready yet, waiting 2s..." - sleep 2 -done -echo ">>> vLLM server is ready!" - -############################################ -# 6. 
Run SAM3 agent (in sam3 env) -############################################ -echo ">>> Activating SAM3 env: ${SAM3_ENV}" -conda activate "${SAM3_ENV}" - -echo ">>> Running SAM3 agent with CUDA_VISIBLE_DEVICES=0 ..." -CUDA_VISIBLE_DEVICES=0 \ -python "${AGENT_SCRIPT}" \ - --image-path "${IMAGE_PATH}" \ - --output-root "${OUTPUT_ROOT}" \ - --system-prompt-path "${SYSTEM_PROMPT_PATH}" \ - --llm-model-id "${SERVED_MODEL_NAME}" \ - --skip-first - -echo ">>> SAM3 agent finished." - -############################################ -# 7. Done (vLLM is still running) -############################################ -echo ">>> All done. vLLM is still running with PID = ${VLLM_PID}" -echo ">>> To stop it manually, run: kill ${VLLM_PID}" diff --git a/run_docker.sh b/run_docker.sh new file mode 100755 index 0000000..e48454e --- /dev/null +++ b/run_docker.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Launch sam3d-gs:latest with host checkpoints + data bind-mounted. +# +# Usage: +# run_docker.sh [PROJECT_DIR] [HF_CACHE_DIR] +# +# PROJECT_DIR Path to the sam3d_gs repo on the host. +# Defaults to the directory this script lives in. +# HF_CACHE_DIR Path to host HuggingFace cache (so AnySplat and other +# HF models are reused across container starts). +# Defaults to ${HF_HOME:-$HOME/.cache/huggingface}. +# +# Environment overrides: +# SAM3D_IMAGE Docker image to run. Default: sam3d-gs:latest +# TORCH_HOME Host PyTorch hub cache (DINOv2 etc. land here). +# Default: $HOME/.cache/torch + +set -euo pipefail + +DEFAULT_REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO="${1:-${DEFAULT_REPO}}" +HF_CACHE="${2:-${HF_HOME:-${HOME}/.cache/huggingface}}" +TORCH_CACHE="${TORCH_HOME:-${HOME}/.cache/torch}" +IMAGE="${SAM3D_IMAGE:-sam3d-gs:latest}" + +REPO="$(realpath "${REPO}")" +HF_CACHE="$(realpath -m "${HF_CACHE}")" +TORCH_CACHE="$(realpath -m "${TORCH_CACHE}")" + +# Sanity-check that PROJECT_DIR really looks like the sam3d_gs repo. 
+for marker in submodule/Sam-3d-objects submodule/Prompt-Inpaint scripts/install_env.sh; do + if [[ ! -e "${REPO}/${marker}" ]]; then + echo "ERROR: ${REPO} does not look like a sam3d_gs checkout (missing ${marker})." >&2 + echo "Pass the project root explicitly: $0 /path/to/sam3d_gs" >&2 + exit 1 + fi +done + +# Ensure host-side bind targets exist (Docker would otherwise create them as root). +mkdir -p \ + "${REPO}/submodule/Sam-3d-objects/checkpoints" \ + "${REPO}/submodule/Prompt-Inpaint/checkpoints" \ + "${REPO}/data" \ + "${REPO}/example" \ + "${HF_CACHE}" \ + "${TORCH_CACHE}" + +echo "==> repo: ${REPO}" +echo "==> hf cache: ${HF_CACHE}" +echo "==> torch cache: ${TORCH_CACHE}" +echo "==> image: ${IMAGE}" + +docker run --rm -it \ + --gpus all \ + --shm-size=8g \ + --network host \ + -v "${REPO}/submodule/Sam-3d-objects/checkpoints":/opt/sam3d_gs/submodule/Sam-3d-objects/checkpoints \ + -v "${REPO}/submodule/Prompt-Inpaint/checkpoints":/opt/sam3d_gs/submodule/Prompt-Inpaint/checkpoints \ + -v "${HF_CACHE}":/root/.cache/huggingface \ + -v "${TORCH_CACHE}":/root/.cache/torch \ + -v "${REPO}/data":/opt/sam3d_gs/data \ + -v "${REPO}/example":/opt/sam3d_gs/example \ + "${IMAGE}" diff --git a/run_object_generation_pipeline.sh b/run_object_generation_pipeline.sh new file mode 100755 index 0000000..aac03a5 --- /dev/null +++ b/run_object_generation_pipeline.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ $# -lt 1 || $# -gt 2 ]]; then + echo "Usage: $0 [path]" + echo "Example: $0 data/new-desk/input_image.png" + exit 1 +fi + +path_img="$1" +if [[ $# -eq 2 ]]; then + path="$2" +else + path="$(dirname "${path_img}")" +fi + +path_img="$(realpath "${path_img}")" +path="$(realpath "${path}")" + +if [[ ! -f "${path_img}" ]]; then + echo "Input image not found: ${path_img}" + exit 1 +fi + +if [[ ! 
-d "${path}" ]]; then + echo "Input directory not found: ${path}" + exit 1 +fi + +source "${SCRIPT_DIR}/.venv/bin/activate" + +export PYTHONPATH="${SCRIPT_DIR}/submodule/Sam-3d-objects/notebook:${SCRIPT_DIR}/submodule/Sam-3d-objects:${PYTHONPATH:-}" + +echo "Python: $(which python)" +echo "Image: ${path_img}" +echo "Directory: ${path}" + +# Bootstrap gated HuggingFace weights on first run. +# Both models are gated; the user must have run `hf auth login` and accepted +# the model agreements for facebook/sam-3d-objects and facebook/sam3. +SAM3D_PIPELINE_YAML="${SCRIPT_DIR}/submodule/Sam-3d-objects/checkpoints/hf/pipeline.yaml" +SAM3_WEIGHT="${SCRIPT_DIR}/submodule/Prompt-Inpaint/checkpoints/sam3.pt" +if [[ ! -f "${SAM3D_PIPELINE_YAML}" || ! -f "${SAM3_WEIGHT}" ]]; then + echo "==> One or more gated checkpoints missing locally; running bootstrap..." + bash "${SCRIPT_DIR}/scripts/download_checkpoints.sh" +fi + +echo "==> Step 1/3: Prompt-Inpaint" +python "${SCRIPT_DIR}/submodule/Prompt-Inpaint/main.py" \ + --resize-output \ + --save-individual-masks \ + --config "${SCRIPT_DIR}/submodule/Prompt-Inpaint/configs/items.yml" \ + --image "${path_img}" \ + --output-dir "${path}" + +echo "==> Step 2/3: AnySplat" +python "${SCRIPT_DIR}/pipeline/background_reconstruction.py" "${path}" + +echo "==> Step 3/3: Object generation" +python "${SCRIPT_DIR}/pipeline/objects_generation.py" --input-dir "${path}" + +echo "Done." diff --git a/run_pipeline.sh b/run_pipeline.sh deleted file mode 100644 index 547c65b..0000000 --- a/run_pipeline.sh +++ /dev/null @@ -1,2 +0,0 @@ -bash run_agent_with_vllm.sh -bash run_sam3d_from_masks.sh \ No newline at end of file diff --git a/run_sam3d_from_masks.sh b/run_sam3d_from_masks.sh deleted file mode 100644 index 924d1f5..0000000 --- a/run_sam3d_from_masks.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env bash -# 不要开 -u,会和 conda activate 脚本打架 -set -eo pipefail - -############################################ -# 0. 
Resolve project root (directory of this script) -############################################ -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# 防止 conda activate 的 binutils 脚本里引用未定义 ADDR2LINE -export ADDR2LINE=addr2line - -############################################ -# 1. Global config (all paths relative to SCRIPT_DIR) -############################################ - -# GPU used for SAM3D reconstruction -export CUDA_VISIBLE_DEVICES="0" - -# HF / Torch cache (和 run_agent_with_vllm.sh 共用一套) -export HF_ENDPOINT="https://hf-mirror.com" -export HF_HOME="${SCRIPT_DIR}/huggingface" -export TRANSFORMERS_CACHE="${HF_HOME}" -export HF_DATASETS_CACHE="${HF_HOME}" -export HF_HUB_CACHE="${HF_HOME}" -export HF_HUB_ENABLE_HF_TRANSFER=0 - -export TORCH_HOME="${SCRIPT_DIR}/torch_hub" -export TORCH_HUB="${SCRIPT_DIR}/torch_hub" - -# Conda init script (absolute) -CONDA_SH="/your_path/miniconda3/etc/profile.d/conda.sh" - -# Conda env for SAM3D -SAM3D_ENV="sam3d-objects" - -# sam-3d-objects repo root -PROJECT_ROOT="${SCRIPT_DIR}/sam-3d-objects" - -# Where sam-3-objects stores intermediate .pt -PT_SAVE_DIR="${PROJECT_ROOT}/outputs/torch_save_pt" - -# Checkpoints / config paths -CHECKPOINTS_DIR="${PROJECT_ROOT}/checkpoints" -PIPELINE_YAML="${CHECKPOINTS_DIR}/hf/pipeline.yaml" - -# Python entry scripts (放在 sam3d_gs/pipeline 下) -SAM3D_MULTI_SCRIPT="${SCRIPT_DIR}/pipeline/run_sam3d_multi.py" -RECONSTRUCT_SCRIPT="${SCRIPT_DIR}/pipeline/reconstruct_from_pt.py" - -# Input image: 使用和 SAM3 agent 一样的图 -IMAGE_PATH="${SCRIPT_DIR}/assets/img.jpg" - -# 🔴 关键:mask-root = SAM3 agent 的 mask 输出目录 -# 如果你的 run_sam3_agent_full.py 把 mask 写在: -# outputs/master_with_vllm/masks -# 就用这一行: -MASK_ROOT="${SCRIPT_DIR}/outputs/master_with_vllm/masks" -# 如果暂时还用旧目录,比如 sam3/agent_output_multi/masks,可以改成: -# MASK_ROOT="${SCRIPT_DIR}/sam3/agent_output_multi/masks" - -# Run configs -TAG="hf" -SEED=42 -EXPORT_GIF=1 # 1 = reconstruct 时加 --export-gif,0 = 不导出 GIF - -############################################ 
-# 2. Initialize conda -############################################ -if [ -f "${CONDA_SH}" ]; then - # shellcheck disable=SC1090 - source "${CONDA_SH}" -else - echo "ERROR: conda.sh not found at ${CONDA_SH}" - exit 1 -fi - -echo ">>> Activating conda env: ${SAM3D_ENV}" -conda activate "${SAM3D_ENV}" - -mkdir -p "${PT_SAVE_DIR}" - -############################################ -# 2.5. Ensure checkpoints/${TAG}/pipeline.yaml -############################################ -if [ ! -f "${PIPELINE_YAML}" ]; then - echo ">>> pipeline.yaml not found at: ${PIPELINE_YAML}" - echo ">>> Downloading checkpoints from facebook/sam-3d-objects ..." - echo ">>> (确保已运行 'hf auth login' 并在网页上接受模型协议)" - - # 关闭 hf_transfer(在镜像环境下容易出奇怪错误) - export HF_HUB_ENABLE_HF_TRANSFER=0 - - # 临时下载目录(避免直接弄脏 sam-3d-objects 根目录) - TMP_DIR="${CHECKPOINTS_DIR}/.tmp_download_${TAG}" - rm -rf "${TMP_DIR}" - mkdir -p "${TMP_DIR}" - - # 1) 把远端的 checkpoints/** 全部下载到临时目录 - if command -v huggingface-cli >/dev/null 2>&1; then - huggingface-cli download \ - facebook/sam-3d-objects \ - --local-dir "${TMP_DIR}" \ - --local-dir-use-symlinks False \ - --include "checkpoints/**" - elif command -v hf >/dev/null 2>&1; then - hf snapshot download \ - facebook/sam-3d-objects \ - --local-dir "${TMP_DIR}" \ - --local-dir-use-symlinks False \ - --include "checkpoints/**" - else - echo "ERROR: neither 'huggingface-cli' nor 'hf' CLI is installed." - echo " Try: pip install -U huggingface_hub" - rm -rf "${TMP_DIR}" - exit 1 - fi - - # 2) 远端结构:TMP_DIR/checkpoints/... - # 本地目标:CHECKPOINTS_DIR/TAG/... - mkdir -p "${CHECKPOINTS_DIR}/${TAG}" - - if [ -d "${TMP_DIR}/checkpoints" ]; then - echo ">>> Moving downloaded checkpoints into checkpoints/${TAG} ..." - # 把 checkpoints/* 都移到 checkpoints/hf/ - mv "${TMP_DIR}/checkpoints/"* "${CHECKPOINTS_DIR}/${TAG}/" - else - echo "ERROR: Expected ${TMP_DIR}/checkpoints directory, but not found." 
- rm -rf "${TMP_DIR}" - exit 1 - fi - - # 清理临时目录 - rm -rf "${TMP_DIR}" - - echo ">>> Checkpoints downloaded → ${CHECKPOINTS_DIR}/${TAG}" - echo ">>> Expected config at: ${PIPELINE_YAML}" -else - echo ">>> Found existing pipeline config: ${PIPELINE_YAML}" -fi - - -# 确保 sam-3-objects/notebook 在 PYTHONPATH 里,供 inference 等模块 import -export PYTHONPATH="${PROJECT_ROOT}/notebook:${PYTHONPATH:-}" - -############################################ -# 3. Step 1 – run SAM3D multi-object & save .pt -############################################ -echo "=== [SAM3D] Step 1: run multi-object reconstruction & save .pt ===" -python "${SAM3D_MULTI_SCRIPT}" \ - --image-path "${IMAGE_PATH}" \ - --mask-root "${MASK_ROOT}" \ - --save-dir "${PT_SAVE_DIR}" \ - --tag "${TAG}" \ - --seed "${SEED}" \ - --project-root "${PROJECT_ROOT}" - -############################################ -# 4. Step 2 – reconstruct from .pt to .ply (and optional .gif) -############################################ -echo "=== [SAM3D] Step 2: reconstruct from .pt to .ply ===" - -RECONSTRUCT_CMD=( - python "${RECONSTRUCT_SCRIPT}" - --project-root "${PROJECT_ROOT}" - --save-dir "${PT_SAVE_DIR}" - --image-path "${IMAGE_PATH}" -) - -if [ "${EXPORT_GIF}" -eq 1 ]; then - RECONSTRUCT_CMD+=(--export-gif) -fi - -"${RECONSTRUCT_CMD[@]}" - -echo "✅ Pipeline finished. 
Check ${PROJECT_ROOT}/gaussians/multi 下的 .ply/.gif 文件" diff --git a/sam-3d-objects b/sam-3d-objects deleted file mode 160000 index cf06676..0000000 --- a/sam-3d-objects +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cf066761706cd02b07e2fc6274570ec8cdafb683 diff --git a/sam3 b/sam3 deleted file mode 160000 index 2d1cbae..0000000 --- a/sam3 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2d1cbaeac7b52ca64baf61e58973d0940ae843d0 diff --git a/scripts/download_checkpoints.sh b/scripts/download_checkpoints.sh new file mode 100755 index 0000000..285b4e8 --- /dev/null +++ b/scripts/download_checkpoints.sh @@ -0,0 +1,226 @@ +#!/usr/bin/env bash +# Bootstrap gated HuggingFace checkpoints needed by the pipeline. +# +# This script pre-fetches the three models used by the pipeline: +# +# 1. facebook/sam-3d-objects +# The SAM-3D-Objects codepath expects a Hydra config tree at +# submodule/Sam-3d-objects/checkpoints/TAG/pipeline.yaml +# which is NOT fetched by `from_pretrained`. +# +# 2. facebook/sam3 +# Prompt-Inpaint's _resolve_checkpoint() will fall back to a HuggingFace +# auto-download, but pulling the 3.3 GB sam3.pt into the local +# `submodule/Prompt-Inpaint/checkpoints/` keeps the weights co-located +# with the project and survives `~/.cache` cleanups. +# +# 3. lhjiang/anysplat +# AnySplat.from_pretrained reads from the HuggingFace hub cache +# (~/.cache/huggingface/hub/). Pre-fetching avoids a multi-GB download +# on the first pipeline run inside an ephemeral container. +# +# The script is idempotent: existing target files are skipped unless --force. +# +# Usage: +# bash scripts/download_checkpoints.sh [options] +# +# Options: +# --tag TAG Sub-directory under submodule/Sam-3d-objects/checkpoints/ +# for the SAM-3D-Objects bundle. Default: hf +# --skip-sam3d Do not download the SAM-3D-Objects bundle. +# --skip-sam3 Do not download the SAM3 weight (sam3.pt). +# --skip-anysplat Do not pre-fetch the AnySplat weights into the HF cache.
+# --force Re-download even if the target files already exist. +# -h, --help Show this help. +# +# Environment overrides: +# SAM3D_CHECKPOINT_TAG Same as --tag +# SAM3D_MODEL_ID SAM-3D-Objects repo id (default: facebook/sam-3d-objects) +# SAM3_MODEL_ID SAM3 repo id (default: facebook/sam3) +# SAM3_WEIGHT_FILENAME SAM3 weight file name (default: sam3.pt) +# ANYSPLAT_MODEL_ID AnySplat repo id (default: lhjiang/anysplat) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +TAG="${SAM3D_CHECKPOINT_TAG:-hf}" +SAM3D_MODEL_ID="${SAM3D_MODEL_ID:-facebook/sam-3d-objects}" +SAM3_MODEL_ID="${SAM3_MODEL_ID:-facebook/sam3}" +SAM3_WEIGHT_FILENAME="${SAM3_WEIGHT_FILENAME:-sam3.pt}" +ANYSPLAT_MODEL_ID="${ANYSPLAT_MODEL_ID:-lhjiang/anysplat}" +SKIP_SAM3D=0 +SKIP_SAM3=0 +SKIP_ANYSPLAT=0 +FORCE=0 + +usage() { + sed -n '2,42p' "${BASH_SOURCE[0]}" | sed 's/^# //; s/^#$//' +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --tag) + TAG="$2" + shift 2 + ;; + --skip-sam3d) + SKIP_SAM3D=1 + shift + ;; + --skip-sam3) + SKIP_SAM3=1 + shift + ;; + --skip-anysplat) + SKIP_ANYSPLAT=1 + shift + ;; + --force) + FORCE=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +require_hf_cli() { + if ! command -v hf >/dev/null 2>&1; then + cat >&2 <<'EOF' +ERROR: the 'hf' CLI is not installed. + Fix: pip install -U huggingface_hub + Then make sure you've accepted the relevant model agreements on + huggingface.co and logged in with: hf auth login +EOF + exit 1 + fi +} + +# hf_transfer occasionally trips on mirrored networks; disable it for safety. 
+export HF_HUB_ENABLE_HF_TRANSFER=0 + + +download_sam3d_objects() { + local checkpoints_dir="${PROJECT_ROOT}/submodule/Sam-3d-objects/checkpoints" + local target_dir="${checkpoints_dir}/${TAG}" + local pipeline_yaml="${target_dir}/pipeline.yaml" + + if [[ -f "${pipeline_yaml}" && "${FORCE}" -eq 0 ]]; then + echo "==> [sam-3d-objects] already present: ${pipeline_yaml}" + return 0 + fi + + require_hf_cli + echo "==> [sam-3d-objects] downloading ${SAM3D_MODEL_ID} into ${target_dir}" + + local tmp_dir="${checkpoints_dir}/.tmp_download_${TAG}" + rm -rf "${tmp_dir}" + mkdir -p "${tmp_dir}" + + # Local cleanup trap (scoped to this function via a subshell would also + # work, but we want the trap to run on Ctrl-C too). + trap 'rm -rf "${tmp_dir}"' EXIT + + hf download "${SAM3D_MODEL_ID}" \ + --local-dir "${tmp_dir}" \ + --include "checkpoints/**" + + if [[ ! -d "${tmp_dir}/checkpoints" ]]; then + echo "ERROR: expected ${tmp_dir}/checkpoints after download." >&2 + exit 1 + fi + + mkdir -p "${target_dir}" + shopt -s dotglob + mv "${tmp_dir}/checkpoints/"* "${target_dir}/" + shopt -u dotglob + + if [[ ! -f "${pipeline_yaml}" ]]; then + echo "ERROR: pipeline.yaml missing after move: ${pipeline_yaml}" >&2 + exit 1 + fi + + rm -rf "${tmp_dir}" + trap - EXIT + + echo "==> [sam-3d-objects] done: ${target_dir}" +} + + +download_sam3() { + local target_dir="${PROJECT_ROOT}/submodule/Prompt-Inpaint/checkpoints" + local target_file="${target_dir}/${SAM3_WEIGHT_FILENAME}" + + if [[ -f "${target_file}" && "${FORCE}" -eq 0 ]]; then + echo "==> [sam3] already present: ${target_file}" + return 0 + fi + + require_hf_cli + echo "==> [sam3] downloading ${SAM3_MODEL_ID}/${SAM3_WEIGHT_FILENAME} into ${target_dir}" + + mkdir -p "${target_dir}" + hf download "${SAM3_MODEL_ID}" "${SAM3_WEIGHT_FILENAME}" \ + --local-dir "${target_dir}" + + if [[ ! -f "${target_file}" ]]; then + echo "ERROR: ${target_file} missing after download." 
>&2 + exit 1 + fi + + echo "==> [sam3] done: ${target_file}" +} + + +download_anysplat() { + # AnySplat.from_pretrained looks up the model in the HuggingFace hub + # cache, so we leave files under the standard cache layout (no + # --local-dir). The cache root is HF_HOME if set, otherwise + # ~/.cache/huggingface. + local hf_root="${HF_HOME:-${HOME}/.cache/huggingface}" + # HF cache layout: hub/models--ORG--NAME/snapshots/REVISION/... + local hub_dirname="models--$(echo "${ANYSPLAT_MODEL_ID}" | sed 's|/|--|g')" + local snapshots_dir="${hf_root}/hub/${hub_dirname}/snapshots" + + if [[ -d "${snapshots_dir}" ]] && \ + [[ -n "$(ls -A "${snapshots_dir}" 2>/dev/null)" ]] && \ + [[ "${FORCE}" -eq 0 ]]; then + echo "==> [anysplat] already present in HF cache: ${snapshots_dir}" + return 0 + fi + + require_hf_cli + echo "==> [anysplat] downloading ${ANYSPLAT_MODEL_ID} into HF cache (${hf_root})" + hf download "${ANYSPLAT_MODEL_ID}" + echo "==> [anysplat] done." +} + + +if [[ "${SKIP_SAM3D}" -eq 0 ]]; then + download_sam3d_objects +else + echo "==> [sam-3d-objects] skipped (--skip-sam3d)" +fi + +if [[ "${SKIP_SAM3}" -eq 0 ]]; then + download_sam3 +else + echo "==> [sam3] skipped (--skip-sam3)" +fi + +if [[ "${SKIP_ANYSPLAT}" -eq 0 ]]; then + download_anysplat +else + echo "==> [anysplat] skipped (--skip-anysplat)" +fi + +echo "==> All requested checkpoints are in place." diff --git a/scripts/install_env.sh b/scripts/install_env.sh new file mode 100755 index 0000000..e2e699b --- /dev/null +++ b/scripts/install_env.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.."
&& pwd)" + +PYTHON_VERSION="3.11" +TORCH_VERSION="2.7.0" +TORCHVISION_VERSION="0.22.0" +TORCHAUDIO_VERSION="2.7.0" +PYTORCH_INDEX_URL="https://download.pytorch.org/whl/cu128" +KAOLIN_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.7.0_cu128.html" + +INSTALL_TORCH=1 +UPDATE_SUBMODULES=1 +COMPILE_CUROPE=1 + +usage() { + cat <<'EOF' +Usage: bash scripts/install_env.sh [options] + +Options: + --python VERSION Python version for uv venv. Default: 3.11 + --skip-torch Do not install torch/torchvision/torchaudio. + --skip-submodules Do not run git submodule update --init --recursive. + --skip-curope Do NOT patch+compile AnySplat curope CUDA extension + (compiled by default; without it AnySplat falls back + to a slower PyTorch RoPE2D implementation). + -h, --help Show this help. + +Examples: + bash scripts/install_env.sh + bash scripts/install_env.sh --skip-torch + bash scripts/install_env.sh --skip-curope +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --python) + PYTHON_VERSION="$2" + shift 2 + ;; + --skip-torch) + INSTALL_TORCH=0 + shift + ;; + --skip-submodules) + UPDATE_SUBMODULES=0 + shift + ;; + --skip-curope) + COMPILE_CUROPE=0 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +cd "${PROJECT_ROOT}" + +echo "==> Project root: ${PROJECT_ROOT}" + +if ! command -v uv >/dev/null 2>&1; then + echo "==> uv not found. Installing uv with pip..." + python3 -m pip install -U uv +fi + +if [[ "${UPDATE_SUBMODULES}" -eq 1 ]]; then + echo "==> Updating git submodules..." + git submodule update --init --recursive +fi + +echo "==> Creating/updating .venv with Python ${PYTHON_VERSION}..." 
+uv venv --python "${PYTHON_VERSION}" .venv + +# shellcheck disable=SC1091 +source "${PROJECT_ROOT}/.venv/bin/activate" + +export PYTHONPATH="${PROJECT_ROOT}/submodule/Sam-3d-objects/notebook:${PROJECT_ROOT}/submodule/Sam-3d-objects:${PYTHONPATH:-}" +export PIP_FIND_LINKS="${KAOLIN_FIND_LINKS}" + +echo "==> Python: $(which python)" +python --version + +if [[ "${INSTALL_TORCH}" -eq 1 ]]; then + echo "==> Installing PyTorch ${TORCH_VERSION} from ${PYTORCH_INDEX_URL}..." + uv pip install \ + "torch==${TORCH_VERSION}" \ + "torchvision==${TORCHVISION_VERSION}" \ + "torchaudio==${TORCHAUDIO_VERSION}" \ + --index-url "${PYTORCH_INDEX_URL}" +else + echo "==> Skipping PyTorch install." +fi + +echo "==> Installing AnySplat requirements..." +uv pip install -r submodule/AnySplat/requirements.txt --no-build-isolation + +echo "==> Installing SAM-3D-Objects build helpers..." +uv pip install hatch-requirements-txt editables wheel + +echo "==> Installing SAM-3D-Objects extras..." +uv pip install -e './submodule/Sam-3d-objects[dev]' +uv pip install -e './submodule/Sam-3d-objects[p3d]' --no-build-isolation +uv pip install -e './submodule/Sam-3d-objects[inference]' \ + --no-build-isolation \ + --find-links "${KAOLIN_FIND_LINKS}" + +echo "==> Installing project-level runtime dependencies..." +# Do NOT use -U here: that would let uv upgrade transitive deps (notably +# torch, via iopaint) and clobber the CUDA-pinned torch installed above. +uv pip install --index-strategy unsafe-best-match \ + "transformers==4.48.3" \ + "iopaint>=1.2.0" \ + "diffusers>=0.27.2" \ + "numpy<2.0" \ + "opencv-python>=4.8.0" \ + "pyyaml>=6.0" \ + "requests>=2.31.0" \ + "tqdm>=4.66.0" \ + "setuptools" \ + "einops" + +# Pin huggingface_hub to 0.25.2 as the very last step: diffusers 0.27.2 (and +# the iopaint stack on top of it) still imports `cached_download` from +# huggingface_hub, which was removed in hub >= 0.26. 
Upstream Sam-3d-objects / +# iopaint extras may pull in a newer hub transitively, so we force-reinstall +# last (with --no-deps so it can downgrade without uv complaining) and lock +# the exact version that was empirically verified to work. +# +# Note: transformers above is pinned to ==4.48.3 (not >=) because transformers +# 5.x imports `is_offline_mode` from huggingface_hub, which doesn't exist in +# 0.25.2 — using a floor here lets pip resolve to 5.x and breaks iopaint at +# runtime even though hub stays pinned. +echo "==> Pinning huggingface_hub==0.25.2 (force-reinstall, no-deps)..." +uv pip install --index-strategy unsafe-best-match --force-reinstall --no-deps \ + "huggingface_hub==0.25.2" + +echo "==> Installing SAM3..." +uv pip install --index-strategy unsafe-best-match \ + "git+https://github.com/facebookresearch/sam3.git" + +# Optional mesh2mjcf extras (installed by default so `-cd` / `--verbose` Just +# Work; `trimesh` is also used for multi-material OBJ splitting). +echo "==> Installing mesh2mjcf extras (coacd, trimesh, mujoco)..." +uv pip install --index-strategy unsafe-best-match \ + "coacd" \ + "trimesh" \ + "mujoco" + +if [[ "${COMPILE_CUROPE}" -eq 1 ]]; then + CUROPE_DIR="${PROJECT_ROOT}/submodule/AnySplat/src/model/encoder/backbone/croco/curope" + KERNELS_CU="${CUROPE_DIR}/kernels.cu" + + if [[ ! -f "${KERNELS_CU}" ]]; then + echo "ERROR: kernels.cu not found: ${KERNELS_CU}" >&2 + exit 1 + fi + + echo "==> Patching AnySplat curope kernels.cu..." + python - "${KERNELS_CU}" <<'PY' +from pathlib import Path +import sys + +path = Path(sys.argv[1]) +text = path.read_text() +patched = text.replace( + 'AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {', + 'AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {', +) +if patched != text: + path.write_text(patched) + print(f"patched {path}") +else: + print(f"no patch needed for {path}") +PY + + echo "==> Building AnySplat curope extension..." 
+ ( + cd "${CUROPE_DIR}" + python setup.py build_ext --inplace + ) +fi + +cat << EOF +==> Install finished. + +Next steps: + source .venv/bin/activate + export PYTHONPATH="${PROJECT_ROOT}/submodule/Sam-3d-objects/notebook:${PROJECT_ROOT}/submodule/Sam-3d-objects:\${PYTHONPATH:-}" + +If you use gated HuggingFace models, run: + huggingface-cli login +EOF diff --git a/submodule/AnySplat b/submodule/AnySplat new file mode 160000 index 0000000..d29bc6a --- /dev/null +++ b/submodule/AnySplat @@ -0,0 +1 @@ +Subproject commit d29bc6adf82c953f1fd337d8d0ba6259d906b2c9 diff --git a/submodule/Prompt-Inpaint b/submodule/Prompt-Inpaint new file mode 160000 index 0000000..0dffc4b --- /dev/null +++ b/submodule/Prompt-Inpaint @@ -0,0 +1 @@ +Subproject commit 0dffc4b50c33509d80135159b2b031d94e272e6e diff --git a/submodule/Sam-3d-objects b/submodule/Sam-3d-objects new file mode 160000 index 0000000..d4b6362 --- /dev/null +++ b/submodule/Sam-3d-objects @@ -0,0 +1 @@ +Subproject commit d4b63627dc2a7ae0a175be482942e6f32633ff55