Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
73fd577
some thing idk
sami-bg Jun 4, 2025
ba153a2
sampling and video generation
sami-bg Jun 4, 2025
74b0b3f
window sampler, utilities
sami-bg Jun 5, 2025
69dd1de
webapp backend
sami-bg Jun 5, 2025
d7b5a61
webapp stuff
sami-bg Jun 6, 2025
b4645ab
slight cleanup
sami-bg Jun 6, 2025
96a6b01
stray refactor victim
sami-bg Jun 6, 2025
57e491f
stray refactor victim
sami-bg Jun 6, 2025
7a6cc49
Very basic webapp
sami-bg Jun 8, 2025
c53b6b8
dont load models if debugging
sami-bg Jun 8, 2025
260114d
fixed some issues with webapp and better mouse tracking
sami-bg Jun 8, 2025
f923778
cleanup code, add action display margin
sami-bg Jun 8, 2025
bf3eb09
kitchen sink,round 2
sami-bg Jun 9, 2025
32f4358
fixes to parseargs
sami-bg Jun 9, 2025
2cc3218
add causvid trainer
shahbuland Jun 9, 2025
dcb7510
Merge branch 'main' into sami-dev
sami-bg Jun 9, 2025
25e74f3
slightly more changes after new samplers
sami-bg Jun 9, 2025
e856fe4
renaming for clarity
sami-bg Jun 9, 2025
12327c6
renaming for clarity
sami-bg Jun 9, 2025
d0b73da
forgot file
sami-bg Jun 10, 2025
cd9f1b5
idk
sami-bg Jun 10, 2025
5a7af71
Allow data to have keyframes
shahbuland Jun 10, 2025
aacc01b
fix kv caching
shahbuland Jun 10, 2025
e86756e
shortcut model, mmdit, kv cache for mmdit
shahbuland Jun 10, 2025
0fc78ea
add shortcut sampler and debug kv cache
shahbuland Jun 10, 2025
11aa5f1
fix mask for batched input
shahbuland Jun 10, 2025
11dfba7
debug shortcut model
shahbuland Jun 10, 2025
666e20e
add shortcut sampler without cache
shahbuland Jun 10, 2025
511305e
add shortcut sampler without cache
shahbuland Jun 10, 2025
1126c7f
switch default to rope, remove other pos encs
shahbuland Jun 10, 2025
98e12a6
fix mask, add new trainer to init
shahbuland Jun 10, 2025
cc2be00
remove wrong comment
shahbuland Jun 11, 2025
f0f6882
add shortcut trainer
shahbuland Jun 11, 2025
1286ec0
add shortcut config
shahbuland Jun 11, 2025
c81b012
add rft shortcut
shahbuland Jun 11, 2025
0800614
add bucket name to config
shahbuland Jun 11, 2025
91b7b8a
remove redundant args
shahbuland Jun 11, 2025
e3edba3
merge
sami-bg Jun 11, 2025
f0fcf42
debugs
shahbuland Jun 11, 2025
60496b5
inference shortcut diffusion sampler but with kv cache
sami-bg Jun 11, 2025
7f51b54
test
sami-bg Jun 11, 2025
3c9994a
debug
shahbuland Jun 11, 2025
cde4a59
add model
shahbuland Jun 11, 2025
2fa9d5b
debug
shahbuland Jun 11, 2025
67b4375
debug:
shahbuland Jun 11, 2025
0dd7e91
debug
shahbuland Jun 11, 2025
ac6119d
debug
shahbuland Jun 11, 2025
d4257ef
debug stuff
shahbuland Jun 11, 2025
f4fde6d
debug
shahbuland Jun 11, 2025
2b7142c
unused params thing
shahbuland Jun 11, 2025
867851c
slight refactors, setting up for latents and history
sami-bg Jun 11, 2025
6cb62f4
Merge branch 'causvid' into sami-dev
sami-bg Jun 11, 2025
a91e698
add alt shortcut sampler
shahbuland Jun 11, 2025
ba0a32e
Merge branch 'causvid' of https://github.com/openworld-labs/owl-wms i…
shahbuland Jun 11, 2025
5b5a04d
push it
shahbuland Jun 11, 2025
f02ef0b
a
shahbuland Jun 11, 2025
1ca3e6f
self-forcing
sami-bg Jun 11, 2025
eb58fc8
add back muon
shahbuland Jun 11, 2025
c9d5c08
fix it
shahbuland Jun 11, 2025
36aa9e4
self forcing cfg
sami-bg Jun 11, 2025
2a6410d
stupid inference window shortcut sampler without keyframe
sami-bg Jun 11, 2025
c368f55
update shortcut config, fix rope, add reqs
shahbuland Jun 11, 2025
6715564
audio stuffs
shahbuland Jun 11, 2025
28c6e82
add new config
shahbuland Jun 11, 2025
93bb0d7
fix config
shahbuland Jun 11, 2025
a0f498b
audio dataset added
shahbuland Jun 11, 2025
d84c8ff
av
shahbuland Jun 11, 2025
7022ce2
debug
shahbuland Jun 11, 2025
3ee6d10
configs for av model
shahbuland Jun 11, 2025
8ed4715
merge causvid audio
sami-bg Jun 12, 2025
8f7bd7d
cfg
sami-bg Jun 12, 2025
e5ae56d
smapler
sami-bg Jun 12, 2025
3f6b22a
demogit status!
sami-bg Jun 12, 2025
28def2f
hodgepodge to get new history contexts
sami-bg Jun 12, 2025
5f3ccf5
merge with cvpr demo
sami-bg Jun 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Checkpoint directory contents, weight files, and generated outputs
checkpoints/*
*.pt
generated_videos/

# owl-vaes directory (kept out of this repository's index)
owl-vaes/

# Python virtualenv and bytecode caches.
# `__pycache__/` has no middle slash, so it matches at any depth; the
# previous `__pycache__/*` form was anchored to the repository root only.
venv/
__pycache__/
*.pyc

# Environment files and editor settings
*.env
.vscode/*
57 changes: 57 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Use CUDA 12.8 runtime as base image for lightweight deployment
FROM nvidia/cuda:12.8.1-runtime-ubuntu22.04

# Set environment variables:
#   - non-interactive apt so package installs never prompt during the build
#   - unbuffered stdout/stderr and no .pyc files for Python
#   - /app on the Python import path
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONPATH=/app

# Install system dependencies (without python3.12 first);
# software-properties-common provides add-apt-repository used below
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    curl \
    python3-pip \
    git \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Add deadsnakes PPA and install Python 3.12
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
    python3.12 \
    python3.12-dev \
    python3.12-venv \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.12 as default
RUN ln -sf /usr/bin/python3.12 /usr/bin/python3 && \
    ln -sf /usr/bin/python3 /usr/bin/python

# Install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/root/.cargo/bin sh
ENV PATH="/root/.cargo/bin:${PATH}"

# Create working directory
WORKDIR /app

# Copy requirements file first for better layer caching
COPY requirements.txt .

# Install PyTorch with CUDA 12.8 support and sm120 architecture support
RUN uv pip install --system torch torchvision --index-url https://download.pytorch.org/whl/cu128

# Install other requirements from requirements.txt
RUN uv pip install --system -r requirements.txt

# Copy the entire application
COPY . /app

# Fetch git submodules. This must run after the source tree is copied:
# before that point /app holds only requirements.txt and is not a git
# repository, so the command would fail. The build context has to include
# the .git directory and .gitmodules for this step to succeed.
RUN git submodule update --init --recursive

# Expose the port that the FastAPI server runs on
EXPOSE 8000

# Set the default command to run the web server
CMD ["python3", "webapp/server.py", "--port", "8000", "--no-debug"]
67 changes: 67 additions & 0 deletions checkpoints/wm/dcae_hf_cod/basic.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# World-model training config: game_rft_core model, rft trainer,
# cod_latent dataset, window sampler.
model:
  model_id: game_rft_core
  sample_size: 4
  channels: 128

  n_layers: 17
  n_heads: 16
  d_model: 1024

  tokens_per_frame: 16
  n_buttons: 11
  n_mouse_axes: 2

  cfg_prob: 0.1
  n_frames: 60

  causal: false

train:
  trainer_id: rft
  data_id: cod_latent
  data_kwargs:
    # Same value as model.n_frames
    window_length: 60
    root: ../cod_data/BlackOpsColdWar
    add_optical_flow: false

  target_batch_size: 320
  batch_size: 40

  epochs: 200

  opt: Muon
  opt_kwargs:
    lr: 1.0e-3
    momentum: 0.95
    adamw_lr: 1.0e-4
    adamw_wd: 1.0e-4
    adamw_eps: 1.0e-15
    adamw_betas: [0.9, 0.95]
    adamw_keys: [core.proj_in, core.proj_out.proj]

  scheduler: null

  # resume_ckpt points at a file inside checkpoint_dir
  checkpoint_dir: checkpoints/v2
  resume_ckpt: checkpoints/v2/step_165000.pt

  sample_interval: 1000
  save_interval: 5000

  sampler_id: window
  sampler_kwargs:
    n_steps: 32
    cfg_scale: 1.3
    # Same value as data_kwargs.window_length; num_frames is twice that
    window_length: 60
    num_frames: 120
    noise_prev: 0.2
    only_return_generated: true

  vae_batch_size: 16
  vae_scale: 2.17
  n_samples: 8

wandb:
  name: shahbuland
  project: video_models
  run_name: v2
70 changes: 70 additions & 0 deletions configs/360p_v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# World-model training config: game_rft model, rft trainer,
# cod_s3 dataset (bucket cod-data-latent-360x640to4x4), window sampler.
model:
  model_id: game_rft
  sample_size: 4
  channels: 128

  n_layers: 13
  n_heads: 16
  d_model: 1024

  tokens_per_frame: 16
  n_buttons: 11
  n_mouse_axes: 2

  cfg_prob: 0.1
  n_frames: 30

  causal: false

train:
  trainer_id: rft
  data_id: cod_s3
  data_kwargs:
    # Same value as model.n_frames
    window_length: 30
    bucket_name: cod-data-latent-360x640to4x4
    include_keyframe: false

  target_batch_size: 256
  batch_size: 32

  epochs: 200

  opt: Muon
  opt_kwargs:
    lr: 1.0e-3
    momentum: 0.95
    adamw_lr: 1.0e-4
    adamw_wd: 1.0e-4
    adamw_eps: 1.0e-15
    adamw_betas: [0.9, 0.95]
    adamw_keys: [core.proj_in, core.proj_out.proj]

  scheduler: null

  checkpoint_dir: checkpoints/360p

  sample_interval: 1000
  save_interval: 5000

  sampler_id: window
  sampler_kwargs:
    n_steps: 10
    cfg_scale: 1.3
    # Same value as data_kwargs.window_length; num_frames is twice that
    window_length: 30
    num_frames: 60
    noise_prev: 0.2
    only_return_generated: false

  n_samples: 8

  # VAE settings: identifier, batch size, latent scale, and the
  # config/checkpoint files to load
  vae_id: 720pr3dc
  vae_batch_size: 4
  vae_scale: 0.13
  vae_cfg_path: configs/owl_vaes/cod_128x.yml
  vae_ckpt_path: checkpoints/owl_vaes/cod_128x_30k_ema.pt

wandb:
  name: shahbuland
  project: video_models
  run_name: v3
75 changes: 75 additions & 0 deletions configs/av.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Audio+video world-model training config: game_rft_audio model, av trainer,
# cod_s3_audio dataset, av_window sampler.
model:
  model_id: game_rft_audio
  sample_size: 4
  channels: 128
  audio_channels: 64

  n_layers: 13
  n_heads: 16
  d_model: 1024

  # One more token per frame than the video-only configs (16)
  tokens_per_frame: 17
  n_buttons: 11
  n_mouse_axes: 2

  cfg_prob: 0.1
  n_frames: 30

  causal: false

train:
  trainer_id: av
  data_id: cod_s3_audio
  data_kwargs:
    # Same value as model.n_frames
    window_length: 30
    bucket_name: cod-data-latent-360x640to4x4

  target_batch_size: 256
  batch_size: 32

  epochs: 200

  opt: Muon
  opt_kwargs:
    lr: 1.0e-3
    momentum: 0.95
    adamw_lr: 1.0e-4
    adamw_wd: 1.0e-4
    adamw_eps: 1.0e-15
    adamw_betas: [0.9, 0.95]
    adamw_keys: [core.proj_in, core.proj_out.proj]

  scheduler: null

  # Dedicated directory for this model. The previous value, checkpoints/360p,
  # is also the checkpoint_dir of configs/360p_v2.yml, so both runs would
  # write into the same place.
  checkpoint_dir: checkpoints/av

  sample_interval: 1000
  save_interval: 5000

  sampler_id: av_window
  sampler_kwargs:
    n_steps: 10
    cfg_scale: 1.3
    # Same value as data_kwargs.window_length; num_frames is twice that
    window_length: 30
    num_frames: 60
    noise_prev: 0.2
    only_return_generated: false

  n_samples: 8

  # Video VAE settings
  vae_id: null
  vae_batch_size: 4
  vae_scale: 0.13
  audio_vae_scale: 0.17

  vae_cfg_path: configs/owl_vaes/cod_128x.yml
  vae_ckpt_path: checkpoints/owl_vaes/cod_128x_30k_ema.pt

  # Audio VAE settings
  audio_vae_id: null
  audio_vae_cfg_path: configs/owl_vaes/cod_audio.yml
  audio_vae_ckpt_path: checkpoints/owl_vaes/cod_audio_20k_ema.pt

wandb:
  name: shahbuland
  project: video_models
  run_name: av
69 changes: 69 additions & 0 deletions configs/causvid.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# World-model training config: game_rft model, causvid trainer,
# cod_latent dataset, window sampler.
model:
  model_id: game_rft
  sample_size: 4
  channels: 128

  n_layers: 17
  n_heads: 16
  d_model: 1024

  tokens_per_frame: 16
  n_buttons: 11
  n_mouse_axes: 2

  cfg_prob: 0.0
  n_frames: 30

  causal: false

train:
  trainer_id: causvid
  data_id: cod_latent
  data_kwargs:
    # Same value as model.n_frames
    window_length: 30
    root: ../cod_data/BlackOpsColdWar
    add_optical_flow: false

  target_batch_size: 256
  batch_size: 32

  epochs: 200

  opt: AdamW
  opt_kwargs:
    lr: 2.0e-6
    weight_decay: 1.0e-4
    eps: 1.0e-15
    betas: [0.9, 0.95]

  scheduler: null

  # Dedicated directory for this run. The previous value, checkpoints/360p,
  # is also the checkpoint_dir of configs/360p_v2.yml, so both runs would
  # write into the same place.
  checkpoint_dir: checkpoints/causvid

  sample_interval: 1000
  save_interval: 5000

  sampler_id: window
  sampler_kwargs:
    n_steps: 20
    cfg_scale: 1.3
    # Same value as data_kwargs.window_length; num_frames is twice that
    window_length: 30
    num_frames: 60
    noise_prev: 0.2
    only_return_generated: true

  n_samples: 8

  # VAE settings: identifier, batch size, latent scale, and the
  # config/checkpoint files to load
  vae_id: 720pr3dc
  vae_batch_size: 4
  vae_scale: 0.35
  vae_cfg_path: configs/owl_vaes/128x_cod_stage2.yml
  vae_ckpt_path: 720p_cod_vae_30m_35k_steps.pt

  # TODO: set to the teacher checkpoint path before training
  teacher_ckpt: null

wandb:
  name: shahbuland
  project: video_models
  run_name: v2
Loading