Overworldai · sami-bg · Jun 4, 2025 · Jun 4, 2025 · Jun 5, 2025 · Jun 5, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+checkpoints/*
+venv/
+__pycache__/*
+generated_videos/
+owl-vaes/
+*.pt
+*.env
+*.pyc
+.vscode/*
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,57 @@
+# Use CUDA 12.8 runtime as base image for lightweight deployment
+FROM nvidia/cuda:12.8.1-runtime-ubuntu22.04
+
+# Build-time only: keeps apt non-interactive without persisting into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Runtime Python settings: unbuffered output, no .pyc files, /app on the import path
+ENV PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 PYTHONPATH=/app
+
+# Make a RUN step fail when any command in a pipeline fails (needed for `curl | sh`)
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install system dependencies (Python 3.12 is installed separately below)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    python3-pip \
+    software-properties-common \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add deadsnakes PPA and install Python 3.12
+RUN add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends \
+    python3.12 \
+    python3.12-dev \
+    python3.12-venv \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.12 as default
+RUN ln -sf /usr/bin/python3.12 /usr/bin/python3 && \
+    ln -sf /usr/bin/python3 /usr/bin/python
+
+# Install uv into /root/.cargo/bin and put that directory on PATH
+RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/root/.cargo/bin sh
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Work in /app; copy the requirements file first for better layer caching
+WORKDIR /app
+COPY requirements.txt .
+
+# Install PyTorch from the CUDA 12.8 wheel index, then the other requirements
+RUN uv pip install --system torch torchvision --index-url https://download.pytorch.org/whl/cu128
+RUN uv pip install --system -r requirements.txt
+
+# Copy the entire application
+COPY . /app
+
+# Submodules can only be fetched once the repository (with .git) is present in /app
+RUN git submodule update --init --recursive
+
+# Expose the port that the FastAPI server runs on
+EXPOSE 8000
+
+# Set the default command to run the web server
+CMD ["python3", "webapp/server.py", "--port", "8000", "--no-debug"]
diff --git a/checkpoints/wm/dcae_hf_cod/basic.yml b/checkpoints/wm/dcae_hf_cod/basic.yml
@@ -0,0 +1,67 @@
+# Config for the game_rft_core model (rft trainer, cod_latent data)
+model:
+  model_id: game_rft_core
+  sample_size: 4
+  channels: 128
+
+  n_layers: 17
+  n_heads: 16
+  d_model: 1024
+
+  tokens_per_frame: 16
+  n_buttons: 11
+  n_mouse_axes: 2
+
+  cfg_prob: 0.1
+  n_frames: 60
+
+  causal: false
+
+train:
+  trainer_id: rft
+  data_id: cod_latent
+  data_kwargs:
+    window_length: 60
+    root: ../cod_data/BlackOpsColdWar
+    add_optical_flow: false
+
+  target_batch_size: 320
+  batch_size: 40
+
+  epochs: 200
+
+  opt: Muon
+  opt_kwargs:
+    lr: 1.0e-3
+    momentum: 0.95
+    adamw_lr: 1.0e-4
+    adamw_wd: 1.0e-4
+    adamw_eps: 1.0e-15
+    adamw_betas: [0.9, 0.95]
+    adamw_keys: [core.proj_in, core.proj_out.proj]
+
+  scheduler: null
+
+  checkpoint_dir: checkpoints/v2
+  resume_ckpt: checkpoints/v2/step_165000.pt
+
+  sample_interval: 1000
+  save_interval: 5000
+
+  sampler_id: window
+  sampler_kwargs:
+    n_steps: 32
+    cfg_scale: 1.3
+    window_length: 60
+    num_frames: 120
+    noise_prev: 0.2
+    only_return_generated: true
+
+  vae_batch_size: 16
+  vae_scale: 2.17
+  n_samples: 8
+
+wandb:
+  name: shahbuland
+  project: video_models
+  run_name: v2
diff --git a/configs/360p_v2.yml b/configs/360p_v2.yml
@@ -0,0 +1,70 @@
+# Config for the game_rft model (rft trainer, cod_s3 data)
+model:
+  model_id: game_rft
+  sample_size: 4
+  channels: 128
+
+  n_layers: 13
+  n_heads: 16
+  d_model: 1024
+
+  tokens_per_frame: 16
+  n_buttons: 11
+  n_mouse_axes: 2
+
+  cfg_prob: 0.1
+  n_frames: 30
+
+  causal: false
+
+train:
+  trainer_id: rft
+  data_id: cod_s3
+  data_kwargs:
+    window_length: 30
+    bucket_name: cod-data-latent-360x640to4x4
+    include_keyframe: false
+
+  target_batch_size: 256
+  batch_size: 32
+
+  epochs: 200
+
+  opt: Muon
+  opt_kwargs:
+    lr: 1.0e-3
+    momentum: 0.95
+    adamw_lr: 1.0e-4
+    adamw_wd: 1.0e-4
+    adamw_eps: 1.0e-15
+    adamw_betas: [0.9, 0.95]
+    adamw_keys: [core.proj_in, core.proj_out.proj]
+
+  scheduler: null
+
+  checkpoint_dir: checkpoints/360p
+
+  sample_interval: 1000
+  save_interval: 5000
+
+  sampler_id: window
+  sampler_kwargs:
+    n_steps: 10
+    cfg_scale: 1.3
+    window_length: 30
+    num_frames: 60
+    noise_prev: 0.2
+    only_return_generated: false
+
+  n_samples: 8
+
+  vae_id: 720pr3dc
+  vae_batch_size: 4
+  vae_scale: 0.13
+  vae_cfg_path: configs/owl_vaes/cod_128x.yml
+  vae_ckpt_path: checkpoints/owl_vaes/cod_128x_30k_ema.pt
+
+wandb:
+  name: shahbuland
+  project: video_models
+  run_name: v3
diff --git a/configs/av.yml b/configs/av.yml
@@ -0,0 +1,75 @@
+model:
+  model_id: game_rft_audio
+  sample_size: 4
+  channels: 128
+  audio_channels: 64
+
+  n_layers: 13
+  n_heads: 16
+  d_model: 1024
+
+  tokens_per_frame: 17
+  n_buttons: 11
+  n_mouse_axes: 2
+
+  cfg_prob: 0.1
+  n_frames: 30
+
+  causal: false
+
+train:
+  trainer_id: av
+  data_id: cod_s3_audio
+  data_kwargs:
+    window_length: 30
+    bucket_name: cod-data-latent-360x640to4x4
+
+  target_batch_size: 256
+  batch_size: 32
+
+  epochs: 200
+
+  opt: Muon
+  opt_kwargs:
+    lr: 1.0e-3
+    momentum: 0.95
+    adamw_lr: 1.0e-4
+    adamw_wd: 1.0e-4
+    adamw_eps: 1.0e-15
+    adamw_betas: [0.9, 0.95]
+    adamw_keys: [core.proj_in, core.proj_out.proj]
+
+  scheduler: null
+
+  checkpoint_dir: checkpoints/360p
+
+  sample_interval: 1000
+  save_interval: 5000
+
+  sampler_id: av_window
+  sampler_kwargs:
+    n_steps: 10
+    cfg_scale: 1.3
+    window_length: 30
+    num_frames: 60
+    noise_prev: 0.2
+    only_return_generated: false
+
+  n_samples: 8
+
+  vae_id: null
+  vae_batch_size: 4
+  vae_scale: 0.13
+  audio_vae_scale: 0.17
+
+  vae_cfg_path: configs/owl_vaes/cod_128x.yml
+  vae_ckpt_path: checkpoints/owl_vaes/cod_128x_30k_ema.pt
+
+  audio_vae_id: null
+  audio_vae_cfg_path: configs/owl_vaes/cod_audio.yml
+  audio_vae_ckpt_path: checkpoints/owl_vaes/cod_audio_20k_ema.pt
+
+wandb:
+  name: shahbuland
+  project: video_models
+  run_name: av
diff --git a/configs/causvid.yml b/configs/causvid.yml
@@ -0,0 +1,69 @@
+# Config for the game_rft model (causvid trainer, cod_latent data)
+model:
+  model_id: game_rft
+  sample_size: 4
+  channels: 128
+
+  n_layers: 17
+  n_heads: 16
+  d_model: 1024
+
+  tokens_per_frame: 16
+  n_buttons: 11
+  n_mouse_axes: 2
+
+  cfg_prob: 0.0
+  n_frames: 30
+
+  causal: false
+
+train:
+  trainer_id: causvid
+  data_id: cod_latent
+  data_kwargs:
+    window_length: 30
+    root: ../cod_data/BlackOpsColdWar
+    add_optical_flow: false
+
+  target_batch_size: 256
+  batch_size: 32
+
+  epochs: 200
+
+  opt: AdamW
+  opt_kwargs:
+    lr: 2.0e-6
+    weight_decay: 1.0e-4
+    eps: 1.0e-15
+    betas: [0.9, 0.95]
+
+  scheduler: null
+
+  checkpoint_dir: checkpoints/360p
+
+  sample_interval: 1000
+  save_interval: 5000
+
+  sampler_id: window
+  sampler_kwargs:
+    n_steps: 20
+    cfg_scale: 1.3
+    window_length: 30
+    num_frames: 60
+    noise_prev: 0.2
+    only_return_generated: true
+
+  n_samples: 8
+
+  vae_id: 720pr3dc
+  vae_batch_size: 4
+  vae_scale: 0.35
+  vae_cfg_path: configs/owl_vaes/128x_cod_stage2.yml
+  vae_ckpt_path: 720p_cod_vae_30m_35k_steps.pt
+
+  teacher_ckpt: null  # TODO: set to the teacher checkpoint path
+
+wandb:
+  name: shahbuland
+  project: video_models
+  run_name: v2