Commit 082c436

pytorchbot committed 2025-10-29 nightly release (d4f0f78)
1 parent 70557c2
File tree: 11 files changed, +133 −87 lines

.github/workflows/docs.yml

Lines changed: 1 addition & 4 deletions

@@ -31,10 +31,7 @@ jobs:
         run: python -m pip install --upgrade pip
       - name: Install torchforge
         shell: bash -l {0}
-        run: ./scripts/install.sh
-      - name: Install docs dependencies
-        shell: bash -l {0}
-        run: python -m pip install -r docs/requirements.txt
+        run: pip install uv && uv pip install . && uv pip install .[docs]
       - name: Build docs
         shell: bash -l {0}
         working-directory: docs

.github/workflows/gpu_test.yaml

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ jobs:
       - name: Update pip
         run: python -m pip install --upgrade pip
       - name: Install torchforge
-        run: ./scripts/install.sh
+        run: pip install uv && uv pip install . && uv pip install .[dev]
       - name: Run unit tests with coverage
         # TODO add all tests
         run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

README.md

Lines changed: 21 additions & 1 deletion

@@ -32,7 +32,20 @@ You can also find our notebook tutorials (coming soon)
 
 ### Basic
 
-torchforge requires PyTorch 2.9.0 with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan). (Note that the basic install script
+torchforge requires PyTorch 2.9.0 with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan).
+
+You can install Forge with:
+```
+$ conda create -n forge python=3.10
+$ conda activate forge
+$ uv pip install .
+```
+
+(conda-less uv install is a wip)
+
+For your reference, we also include a basic install script that installs other system dependencies
+along with torchforge:
+(note that this basic install script
 uses [DNF](https://docs.fedoraproject.org/en-US/quick-docs/dnf/), but could be easily extended to other Linux OS.)
 
 ```bash
@@ -45,6 +58,13 @@ Optional: By default, the packages installation uses conda. If user wants to ins
 
 After install, you can run the following command and should see output confirming GRPO training is running (you need a minimum 3 GPU devices):
 
+```
+uv run apps/grpo/main.py --config apps/grpo/qwen3_1_7b.yaml
+```
+
+or if not using uv:
+
 ```
 python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
 ```
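As a rough illustration of what the `--config` flag implies, here is one way a YAML file like `qwen3_1_7b.yaml` could be consumed via OmegaConf, which is listed in the project's dependencies. This is a sketch only; the actual entry point in `apps/grpo/main.py` may use a different loader and schema, and no key names are assumed.

```python
# Hypothetical sketch: loading a GRPO config with OmegaConf.
# The real apps/grpo/main.py may parse configs differently.
from omegaconf import OmegaConf

cfg = OmegaConf.load("apps/grpo/qwen3_1_7b.yaml")  # parse the YAML file
cli = OmegaConf.from_cli()                         # accept key=value overrides
cfg = OmegaConf.merge(cfg, cli)                    # CLI values take precedence
print(OmegaConf.to_yaml(cfg))                      # inspect the resolved config
```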

apps/grpo/main.py

Lines changed: 1 addition & 1 deletion

@@ -465,7 +465,7 @@ async def continuous_training():
     except KeyboardInterrupt:
         print("Training interrupted by user")
     finally:
-        print("Shutting down...")
+        print("Shutting down... (this may take a few seconds)")
         shutdown_event.set()
 
         try:
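For context, the surrounding code pairs a `KeyboardInterrupt` handler with a shared event so that other tasks can observe the shutdown. A minimal standalone sketch of that structure (the name `shutdown_event` matches the diff; the loop body is a stand-in, not the real training step):

```python
import asyncio

shutdown_event = asyncio.Event()  # shared flag other tasks can await

async def continuous_training():
    try:
        for step in range(3):         # stand-in for the real training loop
            await asyncio.sleep(0.1)  # stand-in for one training step
            print(f"step {step} done")
    except KeyboardInterrupt:
        print("Training interrupted by user")
    finally:
        # Runs on normal exit, interrupt, and cancellation alike,
        # so dependent tasks always see the event being set.
        print("Shutting down... (this may take a few seconds)")
        shutdown_event.set()

asyncio.run(continuous_training())
```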
(binary file changed, −32.8 MB; file not shown)

assets/versions.sh

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ PYTORCH_VERSION="2.9.0"
 VLLM_VERSION="v0.10.0"
 MONARCH_VERSION="0.1.2"
 TORCHTITAN_VERSION="0.2.0"
-TORCHSTORE_VERSION="0.1.1"
+TORCHSTORE_VERSION="0.1.2"
 
 # Torchtitan commit hash for launching on MAST
 TORCHTITAN_COMMIT_MAST="d0e25450bcac2332359b13fbda430dc701f073d4"

docs/requirements.txt

Lines changed: 0 additions & 9 deletions
This file was deleted.

pyproject.toml

Lines changed: 24 additions & 19 deletions

@@ -11,18 +11,22 @@ authors = [
 keywords = ["pytorch", "training", "llm"]
 dependencies = [
     # PyTorch
+    "torch==2.9.0",
     "torchdata>=0.8.0",
-    "torchtitan",
+    "torchtitan==0.2.0",
+    "torchmonarch==0.1.2",
+    "torchstore==0.1.2",
     # vLLM
-    # TODO: pin specific vllm version
-    #"vllm==0.10.0",
+    "vllm",
     # Hugging Face integrations
     "datasets>=2.21.0",
     "tokenizers",
     # Miscellaneous
     "omegaconf",
     "wandb",
     "hf_transfer",
+    "six",
+    "setuptools<80",
 ]
 dynamic = ["version"]
 
@@ -44,10 +48,16 @@ dev = [
     "pytest-asyncio",
     "multiprocess",
 ]
-oss = [
-    "torch",
-    "torchmonarch-nightly==2025.8.1",
-    "torchstore",
+docs = [
+    "sphinx==7.2.6",
+    "pytorch-sphinx-theme2==0.1.0",
+    "docutils>=0.18.1,<0.21",
+    "sphinx-design==0.6.1",
+    "sphinxcontrib-mermaid==1.0.0",
+    "sphinx-gallery==0.19.0",
+    "myst-parser",
+    "sphinx-sitemap==2.7.1",
+    "sphinx-autodoc-typehints==1.25.3",
 ]
 
 # ---- Explicit project build information ---- #
@@ -69,23 +79,18 @@ members = [
 ]
 
 # pytorch
-# TODO: get auto backend to work
 [[tool.uv.index]]
-name = "pytorch-nightly-cu129"
-url = "https://download.pytorch.org/whl/nightly/cu129"
-#explicit = true
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
 
 # vllm
-# [[tool.uv.index]]
-# name = "vllm-nightly"
-# url = "https://wheels.vllm.ai/nightly"
-# explicit = true
+[[tool.uv.index]]
+name = "vllm-forge"
+url = "https://download.pytorch.org/whl/preview/forge"
 
 [tool.uv.sources]
-torchtitan = { index = "pytorch-nightly-cu129" }
-torch = { index = "pytorch-nightly-cu129" }
-torchstore = { git = "ssh://git@github.com/meta-pytorch/torchstore.git" }
-#vllm = { index = "vllm-nightly" }
+torch = { index = "pytorch-cu128" }
+vllm = { index = "vllm-forge" }
 
 [tool.uv]
 # TODO: revert to stricter default uv strategy
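Since the commit replaces loose requirements with exact pins (`torch==2.9.0`, `torchtitan==0.2.0`, `torchmonarch==0.1.2`, `torchstore==0.1.2`), a quick way to confirm what actually resolved in an environment is the standard library's `importlib.metadata`. This is a generic verification sketch, not part of the repo:

```python
# Sketch: compare installed versions against the pins in pyproject.toml.
from importlib.metadata import version, PackageNotFoundError

pins = {
    "torch": "2.9.0",
    "torchtitan": "0.2.0",
    "torchmonarch": "0.1.2",
    "torchstore": "0.1.2",
}

for pkg, expected in pins.items():
    try:
        installed = version(pkg)
        status = "OK" if installed == expected else f"MISMATCH (want {expected})"
        print(f"{pkg}=={installed}  {status}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```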

src/forge/actors/trainer.py

Lines changed: 1 addition & 5 deletions

@@ -166,11 +166,7 @@ async def train_step(
 
         t.step("forward_backward")
 
-        current_lr = (
-            self.engine.lr_schedulers.get_last_lr()[0]
-            if hasattr(self.engine.lr_schedulers, "get_last_lr")
-            else 0.001
-        )
+        current_lr = self.engine.lr_schedulers.schedulers[0].get_last_lr()[0]
         record_metric("rl_trainer/learning_rate", current_lr, Reduce.MIN)
 
         self.engine.optimizers.step()
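The removed fallback silently reported `0.001` whenever the scheduler container had no `get_last_lr`; the new line reaches into the container's first scheduler instead. For reference, this is how `get_last_lr()` behaves on a plain PyTorch scheduler (standard PyTorch, not torchforge code; the `.schedulers` list in the diff is torchtitan's container, assumed to wrap objects like this):

```python
import torch

# A toy parameter/optimizer pair just to build a scheduler.
param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.SGD([param], lr=0.01)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.1)

# get_last_lr() returns one entry per param group,
# hence the trailing [0] in the diff.
print(sched.get_last_lr()[0])  # 0.01
opt.step()
sched.step()
print(sched.get_last_lr()[0])  # still 0.01 until step 10
```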

src/forge/controller/provisioner.py

Lines changed: 55 additions & 25 deletions

@@ -6,7 +6,6 @@
 
 """Remote and local resource manager for allocation and provisioning."""
 import asyncio
-import functools
 import logging
 
 import os
@@ -19,7 +18,6 @@
 from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host
 
 from monarch.tools import commands
-
 from monarch.utils import setup_env_for_distributed
 
 from forge.controller.launcher import BaseLauncher, get_launcher
@@ -46,6 +44,39 @@ def get_info(self) -> tuple[str, str]:
         return socket.gethostname(), _get_port()
 
 
+class EnvSetter(Actor):
+    """Actor to set environment variables on each proc in a mesh.
+
+    Ideally, this is handled in spawn_procs's bootstrap call which
+    essentially does the same thing as we're doing here.
+
+    However, Monarch's SetupActor currently fails to stop on shutdown
+    which leads to zombie messages sent to the SetupActor. This is a
+    known issue, and we will move back to bootstrap once it's fixed.
+
+    We are able to avoid this here by properly awaiting the spawning
+    of the actor.
+
+    """
+
+    @endpoint
+    def set_env(self, env_vars: dict[str, str]):
+        """Set environment variables on this proc.
+
+        Args:
+            env_vars: Dictionary of environment variables to set
+        """
+        import os
+        import socket
+
+        # Set VLLM_HOST_IP (required for vLLM on multiple nodes)
+        os.environ["VLLM_HOST_IP"] = socket.gethostbyname(socket.getfqdn())
+
+        # Set user-provided environment variables
+        for k, v in env_vars.items():
+            os.environ[k] = v
+
+
 async def get_remote_info(host_mesh: HostMesh) -> tuple[str, str]:
     """Returns the host name and port of the host mesh."""
     throwaway_procs = host_mesh.spawn_procs(per_host={"procs": 1})
@@ -64,6 +95,20 @@ async def get_remote_info(host_mesh: HostMesh) -> tuple[str, str]:
     return host, port
 
 
+async def set_environment(proc_mesh: ProcMesh, env_vars: dict[str, str]):
+    """Set environment variables on a proc mesh using EnvSetter actor.
+
+    This replaces the old bootstrap approach to avoid Monarch's SetupActor
+    mesh failures on shutdown.
+
+    Args:
+        proc_mesh: The proc mesh to set environment variables on
+        env_vars: Dictionary of environment variables to set
+    """
+    env_setter = proc_mesh.spawn("_env_setter", EnvSetter)
+    await env_setter.set_env.call(env_vars)
+
+
 class GpuManager:
     """Tracks and assigns GPU devices on a host.
 
@@ -244,26 +289,6 @@ async def get_proc_mesh(
         gpu_manager = self._host_gpu_map[self._this_host_id]
         host_mesh._host_id = self._this_host_id
 
-        def bootstrap(env: dict[str, str]):
-            """Runs on process startup.
-
-            We use this to set environment variables like CUDA, etc.
-            We prefer to pass in environment variables to bootstrap,
-            but there are occasionally host-specific environments that can
-            only be set once the process is alive on remote hosts.
-
-            """
-            # bootstrap is run on all processes. We use this
-            # to set environment variables like CUDA etc.
-            import os
-
-            # vLLM requires this environment variable when spawning model servers
-            # across multiple nodes.
-            os.environ["VLLM_HOST_IP"] = socket.gethostbyname(socket.getfqdn())
-
-            for k, v in env.items():
-                os.environ[k] = v
-
         if with_gpus:
             if not addr or not port:
                 addr, port = await get_remote_info(host_mesh)
@@ -281,17 +306,22 @@ def bootstrap(env: dict[str, str]):
         for env_var in all_env_vars():
             env_vars[env_var.name] = str(env_var.get_value())
 
+        # Spawn procs without bootstrap to avoid SetupActor mesh failures
         procs = host_mesh.spawn_procs(
             per_host={"procs": num_procs},
-            bootstrap=functools.partial(bootstrap, env=env_vars),
+            name=mesh_name,
         )
 
+        # Set up environment variables (replaces old bootstrap)
+        if env_vars:
+            await set_environment(procs, env_vars)
+
+        # Set up PyTorch distributed environment if using GPUs
         if with_gpus:
-            # Set up environment variables for PyTorch distributed...
             await setup_env_for_distributed(
                 procs,
                 master_addr=addr,
-                master_port=port,
+                master_port=int(port),
             )
 
         if is_remote:
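The new `EnvSetter` actor derives `VLLM_HOST_IP` from the host's fully qualified domain name, exactly as the removed `bootstrap` closure did. Isolated from the Monarch machinery, that resolution step is plain standard-library code; a sketch of just that step:

```python
import os
import socket

# Resolve this host's IP via its fully qualified domain name,
# as EnvSetter.set_env does before vLLM spans multiple nodes.
fqdn = socket.getfqdn()
host_ip = socket.gethostbyname(fqdn)
os.environ["VLLM_HOST_IP"] = host_ip
print(fqdn, "->", host_ip)
```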
