Skip to content
Merged

Dev #31

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,11 @@ jobs:
- name: Run container and check /health
run: |
# Start the API in the background; it will boot in degraded mode
# since no policy is mounted, but /health should still respond.
docker run -d --name api-smoke -p 8000:8000 food-rescue-serve:smoke
# since no policy is mounted and MLflow is not available in CI.
# FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY=1 prevents hanging on MLflow connection.
docker run -d --name api-smoke \
-e FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY=1 \
-p 8000:8000 food-rescue-serve:smoke
# Give uvicorn ~10s to come up
for i in $(seq 1 30); do
if curl -fsS http://localhost:8000/health > /dev/null; then
Expand Down
83 changes: 66 additions & 17 deletions agents/dqn.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,31 +84,49 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
# -----------------------------

class ReplayBuffer:
"""Fixed-size circular buffer of (s, a, r, s', done) tuples."""
"""Fixed-size circular buffer of (s, a, r, s', done, next_mask) tuples.

def __init__(self, capacity: int, seed: Optional[int] = None):
The next_mask field is a boolean array of legal actions in the next state.
It enables training-time action masking in the Bellman target: we only
take max over valid next-state actions, so the target isn't polluted by
Q-values of actions the env wouldn't allow.

Older transitions stored without a mask are auto-padded to all-True for
backward compatibility with checkpoints / replay snapshots saved before v5.
"""

def __init__(self, capacity: int, num_actions: int, seed: Optional[int] = None):
self._buf: deque = deque(maxlen=capacity)
self._rng = random.Random(seed)
self.num_actions = num_actions

def push(self, obs: np.ndarray, action: int, reward: float,
next_obs: np.ndarray, done: bool) -> None:
self._buf.append((obs, action, reward, next_obs, done))
next_obs: np.ndarray, done: bool,
next_mask: Optional[np.ndarray] = None) -> None:
self._buf.append((obs, action, reward, next_obs, done, next_mask))

def sample(self, batch_size: int) -> tuple[np.ndarray, ...]:
batch = self._rng.sample(self._buf, batch_size)
obs, actions, rewards, next_obs, dones = zip(*batch)
obs, actions, rewards, next_obs, dones, next_masks = zip(*batch)

# Pad missing masks with all-True so we don't break legacy training
padded_masks = [
m if m is not None else np.ones(self.num_actions, dtype=bool)
for m in next_masks
]

return (
np.stack(obs).astype(np.float32),
np.array(actions, dtype=np.int64),
np.array(rewards, dtype=np.float32),
np.stack(next_obs).astype(np.float32),
np.array(dones, dtype=np.float32),
np.stack(padded_masks).astype(bool),
)

def __len__(self) -> int:
return len(self._buf)


# -----------------------------
# DQN Agent
# -----------------------------
Expand Down Expand Up @@ -138,6 +156,11 @@ def __init__(
self.config = config if config is not None else DQNConfig()
self.obs_dim = obs_dim
self.num_actions = num_actions
self.replay = ReplayBuffer(
capacity=self.config.replay_buffer_size,
num_actions=num_actions,
seed=seed,
)

if seed is not None:
torch.manual_seed(seed)
Expand All @@ -153,7 +176,6 @@ def __init__(
self.target_net.eval()

self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.config.learning_rate)
self.replay = ReplayBuffer(self.config.replay_buffer_size, seed=seed)

self._step_count = 0 # global step counter (for target updates)
self._episode_count = 0 # for ε annealing
Expand All @@ -169,47 +191,73 @@ def epsilon(self) -> float:
return c.epsilon_start + (c.epsilon_end - c.epsilon_start) * progress

def select_action(self, env: FoodRescueEnv, obs: np.ndarray) -> int:
# Get action mask from env (or use all-valid if env doesn't support it,
# to maintain backward compat with older code paths)
if hasattr(env, "action_mask"):
mask = env.action_mask()
else:
mask = np.ones(self.num_actions, dtype=bool)

valid_actions = np.flatnonzero(mask)

eps = self.epsilon() if self._training else 0.0
if self._training and self._np_rng.random() < eps:
return int(self._np_rng.integers(0, self.num_actions))
# Random exploration, but only among valid actions
return int(self._np_rng.choice(valid_actions))

with torch.no_grad():
obs_t = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(self.device)
q_values = self.q_net(obs_t).squeeze(0).cpu().numpy()
# Random tie-breaking
max_val = q_values.max()
best = np.flatnonzero(q_values == max_val)
return int(self._np_rng.choice(best))

# Mask out invalid actions by setting their Q-values to -inf
masked_q = np.where(mask, q_values, -np.inf)

# Random tie-breaking among the best valid actions
max_val = masked_q.max()
best = np.flatnonzero(masked_q == max_val)
return int(self._np_rng.choice(best))
# ---- Training ----

def store_transition(self, obs, action, reward, next_obs, done) -> None:
self.replay.push(obs, action, reward, next_obs, done)
def store_transition(self, obs, action, reward, next_obs, done, next_mask=None) -> None:
self.replay.push(obs, action, reward, next_obs, done, next_mask)

def train_step(self) -> Optional[float]:
"""
One gradient step on a sampled batch. Returns the loss, or None if
the replay buffer is too small to train yet.

Uses action masking in the Bellman target: max_a' Q(s', a') is taken
only over actions that are valid in s'. This sharpens credit
assignment when the env has structural action constraints.
"""
if not self._training:
return None
if len(self.replay) < self.config.min_replay_to_train:
return None

obs_b, act_b, rew_b, next_obs_b, done_b = self.replay.sample(self.config.batch_size)
obs_b, act_b, rew_b, next_obs_b, done_b, next_mask_b = self.replay.sample(
self.config.batch_size
)

obs_t = torch.from_numpy(obs_b).to(self.device)
act_t = torch.from_numpy(act_b).to(self.device)
rew_t = torch.from_numpy(rew_b).to(self.device)
next_obs_t = torch.from_numpy(next_obs_b).to(self.device)
done_t = torch.from_numpy(done_b).to(self.device)
# Convert mask to a float tensor that we'll use to set invalid Q-values
# to a very negative number before taking the max.
next_mask_t = torch.from_numpy(next_mask_b).to(self.device)

# Current Q(s, a)
q_pred = self.q_net(obs_t).gather(1, act_t.unsqueeze(1)).squeeze(1)

# Bootstrap target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal
# Bootstrap target: r + gamma * max_a' Q_target(s', a'), masked.
# Invalid actions get -1e9 so they never win the argmax.
with torch.no_grad():
q_next_max = self.target_net(next_obs_t).max(dim=1).values
q_next = self.target_net(next_obs_t)
# Where mask is False, replace Q with -inf (very negative float).
q_next_masked = q_next.masked_fill(~next_mask_t, -1e9)
q_next_max = q_next_masked.max(dim=1).values
target = rew_t + self.config.discount * q_next_max * (1.0 - done_t)

loss = nn.functional.smooth_l1_loss(q_pred, target)
Expand All @@ -219,6 +267,7 @@ def train_step(self) -> Optional[float]:
torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), self.config.grad_clip)
self.optimizer.step()

# Periodic target network sync
self._step_count += 1
if self._step_count % self.config.target_update_interval == 0:
self.target_net.load_state_dict(self.q_net.state_dict())
Expand Down
38 changes: 29 additions & 9 deletions api/policy_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

1. From the MLflow Model Registry (production-style): set
FOOD_RESCUE_MODEL_NAME and FOOD_RESCUE_MODEL_VERSION env vars
2. From a local file path: set FOOD_RESCUE_MODEL_PATH
3. Built-in default: look for experiments/policies/dqn_tuned.pt
2. From the MLflow Model Registry defaults: food_rescue_dqn @ latest
3. From a local file path: set FOOD_RESCUE_MODEL_PATH
4. Built-in fallback: look for experiments/policies/dqn_v5_masked.pt
(or any DQN policy file in that folder)

Only DQN policies are supported for serving — they take an obs vector directly,
Expand Down Expand Up @@ -66,7 +67,11 @@ def _load_from_mlflow_registry(

# Download the artifact directory
local_dir = mlflow.artifacts.download_artifacts(source_uri)
return _load_dqn_from_dir(Path(local_dir), source=f"mlflow:{model_name}:{version}")
agent, info = _load_dqn_from_dir(Path(local_dir), source=f"mlflow:{model_name}:{version}")
# Override the cosmetic fields so /info reflects the real registry version
info["model_name"] = model_name
info["model_version"] = str(version)
return agent, info


def _load_from_path(path: str) -> tuple[DQNAgent, dict[str, Any]]:
Expand Down Expand Up @@ -129,7 +134,10 @@ def load_policy_from_env() -> tuple[DQNAgent, dict[str, Any]]:
Resolution order:
1. FOOD_RESCUE_MODEL_NAME + FOOD_RESCUE_MODEL_VERSION -> MLflow Registry
2. FOOD_RESCUE_MODEL_PATH -> local file or directory
3. Default: experiments/policies/dqn_tuned.pt or dqn_v1.pt
3. Default MLflow Registry model food_rescue_dqn @ latest (skipped if unavailable)
4. Local DQN fallback files

Raises FileNotFoundError if no policy can be loaded via any method.
"""
model_name = os.environ.get("FOOD_RESCUE_MODEL_NAME")
model_version = os.environ.get("FOOD_RESCUE_MODEL_VERSION")
Expand All @@ -141,18 +149,30 @@ def load_policy_from_env() -> tuple[DQNAgent, dict[str, Any]]:
if model_path:
return _load_from_path(model_path)

# Fallback: look for any DQN policy
# Try MLflow registry default, but don't fail if it's unavailable (e.g., CI environment)
# Can be disabled entirely with FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY=1
if not os.environ.get("FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY"):
try:
return _load_from_mlflow_registry("food_rescue_dqn", "latest")
except Exception as e:
print(f" MLflow registry not available ({type(e).__name__}), trying local fallback...")
else:
print(" MLflow registry disabled via FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY")

# Fallback: look for any DQN policy on local disk
candidates = [
Path("experiments/policies/dqn_v5_masked.pt"),
Path("experiments/policies/dqn_tuned.pt"),
Path("experiments/policies/dqn_v3_normalized.pt"),
]
for c in candidates:
if c.exists():
print(f" Found local policy: {c}")
return _load_from_path(str(c))

raise FileNotFoundError(
"No policy could be loaded. Set FOOD_RESCUE_MODEL_NAME + "
"FOOD_RESCUE_MODEL_VERSION (for MLflow registry), or "
"FOOD_RESCUE_MODEL_PATH (for local file), or place a DQN policy at "
"experiments/policies/dqn_tuned.pt or dqn_v1.pt."
"No policy could be loaded. Please set one of:\n"
" - FOOD_RESCUE_MODEL_NAME + FOOD_RESCUE_MODEL_VERSION (MLflow registry)\n"
" - FOOD_RESCUE_MODEL_PATH (local file or directory)\n"
" - Place a DQN policy at experiments/policies/dqn_v5_masked.pt or similar"
)
59 changes: 59 additions & 0 deletions configs/dqn_v4_dense.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
run:
run_id: dqn_v4_dense
agent: dqn
scenario: weekday
num_episodes: 1500
seed: 42
output_dir: experiments
description: |
v4: dense pickup signal + γ back to 0.95 + higher ε floor.

Diagnosis of v3 (see scripts/eval_dqn_in_python.py output):
- Q-values collapsed to a flat ~18-20 across all 11 actions
- Average q-spread per step: 1.19 (healthy is 5-15)
- Step 0 Q-values identical across 5 different random seeds
- Model treated every action as equally good; survived only by
env forgiveness and random tie-breaking
- Root cause: γ=0.99 with sparse delivery-only reward smoothed
value across all states; the network learned ONE mean value

v4 fixes:
- Add pickup reward (dense intermediate signal at 0.2 per unit
loaded). Breaks the value-smoothing symmetry — picking up at a
donor with food is now a learnable positive event, not just a
precursor to delivery.
- γ=0.95 (was 0.99). Preserves discrimination between near-term
and far-term consequences. With 200-step episodes, γ=0.95
values reward 14 steps out at ~50% — appropriate horizon.
- ε_end=0.10 (was 0.02). Keep exploring throughout training;
model needs to discover non-greedy good actions.
- min_replay_to_train=1000 (was 5000). Start learning sooner.

agent_params:
hidden_sizes: [128, 128]
learning_rate: 0.0005
discount: 0.95
epsilon_start: 1.0
epsilon_end: 0.10
epsilon_decay_episodes: 1200
replay_buffer_size: 100000
batch_size: 64
min_replay_to_train: 1000
target_update_interval: 500
grad_clip: 1.0
device: auto

# Normalized scale (delivery=1, not 10) keeps DQN TD targets bounded.
# New pickup reward gives the dense intermediate signal v3 was missing.
reward_weights:
delivery: 1.0
pickup: 0.2
spoilage: 0.5
distance: 0.01
unmet_demand: 0.1
priority_bonus: 0.05
oversupply_penalty: 0.03

eval:
n_episodes: 5
eval_seeds: [100, 101, 102, 103, 104]
51 changes: 51 additions & 0 deletions configs/dqn_v5_masked.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
run:
run_id: dqn_v5_masked
agent: dqn
scenario: weekday
num_episodes: 1500
seed: 42
output_dir: experiments
description: |
v5: DQN with full training-time action masking.

Diagnosis of v4: pickup reward alone didn't break the flat-Q problem.
Inference-time masking on v4 (no retraining) lifted delivered from 75
to 85, proving the constraint mattered. But Q-values stayed flat
(spread 1.16) because training itself was polluted: the agent spent
many episodes taking actions the env would convert to idle.

v5 fixes:
- Action mask consulted during action selection (already in v4 at
inference; now also at training-time exploration)
- Bellman target uses max_a' Q(s', a') taken only over a' valid in s'
- Replay buffer stores next_state mask alongside each transition

Hyperparameters unchanged from v4 to make the comparison clean —
the only variable is masking. Any improvement is attributable to it.

agent_params:
hidden_sizes: [128, 128]
learning_rate: 0.0005
discount: 0.95
epsilon_start: 1.0
epsilon_end: 0.10
epsilon_decay_episodes: 1200
replay_buffer_size: 100000
batch_size: 64
min_replay_to_train: 1000
target_update_interval: 500
grad_clip: 1.0
device: auto

reward_weights:
delivery: 1.0
pickup: 0.2
spoilage: 0.5
distance: 0.01
unmet_demand: 0.1
priority_bonus: 0.05
oversupply_penalty: 0.03

eval:
n_episodes: 5
eval_seeds: [100, 101, 102, 103, 104]
23 changes: 23 additions & 0 deletions experiments/policies/dqn_v4_dense.meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"config": {
"hidden_sizes": [
128,
128
],
"learning_rate": 0.0005,
"discount": 0.95,
"epsilon_start": 1.0,
"epsilon_end": 0.1,
"epsilon_decay_episodes": 1200,
"replay_buffer_size": 100000,
"batch_size": 64,
"min_replay_to_train": 1000,
"target_update_interval": 500,
"grad_clip": 1.0,
"device": "auto"
},
"obs_dim": 31,
"num_actions": 11,
"step_count": 299001,
"episode_count": 1500
}
Binary file added experiments/policies/dqn_v4_dense.pt
Binary file not shown.
Loading
Loading