MrPhantom2325 · MrPhantom2325 · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -151,8 +151,11 @@ jobs:
       - name: Run container and check /health
         run: |
           # Start the API in the background; it will boot in degraded mode
-          # since no policy is mounted, but /health should still respond.
-          docker run -d --name api-smoke -p 8000:8000 food-rescue-serve:smoke
+          # since no policy is mounted and MLflow is not available in CI.
+          # FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY=1 prevents hanging on MLflow connection.
+          docker run -d --name api-smoke \
+            -e FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY=1 \
+            -p 8000:8000 food-rescue-serve:smoke
           # Give uvicorn ~10s to come up
           for i in $(seq 1 30); do
             if curl -fsS http://localhost:8000/health > /dev/null; then

diff --git a/agents/dqn.py b/agents/dqn.py
@@ -84,31 +84,49 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 # -----------------------------
 
 class ReplayBuffer:
-    """Fixed-size circular buffer of (s, a, r, s', done) tuples."""
+    """Fixed-size circular buffer of (s, a, r, s', done, next_mask) tuples.
 
-    def __init__(self, capacity: int, seed: Optional[int] = None):
+    The next_mask field is a boolean array of legal actions in the next state.
+    It enables training-time action masking in the Bellman target: we only
+    take max over valid next-state actions, so the target isn't polluted by
+    Q-values of actions the env wouldn't allow.
+
+    Older transitions stored without a mask are auto-padded to all-True for
+    backward compatibility with checkpoints / replay snapshots saved before v5.
+    """
+
+    def __init__(self, capacity: int, num_actions: int, seed: Optional[int] = None):
         self._buf: deque = deque(maxlen=capacity)
         self._rng = random.Random(seed)
+        self.num_actions = num_actions
 
     def push(self, obs: np.ndarray, action: int, reward: float,
-             next_obs: np.ndarray, done: bool) -> None:
-        self._buf.append((obs, action, reward, next_obs, done))
+             next_obs: np.ndarray, done: bool,
+             next_mask: Optional[np.ndarray] = None) -> None:
+        self._buf.append((obs, action, reward, next_obs, done, next_mask))
 
     def sample(self, batch_size: int) -> tuple[np.ndarray, ...]:
         batch = self._rng.sample(self._buf, batch_size)
-        obs, actions, rewards, next_obs, dones = zip(*batch)
+        obs, actions, rewards, next_obs, dones, next_masks = zip(*batch)
+
+        # Pad missing masks with all-True so we don't break legacy training
+        padded_masks = [
+            m if m is not None else np.ones(self.num_actions, dtype=bool)
+            for m in next_masks
+        ]
+
         return (
             np.stack(obs).astype(np.float32),
             np.array(actions, dtype=np.int64),
             np.array(rewards, dtype=np.float32),
             np.stack(next_obs).astype(np.float32),
             np.array(dones, dtype=np.float32),
+            np.stack(padded_masks).astype(bool),
         )
 
     def __len__(self) -> int:
         return len(self._buf)
 
-
 # -----------------------------
 # DQN Agent
 # -----------------------------
@@ -138,6 +156,11 @@ def __init__(
         self.config = config if config is not None else DQNConfig()
         self.obs_dim = obs_dim
         self.num_actions = num_actions
+        self.replay = ReplayBuffer(
+            capacity=self.config.replay_buffer_size,
+            num_actions=num_actions,
+            seed=seed,
+        )
 
         if seed is not None:
             torch.manual_seed(seed)
@@ -153,7 +176,6 @@ def __init__(
         self.target_net.eval()
 
         self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.config.learning_rate)
-        self.replay = ReplayBuffer(self.config.replay_buffer_size, seed=seed)
 
         self._step_count = 0       # global step counter (for target updates)
         self._episode_count = 0    # for ε annealing
@@ -169,47 +191,73 @@ def epsilon(self) -> float:
         return c.epsilon_start + (c.epsilon_end - c.epsilon_start) * progress
 
     def select_action(self, env: FoodRescueEnv, obs: np.ndarray) -> int:
+        # Get action mask from env (or use all-valid if env doesn't support it,
+        # to maintain backward compat with older code paths)
+        if hasattr(env, "action_mask"):
+            mask = env.action_mask()
+        else:
+            mask = np.ones(self.num_actions, dtype=bool)
+
+        valid_actions = np.flatnonzero(mask)
+
         eps = self.epsilon() if self._training else 0.0
         if self._training and self._np_rng.random() < eps:
-            return int(self._np_rng.integers(0, self.num_actions))
+            # Random exploration, but only among valid actions
+            return int(self._np_rng.choice(valid_actions))
 
         with torch.no_grad():
             obs_t = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(self.device)
             q_values = self.q_net(obs_t).squeeze(0).cpu().numpy()
-        # Random tie-breaking
-        max_val = q_values.max()
-        best = np.flatnonzero(q_values == max_val)
-        return int(self._np_rng.choice(best))
 
+        # Mask out invalid actions by setting their Q-values to -inf
+        masked_q = np.where(mask, q_values, -np.inf)
+
+        # Random tie-breaking among the best valid actions
+        max_val = masked_q.max()
+        best = np.flatnonzero(masked_q == max_val)
+        return int(self._np_rng.choice(best))
     # ---- Training ----
 
-    def store_transition(self, obs, action, reward, next_obs, done) -> None:
-        self.replay.push(obs, action, reward, next_obs, done)
+    def store_transition(self, obs, action, reward, next_obs, done, next_mask=None) -> None:
+        self.replay.push(obs, action, reward, next_obs, done, next_mask)
 
     def train_step(self) -> Optional[float]:
         """
         One gradient step on a sampled batch. Returns the loss, or None if
         the replay buffer is too small to train yet.
+
+        Uses action masking in the Bellman target: max_a' Q(s', a') is taken
+        only over actions that are valid in s'. This sharpens credit
+        assignment when the env has structural action constraints.
         """
         if not self._training:
             return None
         if len(self.replay) < self.config.min_replay_to_train:
             return None
 
-        obs_b, act_b, rew_b, next_obs_b, done_b = self.replay.sample(self.config.batch_size)
+        obs_b, act_b, rew_b, next_obs_b, done_b, next_mask_b = self.replay.sample(
+            self.config.batch_size
+        )
 
         obs_t = torch.from_numpy(obs_b).to(self.device)
         act_t = torch.from_numpy(act_b).to(self.device)
         rew_t = torch.from_numpy(rew_b).to(self.device)
         next_obs_t = torch.from_numpy(next_obs_b).to(self.device)
         done_t = torch.from_numpy(done_b).to(self.device)
+        # Convert mask to a float tensor that we'll use to set invalid Q-values
+        # to a very negative number before taking the max.
+        next_mask_t = torch.from_numpy(next_mask_b).to(self.device)
 
         # Current Q(s, a)
         q_pred = self.q_net(obs_t).gather(1, act_t.unsqueeze(1)).squeeze(1)
 
-        # Bootstrap target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal
+        # Bootstrap target: r + gamma * max_a' Q_target(s', a'), masked.
+        # Invalid actions get -1e9 so they never win the argmax.
         with torch.no_grad():
-            q_next_max = self.target_net(next_obs_t).max(dim=1).values
+            q_next = self.target_net(next_obs_t)
+            # Where mask is False, replace Q with -inf (very negative float).
+            q_next_masked = q_next.masked_fill(~next_mask_t, -1e9)
+            q_next_max = q_next_masked.max(dim=1).values
             target = rew_t + self.config.discount * q_next_max * (1.0 - done_t)
 
         loss = nn.functional.smooth_l1_loss(q_pred, target)
@@ -219,6 +267,7 @@ def train_step(self) -> Optional[float]:
         torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), self.config.grad_clip)
         self.optimizer.step()
 
+        # Periodic target network sync
         self._step_count += 1
         if self._step_count % self.config.target_update_interval == 0:
             self.target_net.load_state_dict(self.q_net.state_dict())

diff --git a/api/policy_loader.py b/api/policy_loader.py
@@ -5,8 +5,9 @@
 
 1. From the MLflow Model Registry (production-style): set
    FOOD_RESCUE_MODEL_NAME and FOOD_RESCUE_MODEL_VERSION env vars
-2. From a local file path: set FOOD_RESCUE_MODEL_PATH
-3. Built-in default: look for experiments/policies/dqn_tuned.pt
+2. From the MLflow Model Registry defaults: food_rescue_dqn @ latest
+3. From a local file path: set FOOD_RESCUE_MODEL_PATH
+4. Built-in fallback: look for experiments/policies/dqn_v5_masked.pt
    (or any DQN policy file in that folder)
 
 Only DQN policies are supported for serving — they take an obs vector directly,
@@ -66,7 +67,11 @@ def _load_from_mlflow_registry(
 
     # Download the artifact directory
     local_dir = mlflow.artifacts.download_artifacts(source_uri)
-    return _load_dqn_from_dir(Path(local_dir), source=f"mlflow:{model_name}:{version}")
+    agent, info = _load_dqn_from_dir(Path(local_dir), source=f"mlflow:{model_name}:{version}")
+    # Override the cosmetic fields so /info reflects the real registry version
+    info["model_name"] = model_name
+    info["model_version"] = str(version)
+    return agent, info
 
 
 def _load_from_path(path: str) -> tuple[DQNAgent, dict[str, Any]]:
@@ -129,7 +134,10 @@ def load_policy_from_env() -> tuple[DQNAgent, dict[str, Any]]:
     Resolution order:
     1. FOOD_RESCUE_MODEL_NAME + FOOD_RESCUE_MODEL_VERSION -> MLflow Registry
     2. FOOD_RESCUE_MODEL_PATH -> local file or directory
-    3. Default: experiments/policies/dqn_tuned.pt or dqn_v1.pt
+    3. Default MLflow Registry model food_rescue_dqn @ latest (skipped if unavailable)
+    4. Local DQN fallback files
+
+    Raises FileNotFoundError if no policy can be loaded via any method.
     """
     model_name = os.environ.get("FOOD_RESCUE_MODEL_NAME")
     model_version = os.environ.get("FOOD_RESCUE_MODEL_VERSION")
@@ -141,18 +149,30 @@ def load_policy_from_env() -> tuple[DQNAgent, dict[str, Any]]:
     if model_path:
         return _load_from_path(model_path)
 
-    # Fallback: look for any DQN policy
+    # Try MLflow registry default, but don't fail if it's unavailable (e.g., CI environment)
+    # Can be disabled entirely with FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY=1
+    if not os.environ.get("FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY"):
+        try:
+            return _load_from_mlflow_registry("food_rescue_dqn", "latest")
+        except Exception as e:
+            print(f"  MLflow registry not available ({type(e).__name__}), trying local fallback...")
+    else:
+        print("  MLflow registry disabled via FOOD_RESCUE_DISABLE_MLFLOW_REGISTRY")
+
+    # Fallback: look for any DQN policy on local disk
     candidates = [
+        Path("experiments/policies/dqn_v5_masked.pt"),
         Path("experiments/policies/dqn_tuned.pt"),
         Path("experiments/policies/dqn_v3_normalized.pt"),
     ]
     for c in candidates:
         if c.exists():
+            print(f"  Found local policy: {c}")
             return _load_from_path(str(c))
 
     raise FileNotFoundError(
-        "No policy could be loaded. Set FOOD_RESCUE_MODEL_NAME + "
-        "FOOD_RESCUE_MODEL_VERSION (for MLflow registry), or "
-        "FOOD_RESCUE_MODEL_PATH (for local file), or place a DQN policy at "
-        "experiments/policies/dqn_tuned.pt or dqn_v1.pt."
+        "No policy could be loaded. Please set one of:\n"
+        "  - FOOD_RESCUE_MODEL_NAME + FOOD_RESCUE_MODEL_VERSION (MLflow registry)\n"
+        "  - FOOD_RESCUE_MODEL_PATH (local file or directory)\n"
+        "  - Place a DQN policy at experiments/policies/dqn_v5_masked.pt or similar"
     )
diff --git a/configs/dqn_v4_dense.yaml b/configs/dqn_v4_dense.yaml
@@ -0,0 +1,59 @@
+run:
+  run_id: dqn_v4_dense
+  agent: dqn
+  scenario: weekday
+  num_episodes: 1500
+  seed: 42
+  output_dir: experiments
+  description: |
+    v4: dense pickup signal + γ back to 0.95 + higher ε floor.
+
+    Diagnosis of v3 (see scripts/eval_dqn_in_python.py output):
+    - Q-values collapsed to a flat ~18-20 across all 11 actions
+    - Average q-spread per step: 1.19 (healthy is 5-15)
+    - Step 0 Q-values identical across 5 different random seeds
+    - Model treated every action as equally good; survived only by
+      env forgiveness and random tie-breaking
+    - Root cause: γ=0.99 with sparse delivery-only reward smoothed
+      value across all states; the network learned ONE mean value
+
+    v4 fixes:
+    - Add pickup reward (dense intermediate signal at 0.2 per unit
+      loaded). Breaks the value-smoothing symmetry — picking up at a
+      donor with food is now a learnable positive event, not just a
+      precursor to delivery.
+    - γ=0.95 (was 0.99). Preserves discrimination between near-term
+      and far-term consequences. With 200-step episodes, γ=0.95
+      values reward 14 steps out at ~50% — appropriate horizon.
+    - ε_end=0.10 (was 0.02). Keep exploring throughout training;
+      model needs to discover non-greedy good actions.
+    - min_replay_to_train=1000 (was 5000). Start learning sooner.
+
+agent_params:
+  hidden_sizes: [128, 128]
+  learning_rate: 0.0005
+  discount: 0.95
+  epsilon_start: 1.0
+  epsilon_end: 0.10
+  epsilon_decay_episodes: 1200
+  replay_buffer_size: 100000
+  batch_size: 64
+  min_replay_to_train: 1000
+  target_update_interval: 500
+  grad_clip: 1.0
+  device: auto
+
+# Normalized scale (delivery=1, not 10) keeps DQN TD targets bounded.
+# New pickup reward gives the dense intermediate signal v3 was missing.
+reward_weights:
+  delivery: 1.0
+  pickup: 0.2
+  spoilage: 0.5
+  distance: 0.01
+  unmet_demand: 0.1
+  priority_bonus: 0.05
+  oversupply_penalty: 0.03
+
+eval:
+  n_episodes: 5
+  eval_seeds: [100, 101, 102, 103, 104]
diff --git a/configs/dqn_v5_masked.yaml b/configs/dqn_v5_masked.yaml
@@ -0,0 +1,51 @@
+run:
+  run_id: dqn_v5_masked
+  agent: dqn
+  scenario: weekday
+  num_episodes: 1500
+  seed: 42
+  output_dir: experiments
+  description: |
+    v5: DQN with full training-time action masking.
+
+    Diagnosis of v4: pickup reward alone didn't break the flat-Q problem.
+    Inference-time masking on v4 (no retraining) lifted delivered from 75
+    to 85, proving the constraint mattered. But Q-values stayed flat
+    (spread 1.16) because training itself was polluted: the agent spent
+    many episodes taking actions the env would convert to idle.
+
+    v5 fixes:
+    - Action mask consulted during action selection (already in v4 at
+      inference; now also at training-time exploration)
+    - Bellman target uses max_a' Q(s', a') taken only over a' valid in s'
+    - Replay buffer stores next_state mask alongside each transition
+
+    Hyperparameters unchanged from v4 to make the comparison clean —
+    the only variable is masking. Any improvement is attributable to it.
+
+agent_params:
+  hidden_sizes: [128, 128]
+  learning_rate: 0.0005
+  discount: 0.95
+  epsilon_start: 1.0
+  epsilon_end: 0.10
+  epsilon_decay_episodes: 1200
+  replay_buffer_size: 100000
+  batch_size: 64
+  min_replay_to_train: 1000
+  target_update_interval: 500
+  grad_clip: 1.0
+  device: auto
+
+reward_weights:
+  delivery: 1.0
+  pickup: 0.2
+  spoilage: 0.5
+  distance: 0.01
+  unmet_demand: 0.1
+  priority_bonus: 0.05
+  oversupply_penalty: 0.03
+
+eval:
+  n_episodes: 5
+  eval_seeds: [100, 101, 102, 103, 104]
diff --git a/experiments/policies/dqn_v4_dense.meta.json b/experiments/policies/dqn_v4_dense.meta.json
@@ -0,0 +1,23 @@
+{
+  "config": {
+    "hidden_sizes": [
+      128,
+      128
+    ],
+    "learning_rate": 0.0005,
+    "discount": 0.95,
+    "epsilon_start": 1.0,
+    "epsilon_end": 0.1,
+    "epsilon_decay_episodes": 1200,
+    "replay_buffer_size": 100000,
+    "batch_size": 64,
+    "min_replay_to_train": 1000,
+    "target_update_interval": 500,
+    "grad_clip": 1.0,
+    "device": "auto"
+  },
+  "obs_dim": 31,
+  "num_actions": 11,
+  "step_count": 299001,
+  "episode_count": 1500
+}
diff --git a/experiments/policies/dqn_v4_dense.pt b/experiments/policies/dqn_v4_dense.pt