From 6b8df0bd094b5befb03e769c2681cd84fb893232 Mon Sep 17 00:00:00 2001
From: nghetienhiep
Date: Fri, 26 Jun 2026 20:21:40 +0700
Subject: [PATCH] perf(per-miner): cache reverse cid->seq index in recover_tier_seq_for
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

recover_tier_seq_for re-scanned the full allotment (~10k HMAC-SHA256 per
call) on every lookup. Since #296 calls it on the submit path whenever the
assignment row is replica-lagged — and that runs inside the submit gate
slot — a high replica-miss rate makes the O(allotment) re-scan dominate
submit latency and spike CPU under load.

instance_id is a deterministic HMAC, so the cid->seq map is stable; build
it once per (hotkey, epoch, tier) and cache it (lru_cache, bounded by
CATHEDRAL_PERMINER_RECOVER_INDEX_CACHE, default 64) for amortized O(1)
lookups. Behaviour is unchanged (still identity-bound: a foreign or bogus
challenge_id still resolves to None).

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 scaffold/publisher/per_miner.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/scaffold/publisher/per_miner.py b/scaffold/publisher/per_miner.py
index d1da66082..001061ac4 100644
--- a/scaffold/publisher/per_miner.py
+++ b/scaffold/publisher/per_miner.py
@@ -368,17 +368,33 @@ def verify_miner_submission_for(
     return True, None
 
 
+_RECOVER_INDEX_CACHE = max(1, _env_int("CATHEDRAL_PERMINER_RECOVER_INDEX_CACHE", 64))
+
+
+@lru_cache(maxsize=_RECOVER_INDEX_CACHE)
+def _instance_index(hotkey: str, epoch: int, tier: int) -> dict[str, int]:
+    """Reverse map challenge_id -> seq for one (hotkey, epoch, tier), built once and cached.
+
+    ``instance_id`` is a deterministic HMAC, so this map is stable for the process lifetime.
+    It turns ``recover_tier_seq_for`` from an O(allotment) HMAC re-scan on every call into an
+    amortized O(1) lookup. That recovery path runs inside the submit gate slot (it backs the
+    assignment-row replica-lag tolerance added in #296), so re-scanning ~allotment HMACs on
+    every replica-lagged submit can dominate submit latency under load. Bounded by
+    ``CATHEDRAL_PERMINER_RECOVER_INDEX_CACHE`` (default 64 maps).
+    """
+    return {instance_id(hotkey, epoch, tier, seq): seq for seq in range(allotment_for(tier))}
+
+
 def recover_tier_seq_for(hotkey: str, epoch: int, challenge_id: str) -> tuple[int, int] | None:
-    """Find (tier, seq) for a challenge_id by scanning the miner's allotment.
+    """Find (tier, seq) for a challenge_id from the miner's allotment (cached O(1) lookup).
 
     Returns None if the challenge_id was not generated for this hotkey+epoch.
     """
     parsed = parse_challenge_id(challenge_id)
     candidate_tiers = [parsed["tier"]] if parsed and parsed["tier"] in TIERS else TIERS
     for tier in candidate_tiers:
-        for seq in range(allotment_for(tier)):
-            cid = instance_id(hotkey, epoch, tier, seq)
-            if cid == challenge_id:
-                return tier, seq
+        seq = _instance_index(hotkey, epoch, tier).get(challenge_id)
+        if seq is not None:
+            return tier, seq
     return None
 