fix: randomize block retry delays to try to reduce overhead, lengthen block retry interval

gmega · gmega · commit 0962c65ffd2e · 2025-07-09T14:44:14.000-03:00
diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim
@@ -12,6 +12,7 @@ import std/sets
 import std/options
 import std/algorithm
 import std/sugar
+import std/random
 
 import pkg/chronos
 import pkg/libp2p/[cid, switch, multihash, multicodec]
@@ -199,7 +200,6 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr
 
     # In dynamic swarms, staleness will dominate latency.
     if peer.lastRefresh < self.pendingBlocks.lastInclusion or peer.isKnowledgeStale:
-      trace "Refreshing block knowledge for peer", peer = peer.id
       peer.refreshRequested()
       # TODO: optimize this by keeping track of what was sent and sending deltas.
       #   This should allow us to run much more frequent refreshes, and be way more
@@ -269,8 +269,9 @@ proc downloadInternal(
 
         # We now wait for a bit and then retry. If the handle gets completed in the
         # meantime (cause the presence handler might have requested the block and
-        # received it in the meantime), we are done.
-        await handle or sleepAsync(self.pendingBlocks.retryInterval)
+        # received it in the meantime), we are done. Retry delays are randomized
+        # so we don't get all block loops spinning at the same time.
+        await handle or sleepAsync(secs(rand(self.pendingBlocks.retryInterval.secs)))
         if handle.finished:
           break
         # If we still don't have the block, we'll go for another cycle.
@@ -484,6 +485,9 @@ proc cancelBlocks(
         # If so, schedules a cancellation.
         scheduledCancellations[peerCtx.id] = intersection
 
+    if scheduledCancellations.len == 0:
+      return
+
     let (succeededFuts, failedFuts) = await allFinishedFailed[PeerId](
       toSeq(scheduledCancellations.pairs).map(dispatchCancellations)
     )
diff --git a/codex/blockexchange/engine/pendingblocks.nim b/codex/blockexchange/engine/pendingblocks.nim
@@ -34,7 +34,7 @@ declareGauge(
 
 const
   DefaultBlockRetries* = 3000
-  DefaultRetryInterval* = 1.seconds
+  DefaultRetryInterval* = 10.seconds
 
 type
   RetriesExhaustedError* = object of CatchableError
diff --git a/codex/blockexchange/peers/peerctxstore.nim b/codex/blockexchange/peers/peerctxstore.nim
@@ -78,7 +78,7 @@ func peersWant*(self: PeerCtxStore, cid: Cid): seq[BlockExcPeerCtx] =
 proc getPeersForBlock*(self: PeerCtxStore, address: BlockAddress): PeersForBlock =
   var res: PeersForBlock = (@[], @[])
   for peer in self:
-    if address in peer.peerHave:
+    if address in peer:
       res.with.add(peer)
     else:
       res.without.add(peer)
diff --git a/tests/codex/blockexchange/engine/testblockexc.nim b/tests/codex/blockexchange/engine/testblockexc.nim
@@ -213,4 +213,4 @@ asyncchecksuite "NetworkStore - dissemination":
     await nodes.linearTopology()
 
     let downloads = nodes.mapIt(downloadDataset(it, dataset))
-    await allFuturesThrowing(downloads).wait(20.seconds)
+    await allFuturesThrowing(downloads).wait(30.seconds)