From 3bfae70e6330807404be64c3c4d58aa6ece167c2 Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Tue, 5 May 2026 16:34:13 +0300
Subject: [PATCH 01/13] perf: improve round finalization metrics

---
 cmd/performance-test/main.go      | 261 +++++++++++++++++++++++-------
 cmd/performance-test/types.go     |  72 ++++++---
 internal/round/batch_processor.go |  90 +++++++----
 3 files changed, 309 insertions(+), 114 deletions(-)

diff --git a/cmd/performance-test/main.go b/cmd/performance-test/main.go
index 32ed146..195b043 100644
--- a/cmd/performance-test/main.go
+++ b/cmd/performance-test/main.go
@@ -828,6 +828,17 @@ func printShardFinalReport(metrics *Metrics, shardClients []*ShardClient) {
 	}
 }
 
+func parseOptionalLogDuration(raw string) (time.Duration, bool, error) {
+	if raw == "" {
+		return 0, false, nil
+	}
+	duration, err := time.ParseDuration(raw)
+	if err != nil {
+		return 0, true, err
+	}
+	return duration, true, nil
+}
+
 func parseAggregatorRoundLogs(path string, start, end time.Time) ([]aggregatorRoundSummary, error) {
 	file, err := os.Open(path)
 	if err != nil {
@@ -882,32 +893,91 @@ func parseAggregatorRoundLogs(path string, start, end time.Time) ([]aggregatorRo
 		if err != nil {
 			continue
 		}
-		medianDur, err := time.ParseDuration(raw.ProofReadyMedian)
+
+		finalizeScan, hasFinalizeScan, err := parseOptionalLogDuration(raw.FinalizeScan)
+		if err != nil {
+			continue
+		}
+		finalizeConvert, hasFinalizeConvert, err := parseOptionalLogDuration(raw.FinalizeConvert)
+		if err != nil {
+			continue
+		}
+		finalizeStoreBlock, hasFinalizeStoreBlock, err := parseOptionalLogDuration(raw.FinalizeStoreBlock)
 		if err != nil {
 			continue
 		}
-		p95Dur, err := time.ParseDuration(raw.ProofReadyP95)
+		finalizeStoreData, hasFinalizeStoreData, err := parseOptionalLogDuration(raw.FinalizeStoreData)
 		if err != nil {
 			continue
 		}
-		p99Dur, err := time.ParseDuration(raw.ProofReadyP99)
+		finalizeStoreSmt, hasFinalizeStoreSmt, err := parseOptionalLogDuration(raw.FinalizeStoreSmt)
 		if err != nil {
 			continue
 		}
+		finalizeStoreRecords, hasFinalizeStoreRecords, err := parseOptionalLogDuration(raw.FinalizeStoreRecords)
+		if err != nil {
+			continue
+		}
+		finalizeLockWait, hasFinalizeLockWait, err := parseOptionalLogDuration(raw.FinalizeLockWait)
+		if err != nil {
+			continue
+		}
+		finalizeSmtCommit, hasFinalizeSmtCommit, err := parseOptionalLogDuration(raw.FinalizeSmtCommit)
+		if err != nil {
+			continue
+		}
+		finalizeSetFinalized, hasFinalizeSetFinalized, err := parseOptionalLogDuration(raw.FinalizeSetFinalized)
+		if err != nil {
+			continue
+		}
+		finalizeAck, hasFinalizeAck, err := parseOptionalLogDuration(raw.FinalizeAck)
+		if err != nil {
+			continue
+		}
+		hasFinalizationBreakdown := hasFinalizeScan || hasFinalizeConvert || hasFinalizeStoreBlock || hasFinalizeStoreData || hasFinalizeStoreSmt || hasFinalizeStoreRecords || hasFinalizeLockWait || hasFinalizeSmtCommit || hasFinalizeSetFinalized || hasFinalizeAck
+
+		medianDur, hasMedian, err := parseOptionalLogDuration(raw.ProofReadyMedian)
+		if err != nil {
+			continue
+		}
+		p95Dur, hasP95, err := parseOptionalLogDuration(raw.ProofReadyP95)
+		if err != nil {
+			continue
+		}
+		p99Dur, hasP99, err := parseOptionalLogDuration(raw.ProofReadyP99)
+		if err != nil {
+			continue
+		}
+		hasProofReady := hasMedian || hasP95 || hasP99
+		if hasProofReady && (!hasMedian || !hasP95 || !hasP99) {
+			continue
+		}
 
 		summaries = append(summaries, aggregatorRoundSummary{
-			Timestamp:    timestamp,
-			Block:        raw.Block,
-			Commitments:  raw.Commitments,
-			RoundTime:    roundDur,
-			Processing:   procDur,
-			BftWait:      bftDur,
-			Finalization: finalDur,
-			ProofMedian:  medianDur,
-			ProofP95:     p95Dur,
-			ProofP99:     p99Dur,
-			RedisTotal:   raw.RedisTotal,
-			RedisPending: raw.RedisPending,
+			Timestamp:                timestamp,
+			Block:                    raw.Block,
+			Commitments:              raw.Commitments,
+			RoundTime:                roundDur,
+			Processing:               procDur,
+			BftWait:                  bftDur,
+			Finalization:             finalDur,
+			HasFinalizationBreakdown: hasFinalizationBreakdown,
+			FinalizeScan:             finalizeScan,
+			FinalizeConvert:          finalizeConvert,
+			FinalizeStoreBlock:       finalizeStoreBlock,
+			FinalizeStoreData:        finalizeStoreData,
+			FinalizeStoreSmt:         finalizeStoreSmt,
+			FinalizeStoreRecords:     finalizeStoreRecords,
+			FinalizeLockWait:         finalizeLockWait,
+			FinalizeSmtCommit:        finalizeSmtCommit,
+			FinalizeSetFinalized:     finalizeSetFinalized,
+			FinalizeAck:              finalizeAck,
+			HasProofReady:            hasProofReady,
+			ProofMedian:              medianDur,
+			ProofP95:                 p95Dur,
+			ProofP99:                 p99Dur,
+			RedisTotal:               raw.RedisTotal,
+			RedisPending:             raw.RedisPending,
 		})
 	}
 
@@ -1018,57 +1088,87 @@ func discoverAggregatorLogSources(shardClients []*ShardClient) []aggregatorLogSo
 	return sources
 }
 
-func printAggregatorServerStatsSummary(header string, summaries []aggregatorRoundSummary) {
-	if len(summaries) == 0 {
+func printFinalizationBreakdownSummary(label string, entries []aggregatorRoundSummary) {
+	prefix := "Average"
+	if label != "" {
+		prefix = label + " average"
+	}
+
+	withBreakdown := make([]aggregatorRoundSummary, 0, len(entries))
+	for _, entry := range entries {
+		if entry.HasFinalizationBreakdown {
+			withBreakdown = append(withBreakdown, entry)
+		}
+	}
+	if len(withBreakdown) == 0 {
 		return
 	}
-	ordered := append([]aggregatorRoundSummary(nil), summaries...)
-	sort.Slice(ordered, func(i, j int) bool {
-		return ordered[i].Timestamp.Before(ordered[j].Timestamp)
-	})
 
-	usable := ordered
-	if len(ordered) > 2 {
-		usable = ordered[1 : len(ordered)-1]
+	var scanSum, convertSum, storeBlockSum, storeDataSum time.Duration
+	var storeSmtSum, storeRecordsSum, lockWaitSum time.Duration
+	var smtCommitSum, setFinalizedSum, ackSum time.Duration
+	for _, entry := range withBreakdown {
+		scanSum += entry.FinalizeScan
+		convertSum += entry.FinalizeConvert
+		storeBlockSum += entry.FinalizeStoreBlock
+		storeDataSum += entry.FinalizeStoreData
+		storeSmtSum += entry.FinalizeStoreSmt
+		storeRecordsSum += entry.FinalizeStoreRecords
+		lockWaitSum += entry.FinalizeLockWait
+		smtCommitSum += entry.FinalizeSmtCommit
+		setFinalizedSum += entry.FinalizeSetFinalized
+		ackSum += entry.FinalizeAck
+	}
+
+	count := time.Duration(len(withBreakdown))
+	fmt.Printf("%s finalization breakdown: scan=%v convert=%v storeBlock=%v storeData=%v (smt=%v records=%v) lockWait=%v smtCommit=%v setFinalized=%v ack=%v (%d rounds)\n",
+		prefix,
+		(scanSum / count).Truncate(time.Millisecond),
+		(convertSum / count).Truncate(time.Millisecond),
+		(storeBlockSum / count).Truncate(time.Millisecond),
+		(storeDataSum / count).Truncate(time.Millisecond),
+		(storeSmtSum / count).Truncate(time.Millisecond),
+		(storeRecordsSum / count).Truncate(time.Millisecond),
+		(lockWaitSum / count).Truncate(time.Millisecond),
+		(smtCommitSum / count).Truncate(time.Millisecond),
+		(setFinalizedSum / count).Truncate(time.Millisecond),
+		(ackSum / count).Truncate(time.Millisecond),
+		len(withBreakdown))
+}
+
+func printAggregatorAverages(label string, entries []aggregatorRoundSummary) {
+	prefix := "Average"
+	if label != "" {
+		prefix = label + " average"
 	}
-	if len(usable) == 0 {
-		usable = ordered
+
+	if len(entries) == 0 {
+		return
 	}
 
-	var roundSum time.Duration
-	var finalSum time.Duration
-	var procSum, bftSum time.Duration
+	var roundSum, finalSum, procSum, bftSum time.Duration
 	var proofMedSum, proofP95Sum, proofP99Sum time.Duration
 	totalCommitments := 0
-	for _, entry := range usable {
+	proofCount := 0
+	for _, entry := range entries {
 		roundSum += entry.RoundTime
 		finalSum += entry.Finalization
 		procSum += entry.Processing
 		bftSum += entry.BftWait
-		proofMedSum += entry.ProofMedian
-		proofP95Sum += entry.ProofP95
-		proofP99Sum += entry.ProofP99
 		totalCommitments += entry.Commitments
+		if entry.HasProofReady {
+			proofMedSum += entry.ProofMedian
+			proofP95Sum += entry.ProofP95
+			proofP99Sum += entry.ProofP99
+			proofCount++
+		}
 	}
 
-	count := len(usable)
-
-	avgFinal := time.Duration(0)
-	avgProcessing := time.Duration(0)
-	avgBft := time.Duration(0)
-	avgProofMedian := time.Duration(0)
-	avgProofP95 := time.Duration(0)
-	avgProofP99 := time.Duration(0)
-	avgCommit := 0.0
-	if count > 0 {
-		avgFinal = finalSum / time.Duration(count)
-		avgProcessing = procSum / time.Duration(count)
-		avgBft = bftSum / time.Duration(count)
-		avgProofMedian = proofMedSum / time.Duration(count)
-		avgProofP95 = proofP95Sum / time.Duration(count)
-		avgProofP99 = proofP99Sum / time.Duration(count)
-		avgCommit = float64(totalCommitments) / float64(count)
-	}
+	count := time.Duration(len(entries))
+	avgFinal := finalSum / count
+	avgProcessing := procSum / count
+	avgBft := bftSum / count
+	avgCommit := float64(totalCommitments) / float64(len(entries))
 
 	finalPct := 0.0
 	procPct := 0.0
@@ -1079,18 +1179,55 @@ func printAggregatorServerStatsSummary(header string, summaries []aggregatorRoun
 		bftPct = float64(bftSum) / float64(roundSum) * 100
 	}
 
+	fmt.Printf("%s round time: %v\n", prefix, (roundSum / count).Truncate(time.Millisecond))
+	fmt.Printf("%s finalization time: %v (%.1f%% of round time)\n", prefix, avgFinal.Truncate(time.Millisecond), finalPct)
+	fmt.Printf("%s commitments per round: %.0f\n", prefix, avgCommit)
+	fmt.Printf("%s processing time: %v (%.1f%% of round time)\n", prefix, avgProcessing.Truncate(time.Millisecond), procPct)
+	fmt.Printf("%s BFT wait: %v (%.1f%% of round time)\n", prefix, avgBft.Truncate(time.Millisecond), bftPct)
+	if proofCount > 0 {
+		proofCountDuration := time.Duration(proofCount)
+		fmt.Printf("%s proof readiness: median %v, p95 %v, p99 %v (%d rounds)\n",
+			prefix,
+			(proofMedSum / proofCountDuration).Truncate(time.Millisecond),
+			(proofP95Sum / proofCountDuration).Truncate(time.Millisecond),
+			(proofP99Sum / proofCountDuration).Truncate(time.Millisecond),
+			proofCount)
+	} else {
+		fmt.Printf("%s proof readiness: n/a (no proof-ready rounds in window)\n", prefix)
+	}
+	printFinalizationBreakdownSummary(label, entries)
+}
+
+func printAggregatorServerStatsSummary(header string, summaries []aggregatorRoundSummary) {
+	if len(summaries) == 0 {
+		return
+	}
+	ordered := append([]aggregatorRoundSummary(nil), summaries...)
+	sort.Slice(ordered, func(i, j int) bool {
+		return ordered[i].Timestamp.Before(ordered[j].Timestamp)
+	})
+
+	usable := ordered
+	if len(ordered) > 2 {
+		usable = ordered[1 : len(ordered)-1]
+	}
+	if len(usable) == 0 {
+		usable = ordered
+	}
+
 	fmt.Printf("\nAGGREGATOR SERVER STATS [%s] (%d rounds, averages exclude first/last when possible)\n", header, len(ordered))
-	fmt.Printf("Average finalization time: %v (%.1f%% of round time)\n",
-		avgFinal.Truncate(time.Millisecond), finalPct)
-	fmt.Printf("Average commitments per round: %.0f\n", avgCommit)
-	fmt.Printf("Average processing time: %v (%.1f%% of round time)\n",
-		avgProcessing.Truncate(time.Millisecond), procPct)
-	fmt.Printf("Average BFT wait: %v (%.1f%% of round time)\n",
-		avgBft.Truncate(time.Millisecond), bftPct)
-	fmt.Printf("Average proof readiness: median %v, p95 %v, p99 %v\n",
-		avgProofMedian.Truncate(time.Millisecond),
-		avgProofP95.Truncate(time.Millisecond),
-		avgProofP99.Truncate(time.Millisecond))
+	printAggregatorAverages("", usable)
+
+	active := make([]aggregatorRoundSummary, 0, len(usable))
+	for _, entry := range usable {
+		if entry.Commitments > 0 {
+			active = append(active, entry)
+		}
+	}
+	if len(active) > 0 && len(active) != len(usable) {
+		printAggregatorAverages("Active", active)
+	}
+
 	fmt.Printf("Log window: %s to %s\n",
 		ordered[0].Timestamp.Format(time.RFC3339),
 		ordered[len(ordered)-1].Timestamp.Format(time.RFC3339))
diff --git a/cmd/performance-test/types.go b/cmd/performance-test/types.go
index 8a59044..eb5daf4 100644
--- a/cmd/performance-test/types.go
+++ b/cmd/performance-test/types.go
@@ -134,34 +134,56 @@ func (rr *RequestRateCounters) IncProofCompleted()  { rr.proofCompleted.Add(1) }
 func (rr *RequestRateCounters) IncProofRetries()    { rr.proofRetries.Add(1) }
 
 type aggregatorLogRaw struct {
-	Time             string `json:"time"`
-	Msg              string `json:"msg"`
-	Block            string `json:"block"`
-	Commitments      int    `json:"commitments"`
-	RoundTime        string `json:"roundTime"`
-	Processing       string `json:"processing"`
-	BftWait          string `json:"bftWait"`
-	Finalization     string `json:"finalization"`
-	ProofReadyMedian string `json:"proofReadyMedian"`
-	ProofReadyP95    string `json:"proofReadyP95"`
-	ProofReadyP99    string `json:"proofReadyP99"`
-	RedisTotal       int    `json:"redisTotal"`
-	RedisPending     int    `json:"redisPending"`
+	Time                 string `json:"time"`
+	Msg                  string `json:"msg"`
+	Block                string `json:"block"`
+	Commitments          int    `json:"commitments"`
+	RoundTime            string `json:"roundTime"`
+	Processing           string `json:"processing"`
+	BftWait              string `json:"bftWait"`
+	Finalization         string `json:"finalization"`
+	FinalizeScan         string `json:"finalizeScan"`
+	FinalizeConvert      string `json:"finalizeConvert"`
+	FinalizeStoreBlock   string `json:"finalizeStoreBlock"`
+	FinalizeStoreData    string `json:"finalizeStoreData"`
+	FinalizeStoreSmt     string `json:"finalizeStoreSmt"`
+	FinalizeStoreRecords string `json:"finalizeStoreRecords"`
+	FinalizeLockWait     string `json:"finalizeLockWait"`
+	FinalizeSmtCommit    string `json:"finalizeSmtCommit"`
+	FinalizeSetFinalized string `json:"finalizeSetFinalized"`
+	FinalizeAck          string `json:"finalizeAck"`
+	ProofReadyMedian     string `json:"proofReadyMedian"`
+	ProofReadyP95        string `json:"proofReadyP95"`
+	ProofReadyP99        string `json:"proofReadyP99"`
+	RedisTotal           int    `json:"redisTotal"`
+	RedisPending         int    `json:"redisPending"`
 }
 
 type aggregatorRoundSummary struct {
-	Timestamp    time.Time
-	Block        string
-	Commitments  int
-	RoundTime    time.Duration
-	Processing   time.Duration
-	BftWait      time.Duration
-	Finalization time.Duration
-	ProofMedian  time.Duration
-	ProofP95     time.Duration
-	ProofP99     time.Duration
-	RedisTotal   int
-	RedisPending int
+	Timestamp                time.Time
+	Block                    string
+	Commitments              int
+	RoundTime                time.Duration
+	Processing               time.Duration
+	BftWait                  time.Duration
+	Finalization             time.Duration
+	HasFinalizationBreakdown bool
+	FinalizeScan             time.Duration
+	FinalizeConvert          time.Duration
+	FinalizeStoreBlock       time.Duration
+	FinalizeStoreData        time.Duration
+	FinalizeStoreSmt         time.Duration
+	FinalizeStoreRecords     time.Duration
+	FinalizeLockWait         time.Duration
+	FinalizeSmtCommit        time.Duration
+	FinalizeSetFinalized     time.Duration
+	FinalizeAck              time.Duration
+	HasProofReady            bool
+	ProofMedian              time.Duration
+	ProofP95                 time.Duration
+	ProofP99                 time.Duration
+	RedisTotal               int
+	RedisPending             int
 }
 
 func (m *Metrics) addProofLatency(latency time.Duration) {
diff --git a/internal/round/batch_processor.go b/internal/round/batch_processor.go
index 9c47e44..6a7e7ed 100644
--- a/internal/round/batch_processor.go
+++ b/internal/round/batch_processor.go
@@ -400,8 +400,6 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 	finalizationStartTime := time.Now()
 	var proposalTime time.Time
 	var processingTime time.Duration
-	var markProcessedStart, persistDataStart, commitSnapshotStart time.Time
-	var markProcessedTime, persistDataTime, commitSnapshotTime time.Duration
 	commitmentCount := 0
 
 	rm.roundMutex.Lock()
@@ -425,30 +423,24 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 	commitmentCount = len(pendingCommitments)
 	stateIDs := make([]api.StateID, commitmentCount)
 	ackEntries := make([]interfaces.CertificationRequestAck, commitmentCount)
-	var proofTimes []time.Duration
 
-	now := time.Now()
+	finalizationScanStart := time.Now()
 	for i, commitment := range pendingCommitments {
 		stateIDs[i] = commitment.StateID
 		ackEntries[i] = interfaces.CertificationRequestAck{StateID: commitment.StateID, StreamID: commitment.StreamID}
-
-		if commitment.CreatedAt != nil {
-			proofReadyTime := now.Sub(commitment.CreatedAt.Time)
-			if proofReadyTime > 0 {
-				metrics.ProofReadinessDuration.Observe(proofReadyTime.Seconds())
-				proofTimes = append(proofTimes, proofReadyTime)
-			}
-		}
 	}
+	finalizationScanDuration := time.Since(finalizationScanStart)
 
-	persistDataStart = time.Now()
+	finalizationConvertStart := time.Now()
 	smtNodes, err := rm.convertLeavesToNodes(pendingLeaves)
 	if err != nil {
 		return fmt.Errorf("failed to convert leaves to storage nodes: %w", err)
 	}
 	records := rm.convertCommitmentsToRecords(pendingCommitments, block.Index)
+	finalizationConvertDuration := time.Since(finalizationConvertStart)
 
 	block.Finalized = false
+	storeBlockStart := time.Now()
 	if err := rm.storeBlockAndRecords(ctx, block, stateIDs); err != nil {
 		if !errors.Is(err, interfaces.ErrDuplicateKey) {
 			return fmt.Errorf("failed to store block and records: %w", err)
@@ -456,33 +448,44 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 		rm.logger.WithContext(ctx).Info("Block already exists, continuing with remaining steps",
 			"blockNumber", block.Index.String())
 	}
+	storeBlockDuration := time.Since(storeBlockStart)
 
-	if err := rm.storeDataParallel(ctx, block.Index, smtNodes, records); err != nil {
+	storeDataTiming, err := rm.storeDataParallel(ctx, block.Index, smtNodes, records)
+	if err != nil {
 		return fmt.Errorf("failed to store SMT nodes and aggregator records: %w", err)
 	}
-	persistDataTime = time.Since(persistDataStart)
 
+	lockWaitStart := time.Now()
 	rm.finalizationMu.Lock()
+	finalizationLockWaitDuration := time.Since(lockWaitStart)
+
+	smtCommitStart := time.Now()
 	if snapshot != nil {
-		commitSnapshotStart = time.Now()
 		snapshot.Commit(rm.smt)
-		commitSnapshotTime = time.Since(commitSnapshotStart)
 	}
+	smtCommitDuration := time.Since(smtCommitStart)
 
+	setFinalizedStart := time.Now()
 	if err := rm.storage.BlockStorage().SetFinalized(ctx, block.Index, true); err != nil {
 		rm.finalizationMu.Unlock()
 		return fmt.Errorf("failed to set block as finalized: %w", err)
 	}
+	setFinalizedDuration := time.Since(setFinalizedStart)
 	rm.finalizationMu.Unlock()
 	block.Finalized = true
-	metrics.RoundFinalizationDuration.Observe(time.Since(finalizationStartTime).Seconds())
 
+	// Proofs become requestable once the SMT snapshot is committed and the block is marked finalized, so
+	// the finalization metric stops here; the Redis ACK that follows is recovery bookkeeping and is excluded.
+	proofReadyAt := time.Now()
+	metrics.RoundFinalizationDuration.Observe(proofReadyAt.Sub(finalizationStartTime).Seconds())
+
+	ackDuration := time.Duration(0)
 	if len(ackEntries) > 0 {
-		markProcessedStart = time.Now()
+		ackStart := time.Now()
 		if err := rm.commitmentQueue.MarkProcessed(ctx, ackEntries); err != nil {
 			return fmt.Errorf("failed to mark commitments as processed: %w", err)
 		}
-		markProcessedTime = time.Since(markProcessedStart)
+		ackDuration = time.Since(ackStart)
 	}
 
 	rm.roundMutex.Lock()
@@ -496,7 +499,18 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 	rm.roundMutex.Unlock()
 
 	actualFinalizationTime := time.Since(finalizationStartTime)
-	finalizationWorkDuration := markProcessedTime + persistDataTime + commitSnapshotTime
+	proofTimes := make([]time.Duration, 0, len(pendingCommitments))
+	for _, commitment := range pendingCommitments {
+		if commitment.CreatedAt == nil {
+			continue
+		}
+		proofReadyTime := proofReadyAt.Sub(commitment.CreatedAt.Time)
+		if proofReadyTime > 0 {
+			metrics.ProofReadinessDuration.Observe(proofReadyTime.Seconds())
+			proofTimes = append(proofTimes, proofReadyTime)
+		}
+	}
+
 	var totalRoundTime time.Duration
 	var bftWaitTime time.Duration
 	if !proposalTime.IsZero() {
@@ -531,7 +545,17 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 		"roundTime", shortDur(totalRoundTime),
 		"processing", shortDur(processingTime),
 		"bftWait", shortDur(bftWaitTime),
-		"finalization", shortDur(finalizationWorkDuration),
+		"finalization", shortDur(actualFinalizationTime),
+		"finalizeScan", shortDur(finalizationScanDuration),
+		"finalizeConvert", shortDur(finalizationConvertDuration),
+		"finalizeStoreBlock", shortDur(storeBlockDuration),
+		"finalizeStoreData", shortDur(storeDataTiming.total),
+		"finalizeStoreSmt", shortDur(storeDataTiming.smt),
+		"finalizeStoreRecords", shortDur(storeDataTiming.records),
+		"finalizeLockWait", shortDur(finalizationLockWaitDuration),
+		"finalizeSmtCommit", shortDur(smtCommitDuration),
+		"finalizeSetFinalized", shortDur(setFinalizedDuration),
+		"finalizeAck", shortDur(ackDuration),
 	}
 
 	if len(proofTimes) > 0 {
@@ -656,6 +680,12 @@ func (rm *RoundManager) storeBlockAndRecords(ctx context.Context, block *models.
 	})
 }
 
+type storeDataTiming struct {
+	total   time.Duration
+	smt     time.Duration
+	records time.Duration
+}
+
 // storeDataParallel stores SMT nodes and aggregator records in parallel.
 // StoreBatch handles duplicates internally (ignores duplicate key errors).
 func (rm *RoundManager) storeDataParallel(
@@ -663,7 +693,7 @@ func (rm *RoundManager) storeDataParallel(
 	blockNumber *api.BigInt,
 	smtNodes []*models.SmtNode,
 	records []*models.AggregatorRecord,
-) error {
+) (storeDataTiming, error) {
 	start := time.Now()
 
 	var smtErr, recordsErr error
@@ -690,20 +720,26 @@ func (rm *RoundManager) storeDataParallel(
 
 	wg.Wait()
 
+	timing := storeDataTiming{
+		total:   time.Since(start),
+		smt:     smtTime,
+		records: recordsTime,
+	}
+
 	rm.logger.WithContext(ctx).Debug("PARALLEL_TIMING",
 		"block", blockNumber.String(),
 		"storeSmtNodes", smtTime.Milliseconds(),
 		"storeAggRecords", recordsTime.Milliseconds(),
 		"smtCount", len(smtNodes),
 		"recordCount", len(records),
-		"totalMs", time.Since(start).Milliseconds())
+		"totalMs", timing.total.Milliseconds())
 
 	if smtErr != nil {
-		return fmt.Errorf("failed to store SMT nodes: %w", smtErr)
+		return timing, fmt.Errorf("failed to store SMT nodes: %w", smtErr)
 	}
 	if recordsErr != nil {
-		return fmt.Errorf("failed to store aggregator records: %w", recordsErr)
+		return timing, fmt.Errorf("failed to store aggregator records: %w", recordsErr)
 	}
 
-	return nil
+	return timing, nil
 }

From 0d3e4a061bf38cfe7f369a360dfce8e51f47560a Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Tue, 5 May 2026 19:09:15 +0300
Subject: [PATCH 02/13] perf: chunk Mongo finalization inserts

---
 internal/config/config.go                     |  36 +++++-
 internal/config/config_test.go                |  98 ++++++++++++++++
 internal/round/batch_processor.go             |   3 +
 internal/round/round_manager.go               |  20 +++-
 internal/storage/mongodb/aggregator_record.go |  14 ++-
 .../storage/mongodb/aggregator_record_test.go |  49 ++++++++
 internal/storage/mongodb/batch_insert.go      | 109 ++++++++++++++++++
 internal/storage/mongodb/connection.go        |   8 +-
 internal/storage/mongodb/smt.go               |  14 ++-
 internal/storage/mongodb/smt_test.go          |  45 ++++++++
 10 files changed, 370 insertions(+), 26 deletions(-)
 create mode 100644 internal/config/config_test.go
 create mode 100644 internal/storage/mongodb/batch_insert.go

diff --git a/internal/config/config.go b/internal/config/config.go
index 1f6664b..d2f76d7 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -73,6 +73,10 @@ type DatabaseConfig struct {
 	MaxPoolSize            uint64        `mapstructure:"max_pool_size"`
 	MinPoolSize            uint64        `mapstructure:"min_pool_size"`
 	MaxConnIdleTime        time.Duration `mapstructure:"max_conn_idle_time"`
+	// Optional finalization insert chunking. A chunk size of 0 (the default) keeps the existing
+	// single InsertMany behavior; the worker count must be positive whenever chunking is enabled.
+	FinalizationInsertChunkSize    int `mapstructure:"finalization_insert_chunk_size"`
+	FinalizationInsertChunkWorkers int `mapstructure:"finalization_insert_chunk_workers"`
 }
 
 // HAConfig holds High Availability configuration
@@ -103,9 +107,11 @@ type LoggingConfig struct {
 
 // ProcessingConfig holds batch processing configuration
 type ProcessingConfig struct {
-	BatchLimit             int           `mapstructure:"batch_limit"`
-	RoundDuration          time.Duration `mapstructure:"round_duration"`
-	MaxCommitmentsPerRound int           `mapstructure:"max_commitments_per_round"` // Stop waiting once this many commitments collected
+	BatchLimit                 int           `mapstructure:"batch_limit"`
+	RoundDuration              time.Duration `mapstructure:"round_duration"`
+	MaxCommitmentsPerRound     int           `mapstructure:"max_commitments_per_round"`     // Stop waiting once this many commitments collected
+	CollectPhaseDuration       time.Duration `mapstructure:"collect_phase_duration"`        // Fixed collection window before proposing a round (non-child sharding modes only)
+	CommitmentStreamBufferSize int           `mapstructure:"commitment_stream_buffer_size"` // Buffer between queue streamer and round collection
 }
 
 // RedisConfig holds Redis connection configuration
@@ -293,6 +299,10 @@ func Load() (*Config, error) {
 			MaxPoolSize:            uint64(getEnvIntOrDefault("MONGODB_MAX_POOL_SIZE", 100)),
 			MinPoolSize:            uint64(getEnvIntOrDefault("MONGODB_MIN_POOL_SIZE", 5)),
 			MaxConnIdleTime:        getEnvDurationOrDefault("MONGODB_MAX_CONN_IDLE_TIME", "5m"),
+			FinalizationInsertChunkSize: getEnvIntOrDefault(
+				"MONGODB_FINALIZATION_INSERT_CHUNK_SIZE", 0),
+			FinalizationInsertChunkWorkers: getEnvIntOrDefault(
+				"MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS", 1),
 		},
 		HA: HAConfig{
 			Enabled:                       !getEnvBoolOrDefault("DISABLE_HIGH_AVAILABILITY", false),
@@ -317,9 +327,11 @@ func Load() (*Config, error) {
 			CompressBackups: getEnvBoolOrDefault("LOG_COMPRESS_BACKUPS", true),
 		},
 		Processing: ProcessingConfig{
-			BatchLimit:             getEnvIntOrDefault("BATCH_LIMIT", 1000),
-			RoundDuration:          getEnvDurationOrDefault("ROUND_DURATION", "1s"),
-			MaxCommitmentsPerRound: getEnvIntOrDefault("MAX_COMMITMENTS_PER_ROUND", 10000), // Default 10k to keep rounds under 2s
+			BatchLimit:                 getEnvIntOrDefault("BATCH_LIMIT", 1000),
+			RoundDuration:              getEnvDurationOrDefault("ROUND_DURATION", "1s"),
+			MaxCommitmentsPerRound:     getEnvIntOrDefault("MAX_COMMITMENTS_PER_ROUND", 10000), // Default 10k to keep rounds under 2s
+			CollectPhaseDuration:       getEnvDurationOrDefault("COLLECT_PHASE_DURATION", "200ms"),
+			CommitmentStreamBufferSize: getEnvIntOrDefault("COMMITMENT_STREAM_BUFFER_SIZE", 10000),
 		},
 		Redis: RedisConfig{
 			Host:         getEnvOrDefault("REDIS_HOST", "localhost"),
@@ -428,6 +440,18 @@ func (c *Config) Validate() error {
 	if c.Server.HTTP2MaxConcurrentStreams <= 0 {
 		return fmt.Errorf("HTTP/2 max concurrent streams must be positive")
 	}
+	if c.Processing.CommitmentStreamBufferSize <= 0 {
+		return fmt.Errorf("COMMITMENT_STREAM_BUFFER_SIZE must be positive")
+	}
+	if c.Processing.CollectPhaseDuration <= 0 {
+		return fmt.Errorf("COLLECT_PHASE_DURATION must be positive")
+	}
+	if c.Database.FinalizationInsertChunkSize < 0 {
+		return fmt.Errorf("MONGODB_FINALIZATION_INSERT_CHUNK_SIZE must be non-negative")
+	}
+	if c.Database.FinalizationInsertChunkSize > 0 && c.Database.FinalizationInsertChunkWorkers <= 0 {
+		return fmt.Errorf("MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS must be positive when chunking is enabled")
+	}
 
 	// Validate log level
 	validLevels := []string{"debug", "info", "warn", "error", "fatal", "panic"}
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
new file mode 100644
index 0000000..c80c8ae
--- /dev/null
+++ b/internal/config/config_test.go
@@ -0,0 +1,98 @@
+package config
+
+import (
+	"strings"
+	"testing"
+	"time"
+)
+
+func validTestConfig() *Config {
+	return &Config{
+		Server: ServerConfig{
+			Port:                      "3000",
+			HTTP2MaxConcurrentStreams: 1,
+		},
+		Database: DatabaseConfig{
+			URI:                            "mongodb://localhost:27017",
+			Database:                       "aggregator",
+			FinalizationInsertChunkWorkers: 1,
+		},
+		Logging: LoggingConfig{
+			Level: "info",
+		},
+		Sharding: ShardingConfig{
+			Mode:          ShardingModeStandalone,
+			ShardIDLength: 4,
+		},
+		Processing: ProcessingConfig{
+			CommitmentStreamBufferSize: 10000,
+			CollectPhaseDuration:       200 * time.Millisecond,
+		},
+		BFT: BFTConfig{
+			Enabled: false,
+		},
+	}
+}
+
+func TestConfigValidate_FinalizationInsertChunking(t *testing.T) {
+	t.Run("disabled chunking is valid", func(t *testing.T) {
+		cfg := validTestConfig()
+
+		if err := cfg.Validate(); err != nil {
+			t.Fatalf("Validate() returned error: %v", err)
+		}
+	})
+
+	t.Run("negative chunk size is invalid", func(t *testing.T) {
+		cfg := validTestConfig()
+		cfg.Database.FinalizationInsertChunkSize = -1
+
+		err := cfg.Validate()
+		if err == nil {
+			t.Fatal("Validate() expected error, got nil")
+		}
+		if !strings.Contains(err.Error(), "MONGODB_FINALIZATION_INSERT_CHUNK_SIZE") {
+			t.Fatalf("Validate() error = %q, want chunk size env name", err.Error())
+		}
+	})
+
+	t.Run("enabled chunking requires positive workers", func(t *testing.T) {
+		cfg := validTestConfig()
+		cfg.Database.FinalizationInsertChunkSize = 500
+		cfg.Database.FinalizationInsertChunkWorkers = 0
+
+		err := cfg.Validate()
+		if err == nil {
+			t.Fatal("Validate() expected error, got nil")
+		}
+		if !strings.Contains(err.Error(), "MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS") {
+			t.Fatalf("Validate() error = %q, want worker env name", err.Error())
+		}
+	})
+}
+
+func TestConfigValidate_CommitmentStreamBufferSize(t *testing.T) {
+	cfg := validTestConfig()
+	cfg.Processing.CommitmentStreamBufferSize = 0
+
+	err := cfg.Validate()
+	if err == nil {
+		t.Fatal("Validate() expected error, got nil")
+	}
+	if !strings.Contains(err.Error(), "COMMITMENT_STREAM_BUFFER_SIZE") {
+		t.Fatalf("Validate() error = %q, want stream buffer env name", err.Error())
+	}
+}
+
+func TestConfigValidate_CollectPhaseDuration(t *testing.T) {
+	cfg := validTestConfig()
+	cfg.Processing.CollectPhaseDuration = 0
+
+	err := cfg.Validate()
+	if err == nil {
+		t.Fatal("Validate() expected error, got nil")
+	}
+	if !strings.Contains(err.Error(), "COLLECT_PHASE_DURATION") {
+		t.Fatalf("Validate() error = %q, want collect phase env name", err.Error())
+	}
+}
diff --git a/internal/round/batch_processor.go b/internal/round/batch_processor.go
index 6a7e7ed..130c628 100644
--- a/internal/round/batch_processor.go
+++ b/internal/round/batch_processor.go
@@ -583,6 +583,9 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 	logFields = append(logFields,
 		"redisTotal", redisTotal,
 		"redisPending", redisPending,
+		"collectPhaseDuration", rm.config.Processing.CollectPhaseDuration.String(),
+		"streamChannelSize", len(rm.commitmentStream),
+		"streamChannelCapacity", cap(rm.commitmentStream),
 	)
 	if proofWaitDuration > 0 {
 		logFields = append(logFields,
diff --git a/internal/round/round_manager.go b/internal/round/round_manager.go
index 2254ccf..8c28924 100644
--- a/internal/round/round_manager.go
+++ b/internal/round/round_manager.go
@@ -164,6 +164,11 @@ func NewRoundManager(
 	threadSafeSmt *smt.ThreadSafeSMT,
 	trustBaseProvider interfaces.TrustBaseProvider,
 ) (*RoundManager, error) {
+	commitmentStreamBufferSize := cfg.Processing.CommitmentStreamBufferSize
+	if commitmentStreamBufferSize <= 0 {
+		commitmentStreamBufferSize = 10000
+	}
+
 	rm := &RoundManager{
 		config:              cfg,
 		logger:              logger,
@@ -173,11 +178,11 @@ func NewRoundManager(
 		rootClient:          rootAggregatorClient,
 		stateTracker:        stateTracker,
 		eventBus:            eventBus,
-		roundDuration:       cfg.Processing.RoundDuration,                   // Configurable round duration (default 1s)
-		commitmentStream:    make(chan *models.CertificationRequest, 10000), // Reasonable buffer for streaming
-		avgProcessingRate:   1.0,                                            // Initial estimate: 1 commitment per ms
-		avgFinalizationTime: 200 * time.Millisecond,                         // Initial estimate (conservative)
-		avgSMTUpdateTime:    5 * time.Millisecond,                           // Initial estimate per batch
+		roundDuration:       cfg.Processing.RoundDuration,                                        // Configurable round duration (default 1s)
+		commitmentStream:    make(chan *models.CertificationRequest, commitmentStreamBufferSize), // Buffer for queue streamer
+		avgProcessingRate:   1.0,                                                                 // Initial estimate: 1 commitment per ms
+		avgFinalizationTime: 200 * time.Millisecond,                                              // Initial estimate (conservative)
+		avgSMTUpdateTime:    5 * time.Millisecond,                                                // Initial estimate per batch
 	}
 
 	if rm.storage != nil && rm.storage.SmtStorage() != nil {
@@ -457,7 +462,10 @@ func (rm *RoundManager) processRound(ctx context.Context) error {
 	rm.roundMutex.Unlock()
 
 	if !rm.config.Sharding.Mode.IsChild() {
-		collectDuration := 200 * time.Millisecond
+		collectDuration := rm.config.Processing.CollectPhaseDuration
+		if collectDuration <= 0 {
+			collectDuration = 200 * time.Millisecond
+		}
 		deadline := time.Now().Add(collectDuration)
 
 		for time.Now().Before(deadline) {
diff --git a/internal/storage/mongodb/aggregator_record.go b/internal/storage/mongodb/aggregator_record.go
index c79b231..116eabd 100644
--- a/internal/storage/mongodb/aggregator_record.go
+++ b/internal/storage/mongodb/aggregator_record.go
@@ -18,12 +18,18 @@ const aggregatorRecordCollection = "aggregator_records"
 // AggregatorRecordStorage implements aggregator record storage for MongoDB
 type AggregatorRecordStorage struct {
 	collection *mongo.Collection
+	insertOpts finalizationInsertOptions
 }
 
 // NewAggregatorRecordStorage creates a new aggregator record storage instance
-func NewAggregatorRecordStorage(db *mongo.Database) *AggregatorRecordStorage {
+func NewAggregatorRecordStorage(db *mongo.Database, insertOpts ...finalizationInsertOptions) *AggregatorRecordStorage {
+	opts := finalizationInsertOptions{workers: 1}
+	if len(insertOpts) > 0 {
+		opts = insertOpts[0]
+	}
 	return &AggregatorRecordStorage{
 		collection: db.Collection(aggregatorRecordCollection),
+		insertOpts: opts,
 	}
 }
 
@@ -55,12 +61,8 @@ func (ars *AggregatorRecordStorage) StoreBatch(ctx context.Context, records []*m
 		docs[i] = recordBSON
 	}
 
-	_, err := ars.collection.InsertMany(ctx, docs, options.InsertMany().SetOrdered(false))
+	err := insertManyFinalizationBatch(ctx, ars.collection, docs, ars.insertOpts)
 	if err != nil {
-		// Ignore duplicate key errors - with SetOrdered(false), non-duplicates are still inserted
-		if mongo.IsDuplicateKeyError(err) {
-			return nil
-		}
 		return fmt.Errorf("failed to store aggregator records batch: %w", err)
 	}
 	return nil
diff --git a/internal/storage/mongodb/aggregator_record_test.go b/internal/storage/mongodb/aggregator_record_test.go
index 37c9ca9..0d64764 100644
--- a/internal/storage/mongodb/aggregator_record_test.go
+++ b/internal/storage/mongodb/aggregator_record_test.go
@@ -155,6 +155,55 @@ func TestAggregatorRecordStorage_StoreBatch_DuplicateHandling(t *testing.T) {
 	assert.Equal(t, int64(5), count, "Should have exactly 5 records now")
 }
 
+func TestAggregatorRecordStorage_StoreBatch_ChunkedDuplicateHandling(t *testing.T) {
+	db := setupAggregatorRecordTestDB(t)
+
+	storage := NewAggregatorRecordStorage(db, finalizationInsertOptions{
+		chunkSize: 2,
+		workers:   2,
+	})
+	ctx := context.Background()
+
+	err := storage.CreateIndexes(ctx)
+	require.NoError(t, err, "CreateIndexes should not return an error")
+
+	state1 := testStateIDHex("01")
+	state2 := testStateIDHex("02")
+	state3 := testStateIDHex("03")
+	state4 := testStateIDHex("04")
+	state5 := testStateIDHex("05")
+
+	initialRecords := []*models.AggregatorRecord{
+		createTestAggregatorRecord(state1, 1, 0),
+		createTestAggregatorRecord(state2, 1, 1),
+		createTestAggregatorRecord(state3, 1, 2),
+	}
+	mixedRecords := []*models.AggregatorRecord{
+		createTestAggregatorRecord(state1, 1, 0),
+		createTestAggregatorRecord(state4, 1, 3),
+		createTestAggregatorRecord(state2, 1, 1),
+		createTestAggregatorRecord(state5, 1, 4),
+	}
+
+	err = storage.StoreBatch(ctx, initialRecords)
+	require.NoError(t, err, "Initial StoreBatch should not return an error")
+
+	err = storage.StoreBatch(ctx, mixedRecords)
+	require.NoError(t, err, "Chunked StoreBatch should ignore duplicate key errors")
+
+	count, err := storage.Count(ctx)
+	require.NoError(t, err, "Count should not return an error")
+	assert.Equal(t, int64(5), count, "Should have exactly 5 unique records")
+
+	record4, err := storage.GetByStateID(ctx, api.RequireNewImprintV2(state4))
+	require.NoError(t, err, "GetByStateID should not return an error for state4")
+	require.NotNil(t, record4, "state4 should be inserted from a chunk with a duplicate")
+
+	record5, err := storage.GetByStateID(ctx, api.RequireNewImprintV2(state5))
+	require.NoError(t, err, "GetByStateID should not return an error for state5")
+	require.NotNil(t, record5, "state5 should be inserted from a chunk with a duplicate")
+}
+
 func TestAggregatorRecordStorage_GetByBlockNumber(t *testing.T) {
 	db := setupAggregatorRecordTestDB(t)
 	storage := NewAggregatorRecordStorage(db)
diff --git a/internal/storage/mongodb/batch_insert.go b/internal/storage/mongodb/batch_insert.go
new file mode 100644
index 0000000..ae55d8c
--- /dev/null
+++ b/internal/storage/mongodb/batch_insert.go
@@ -0,0 +1,149 @@
+package mongodb
+
+import (
+	"context"
+	"errors"
+	"sync"
+
+	"go.mongodb.org/mongo-driver/mongo"
+	"go.mongodb.org/mongo-driver/mongo/options"
+)
+
+// finalizationInsertOptions controls how insertManyFinalizationBatch splits a
+// batch into InsertMany calls.
+type finalizationInsertOptions struct {
+	// chunkSize caps the documents per InsertMany call; values <= 0 or >= the
+	// batch length send the whole batch in a single call.
+	chunkSize int
+	// workers is the number of goroutines inserting chunks concurrently; values
+	// <= 1 insert the chunks sequentially on the calling goroutine.
+	workers int
+}
+
+// insertManyFinalizationBatch inserts docs into collection using unordered
+// InsertMany calls, split into chunks of opts.chunkSize documents and spread
+// over at most opts.workers goroutines.
+//
+// Failures caused only by duplicate keys are ignored (see
+// ignoreDuplicateInsertError). Otherwise the first recorded error, including
+// context cancellation, is returned and no further chunks are queued.
+func insertManyFinalizationBatch(
+	ctx context.Context,
+	collection *mongo.Collection,
+	docs []interface{},
+	opts finalizationInsertOptions,
+) error {
+	if len(docs) == 0 {
+		return nil
+	}
+	if opts.chunkSize <= 0 || opts.chunkSize >= len(docs) {
+		return ignoreDuplicateInsertError(collection.InsertMany(ctx, docs, options.InsertMany().SetOrdered(false)))
+	}
+	if opts.workers <= 1 {
+		for start := 0; start < len(docs); start += opts.chunkSize {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			end := min(start+opts.chunkSize, len(docs))
+			if err := ignoreDuplicateInsertError(collection.InsertMany(ctx, docs[start:end], options.InsertMany().SetOrdered(false))); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+
+	type chunk struct {
+		start int
+		end   int
+	}
+
+	chunkCount := (len(docs) + opts.chunkSize - 1) / opts.chunkSize
+	workers := min(opts.workers, chunkCount)
+	jobs := make(chan chunk)
+
+	var wg sync.WaitGroup
+	var errMu sync.Mutex
+	var firstErr error
+	setFirstErr := func(err error) {
+		if err == nil {
+			return
+		}
+		errMu.Lock()
+		defer errMu.Unlock()
+		if firstErr == nil {
+			firstErr = err
+		}
+	}
+	getFirstErr := func() error {
+		errMu.Lock()
+		defer errMu.Unlock()
+		return firstErr
+	}
+
+	for range workers {
+		wg.Go(func() {
+			for job := range jobs {
+				if err := ctx.Err(); err != nil {
+					// After cancellation skip the insert but keep receiving
+					// until jobs is closed.
+					setFirstErr(err)
+					continue
+				}
+				err := ignoreDuplicateInsertError(collection.InsertMany(ctx, docs[job.start:job.end], options.InsertMany().SetOrdered(false)))
+				setFirstErr(err)
+			}
+		})
+	}
+
+queue:
+	for start := 0; start < len(docs); start += opts.chunkSize {
+		if err := ctx.Err(); err != nil {
+			setFirstErr(err)
+			break
+		}
+		if getFirstErr() != nil {
+			break
+		}
+		select {
+		case jobs <- chunk{start: start, end: min(start+opts.chunkSize, len(docs))}:
+		case <-ctx.Done():
+			setFirstErr(ctx.Err())
+			break queue
+		}
+	}
+	close(jobs)
+	wg.Wait()
+
+	return getFirstErr()
+}
+
+// ignoreDuplicateInsertError turns the result of an unordered InsertMany into
+// nil when the call succeeded or failed solely because of duplicate keys.
+//
+// mongo.IsDuplicateKeyError reports true for a BulkWriteException as soon as
+// any one of its write errors is a duplicate, so a bulk failure is inspected
+// entry by entry: a write concern error or any non-duplicate write error is
+// returned to the caller instead of being swallowed.
+func ignoreDuplicateInsertError(_ *mongo.InsertManyResult, err error) error {
+	if err == nil {
+		return nil
+	}
+
+	var bulkErr mongo.BulkWriteException
+	if !errors.As(err, &bulkErr) {
+		if mongo.IsDuplicateKeyError(err) {
+			return nil
+		}
+		return err
+	}
+
+	if bulkErr.WriteConcernError != nil || len(bulkErr.WriteErrors) == 0 {
+		return err
+	}
+	for _, writeErr := range bulkErr.WriteErrors {
+		if !mongo.IsDuplicateKeyError(writeErr.WriteError) {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/internal/storage/mongodb/connection.go b/internal/storage/mongodb/connection.go
index 18fd4a6..f00ff1d 100644
--- a/internal/storage/mongodb/connection.go
+++ b/internal/storage/mongodb/connection.go
@@ -72,12 +72,16 @@ func NewStorage(ctx context.Context, config config.Config) (*Storage, error) {
 		database: database,
 		config:   &cfg,
 	}
+	finalizationInsertOpts := finalizationInsertOptions{
+		chunkSize: cfg.FinalizationInsertChunkSize,
+		workers:   cfg.FinalizationInsertChunkWorkers,
+	}
 
 	// Initialize storage implementations
 	storage.commitmentStorage = NewCommitmentStorage(database)
-	storage.aggregatorRecordStorage = NewAggregatorRecordStorage(database)
+	storage.aggregatorRecordStorage = NewAggregatorRecordStorage(database, finalizationInsertOpts)
 	storage.blockStorage = NewBlockStorage(database)
-	storage.smtStorage = NewSmtStorage(database)
+	storage.smtStorage = NewSmtStorage(database, finalizationInsertOpts)
 	storage.blockRecordsStorage = NewBlockRecordsStorage(database)
 	storage.leadershipStorage = NewLeadershipStorage(database, config.HA.LockTTLSeconds)
 	storage.cachedTrustBaseStorage = NewCachedTrustBaseStorage(NewTrustBaseStorage(database))
diff --git a/internal/storage/mongodb/smt.go b/internal/storage/mongodb/smt.go
index e77606d..ca71493 100644
--- a/internal/storage/mongodb/smt.go
+++ b/internal/storage/mongodb/smt.go
@@ -18,12 +18,18 @@ const smtCollection = "smt_nodes"
 // SmtStorage implements SMT storage for MongoDB
 type SmtStorage struct {
 	collection *mongo.Collection
+	insertOpts finalizationInsertOptions
 }
 
 // NewSmtStorage creates a new SMT storage instance
-func NewSmtStorage(db *mongo.Database) *SmtStorage {
+func NewSmtStorage(db *mongo.Database, insertOpts ...finalizationInsertOptions) *SmtStorage {
+	opts := finalizationInsertOptions{workers: 1}
+	if len(insertOpts) > 0 {
+		opts = insertOpts[0]
+	}
 	return &SmtStorage{
 		collection: db.Collection(smtCollection),
+		insertOpts: opts,
 	}
 }
 
@@ -82,12 +88,8 @@ func (ss *SmtStorage) StoreBatch(ctx context.Context, nodes []*models.SmtNode) e
 		docs[i] = node.ToBSON()
 	}
 
-	_, err := ss.collection.InsertMany(ctx, docs, options.InsertMany().SetOrdered(false))
+	err := insertManyFinalizationBatch(ctx, ss.collection, docs, ss.insertOpts)
 	if err != nil {
-		// Ignore duplicate key errors - with SetOrdered(false), non-duplicates are still inserted
-		if mongo.IsDuplicateKeyError(err) {
-			return nil
-		}
 		return fmt.Errorf("failed to store SMT nodes batch: %w", err)
 	}
 	return nil
diff --git a/internal/storage/mongodb/smt_test.go b/internal/storage/mongodb/smt_test.go
index 3125bf5..043435f 100644
--- a/internal/storage/mongodb/smt_test.go
+++ b/internal/storage/mongodb/smt_test.go
@@ -470,6 +470,51 @@ func TestSmtStorage_StoreBatch_DuplicateHandling(t *testing.T) {
 	require.Equal(t, []byte("newvalue"), []byte(newNode.Value), "New node should have correct value")
 }
 
+func TestSmtStorage_StoreBatch_ChunkedDuplicateHandling(t *testing.T) {
+	db := setupSmtTestDB(t)
+	storage := NewSmtStorage(db, finalizationInsertOptions{
+		chunkSize: 2,
+		workers:   2,
+	})
+	ctx := context.Background()
+
+	err := storage.CreateIndexes(ctx)
+	require.NoError(t, err, "CreateIndexes should not return an error")
+
+	initialNodes := createTestSmtNodes(3)
+	newNodes := createTestSmtNodes(5)
+	mixedNodes := []*models.SmtNode{
+		models.NewSmtNode(initialNodes[0].Key, []byte("duplicate_value_0")),
+		newNodes[3],
+		models.NewSmtNode(initialNodes[1].Key, []byte("duplicate_value_1")),
+		newNodes[4],
+	}
+
+	err = storage.StoreBatch(ctx, initialNodes)
+	require.NoError(t, err, "Initial StoreBatch should not return an error")
+
+	err = storage.StoreBatch(ctx, mixedNodes)
+	require.NoError(t, err, "Chunked StoreBatch should ignore duplicate key errors")
+
+	count, err := storage.Count(ctx)
+	require.NoError(t, err, "Count should not return an error")
+	require.Equal(t, int64(5), count, "Should have exactly 5 unique SMT nodes")
+
+	for i := 0; i < 2; i++ {
+		storedNode, err := storage.GetByKey(ctx, initialNodes[i].Key)
+		require.NoError(t, err, "GetByKey should not return an error for original node %d", i)
+		require.NotNil(t, storedNode, "Original node %d should still exist", i)
+		require.Equal(t, initialNodes[i].Value, storedNode.Value, "Duplicate chunk insert should not overwrite original node %d", i)
+	}
+
+	for _, node := range []*models.SmtNode{newNodes[3], newNodes[4]} {
+		storedNode, err := storage.GetByKey(ctx, node.Key)
+		require.NoError(t, err, "GetByKey should not return an error for new node")
+		require.NotNil(t, storedNode, "New node should be inserted from a chunk with a duplicate")
+		require.Equal(t, node.Value, storedNode.Value)
+	}
+}
+
 func TestSmtStorage_GetByKeys(t *testing.T) {
 	db := setupSmtTestDB(t)
 	storage := NewSmtStorage(db)

From 52f862ad193184cc971d0ca7c5e91fdc3b80968d Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Wed, 6 May 2026 11:04:28 +0300
Subject: [PATCH 03/13] remove unused indexes

---
 internal/storage/mongodb/aggregator_record.go | 28 +++---
 .../storage/mongodb/aggregator_record_test.go | 19 ++++
 internal/storage/mongodb/block_records.go     |  9 +-
 internal/storage/mongodb/index_test.go        | 97 +++++++++++++++++++
 internal/storage/mongodb/smt.go               |  5 +-
 scripts/mongo-init.js                         |  7 --
 6 files changed, 134 insertions(+), 31 deletions(-)
 create mode 100644 internal/storage/mongodb/index_test.go

diff --git a/internal/storage/mongodb/aggregator_record.go b/internal/storage/mongodb/aggregator_record.go
index 116eabd..51df8f5 100644
--- a/internal/storage/mongodb/aggregator_record.go
+++ b/internal/storage/mongodb/aggregator_record.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"sort"
 
 	"go.mongodb.org/mongo-driver/bson"
 	"go.mongodb.org/mongo-driver/mongo"
@@ -133,6 +134,18 @@ func (ars *AggregatorRecordStorage) GetByBlockNumber(ctx context.Context, blockN
 		return nil, fmt.Errorf("cursor error: %w", err)
 	}
 
+	sort.SliceStable(records, func(i, j int) bool {
+		left := records[i].LeafIndex
+		right := records[j].LeafIndex
+		if left == nil || left.Int == nil {
+			return right != nil && right.Int != nil
+		}
+		if right == nil || right.Int == nil {
+			return false
+		}
+		return left.Int.Cmp(right.Int) < 0
+	})
+
 	return records, nil
 }
 
@@ -168,7 +181,8 @@ func (ars *AggregatorRecordStorage) Count(ctx context.Context) (int64, error) {
 	return count, nil
 }
 
-// CreateIndexes creates necessary indexes for the aggregator record collection
+// CreateIndexes creates the indexes needed by the submit, proof, and
+// block-record lookup paths.
 func (ars *AggregatorRecordStorage) CreateIndexes(ctx context.Context) error {
 	indexes := []mongo.IndexModel{
 		{
@@ -178,18 +192,6 @@ func (ars *AggregatorRecordStorage) CreateIndexes(ctx context.Context) error {
 		{
 			Keys: bson.D{{Key: "blockNumber", Value: 1}},
 		},
-		{
-			Keys: bson.D{{Key: "leafIndex", Value: 1}},
-		},
-		{
-			Keys: bson.D{{Key: "finalizedAt", Value: -1}},
-		},
-		{
-			Keys: bson.D{
-				{Key: "blockNumber", Value: 1},
-				{Key: "leafIndex", Value: 1},
-			},
-		},
 	}
 
 	_, err := ars.collection.Indexes().CreateMany(ctx, indexes)
diff --git a/internal/storage/mongodb/aggregator_record_test.go b/internal/storage/mongodb/aggregator_record_test.go
index 0d64764..853adc7 100644
--- a/internal/storage/mongodb/aggregator_record_test.go
+++ b/internal/storage/mongodb/aggregator_record_test.go
@@ -256,6 +256,25 @@ func TestAggregatorRecordStorage_GetByBlockNumber(t *testing.T) {
 		require.True(t, stateIDs[state0103])
 	})
 
+	t.Run("should return records ordered by leaf index", func(t *testing.T) {
+		unorderedRecords := []*models.AggregatorRecord{
+			createTestAggregatorRecord("0203", 102, 2),
+			createTestAggregatorRecord("0201", 102, 0),
+			createTestAggregatorRecord("0202", 102, 1),
+		}
+		err := storage.StoreBatch(ctx, unorderedRecords)
+		require.NoError(t, err, "StoreBatch should not return an error")
+
+		blockNum := api.NewBigInt(big.NewInt(102))
+		retrieved, err := storage.GetByBlockNumber(ctx, blockNum)
+		require.NoError(t, err)
+		require.Len(t, retrieved, 3)
+
+		require.Equal(t, "0", retrieved[0].LeafIndex.String())
+		require.Equal(t, "1", retrieved[1].LeafIndex.String())
+		require.Equal(t, "2", retrieved[2].LeafIndex.String())
+	})
+
 	t.Run("should return empty slice for non-existent block number", func(t *testing.T) {
 		blockNum := api.NewBigInt(big.NewInt(999))
 		retrieved, err := storage.GetByBlockNumber(ctx, blockNum)
diff --git a/internal/storage/mongodb/block_records.go b/internal/storage/mongodb/block_records.go
index 54d99cb..bd952e3 100644
--- a/internal/storage/mongodb/block_records.go
+++ b/internal/storage/mongodb/block_records.go
@@ -151,19 +151,14 @@ func (brs *BlockRecordsStorage) GetLatestBlockNumber(ctx context.Context) (*api.
 	return api.NewBigInt(blockNumber), nil
 }
 
-// CreateIndexes creates necessary indexes for the block records collection
+// CreateIndexes creates the indexes needed by block recovery and HA
+// block sync.
 func (brs *BlockRecordsStorage) CreateIndexes(ctx context.Context) error {
 	indexes := []mongo.IndexModel{
 		{
 			Keys:    bson.D{{Key: "blockNumber", Value: 1}},
 			Options: options.Index().SetUnique(true),
 		},
-		{
-			Keys: bson.D{{Key: "stateIds", Value: 1}},
-		},
-		{
-			Keys: bson.D{{Key: "createdAt", Value: -1}},
-		},
 	}
 
 	_, err := brs.collection.Indexes().CreateMany(ctx, indexes)
diff --git a/internal/storage/mongodb/index_test.go b/internal/storage/mongodb/index_test.go
new file mode 100644
index 0000000..f5f8412
--- /dev/null
+++ b/internal/storage/mongodb/index_test.go
@@ -0,0 +1,97 @@
+package mongodb
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/mongo"
+)
+
+type mongoIndexSpec struct {
+	Name   string `bson:"name"`
+	Key    bson.D `bson:"key"`
+	Unique bool   `bson:"unique,omitempty"`
+}
+
+func listIndexSpecsByName(t *testing.T, ctx context.Context, collection *mongo.Collection) map[string]mongoIndexSpec {
+	t.Helper()
+
+	cursor, err := collection.Indexes().List(ctx)
+	require.NoError(t, err)
+	defer cursor.Close(ctx)
+
+	indexes := make(map[string]mongoIndexSpec)
+	for cursor.Next(ctx) {
+		var index mongoIndexSpec
+		require.NoError(t, cursor.Decode(&index))
+		indexes[index.Name] = index
+	}
+	require.NoError(t, cursor.Err())
+
+	return indexes
+}
+
+func requireIndexNames(t *testing.T, indexes map[string]mongoIndexSpec, expected ...string) {
+	t.Helper()
+
+	actual := make([]string, 0, len(indexes))
+	for name := range indexes {
+		actual = append(actual, name)
+	}
+	assert.ElementsMatch(t, expected, actual)
+}
+
+func requireIndexKey(t *testing.T, index mongoIndexSpec, expected bson.D) {
+	t.Helper()
+
+	require.Len(t, index.Key, len(expected))
+	for i := range expected {
+		assert.Equal(t, expected[i].Key, index.Key[i].Key)
+		assert.Equal(t, fmt.Sprint(expected[i].Value), fmt.Sprint(index.Key[i].Value))
+	}
+}
+
+func TestAggregatorRecordStorage_CreateIndexes_ProductionIndexSet(t *testing.T) {
+	db := setupTestDB(t)
+	storage := NewAggregatorRecordStorage(db)
+	ctx := context.Background()
+
+	require.NoError(t, storage.CreateIndexes(ctx))
+
+	indexes := listIndexSpecsByName(t, ctx, storage.collection)
+	requireIndexNames(t, indexes, "_id_", "stateId_1", "blockNumber_1")
+	requireIndexKey(t, indexes["stateId_1"], bson.D{{Key: "stateId", Value: 1}})
+	requireIndexKey(t, indexes["blockNumber_1"], bson.D{{Key: "blockNumber", Value: 1}})
+	assert.True(t, indexes["stateId_1"].Unique, "stateId must stay unique for proof lookup and duplicate protection")
+	assert.False(t, indexes["blockNumber_1"].Unique, "multiple records belong to the same block")
+}
+
+func TestBlockRecordsStorage_CreateIndexes_ProductionIndexSet(t *testing.T) {
+	db := setupTestDB(t)
+	storage := NewBlockRecordsStorage(db)
+	ctx := context.Background()
+
+	require.NoError(t, storage.CreateIndexes(ctx))
+
+	indexes := listIndexSpecsByName(t, ctx, storage.collection)
+	requireIndexNames(t, indexes, "_id_", "blockNumber_1")
+	requireIndexKey(t, indexes["blockNumber_1"], bson.D{{Key: "blockNumber", Value: 1}})
+	assert.True(t, indexes["blockNumber_1"].Unique, "one block_records document is stored per block")
+}
+
+func TestSmtStorage_CreateIndexes_ProductionIndexSet(t *testing.T) {
+	db := setupTestDB(t)
+	storage := NewSmtStorage(db)
+	ctx := context.Background()
+
+	require.NoError(t, storage.CreateIndexes(ctx))
+
+	indexes := listIndexSpecsByName(t, ctx, storage.collection)
+	requireIndexNames(t, indexes, "_id_", "key_1")
+	requireIndexKey(t, indexes["key_1"], bson.D{{Key: "key", Value: 1}})
+	assert.True(t, indexes["key_1"].Unique, "SMT node keys must stay unique")
+}
diff --git a/internal/storage/mongodb/smt.go b/internal/storage/mongodb/smt.go
index ca71493..f443515 100644
--- a/internal/storage/mongodb/smt.go
+++ b/internal/storage/mongodb/smt.go
@@ -279,16 +279,13 @@ func (ss *SmtStorage) GetChunked(ctx context.Context, offset, limit int) ([]*mod
 	return nodes, nil
 }
 
-// CreateIndexes creates necessary indexes for the SMT collection
+// CreateIndexes creates the index needed by SMT node lookup.
 func (ss *SmtStorage) CreateIndexes(ctx context.Context) error {
 	indexes := []mongo.IndexModel{
 		{
 			Keys:    bson.D{{Key: "key", Value: 1}},
 			Options: options.Index().SetUnique(true),
 		},
-		{
-			Keys: bson.D{{Key: "createdAt", Value: -1}},
-		},
 	}
 
 	_, err := ss.collection.Indexes().CreateMany(ctx, indexes)
diff --git a/scripts/mongo-init.js b/scripts/mongo-init.js
index 102ee50..8233a9b 100644
--- a/scripts/mongo-init.js
+++ b/scripts/mongo-init.js
@@ -20,9 +20,6 @@ db.commitments.createIndex({ processedAt: 1, createdAt: 1 });
 db.createCollection('aggregator_records');
 db.aggregator_records.createIndex({ stateId: 1 }, { unique: true });
 db.aggregator_records.createIndex({ blockNumber: 1 });
-db.aggregator_records.createIndex({ leafIndex: 1 });
-db.aggregator_records.createIndex({ finalizedAt: -1 });
-db.aggregator_records.createIndex({ blockNumber: 1, leafIndex: 1 });
 
 // Blocks collection
 db.createCollection('blocks');
@@ -33,14 +30,10 @@ db.blocks.createIndex({ chainId: 1 });
 // SMT nodes collection
 db.createCollection('smt_nodes');
 db.smt_nodes.createIndex({ key: 1 }, { unique: true });
-db.smt_nodes.createIndex({ hash: 1 });
-db.smt_nodes.createIndex({ createdAt: -1 });
 
 // Block records collection
 db.createCollection('block_records');
 db.block_records.createIndex({ blockNumber: 1 }, { unique: true });
-db.block_records.createIndex({ stateIds: 1 });
-db.block_records.createIndex({ createdAt: -1 });
 
 // Leadership collection
 db.createCollection('leadership');

From 6e77bb569f6d506d5cb1d6cbc75e4b23c03513c1 Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 11:59:44 +0300
Subject: [PATCH 04/13] perf: improve performance test gateway support

---
 cmd/performance-test/main.go  | 595 +++++++++++++++++++---------------
 cmd/performance-test/types.go | 188 +++++++----
 2 files changed, 461 insertions(+), 322 deletions(-)

diff --git a/cmd/performance-test/main.go b/cmd/performance-test/main.go
index 195b043..d4b9307 100644
--- a/cmd/performance-test/main.go
+++ b/cmd/performance-test/main.go
@@ -31,15 +31,17 @@ import (
 const (
 	defaultAggregatorURL     = "https://localhost:3000"
 	defaultTestDuration      = 30 * time.Second
-	workerCount              = 20
-	proofWorkerCount         = 10 // Separate worker pool for proof requests
-	httpClientPoolSize       = 4
+	defaultWorkerCount       = 20
+	defaultProofWorkerCount  = 10 // Separate worker pool for proof requests
+	defaultHTTPClientPool    = 4
 	defaultRequestsPerSec    = 2000
 	aggregatorLogPath        = "logs/aggregator.log"
 	aggregatorLogPathsEnv    = "AGGREGATOR_LOG_PATHS"
 	proofMaxRetries          = 10
-	proofRetryDelay          = 1000 * time.Millisecond
+	defaultProofRetryDelay   = 1000 * time.Millisecond
 	defaultProofInitialDelay = 2500 * time.Millisecond
+	defaultStartupProbeWait  = 60 * time.Second
+	startupProbeInterval     = 250 * time.Millisecond
 )
 
 // Sharding modes for routing generated state IDs to shard endpoints.
@@ -52,12 +54,17 @@ const (
 
 // Configurable via environment variables
 var (
-	testDuration      = getEnvDuration("TEST_DURATION", defaultTestDuration)
-	requestsPerSec    = getEnvInt("REQUESTS_PER_SEC", defaultRequestsPerSec)
-	proofInitialDelay = getEnvDuration("PROOF_INITIAL_DELAY", defaultProofInitialDelay)
-	shardingMode      = getShardingMode()
-	shardTargets      = getEnvShardTargets()
-	enableH2C         = os.Getenv("ENABLE_H2C") != "false"
+	testDuration       = getEnvDuration("TEST_DURATION", defaultTestDuration)
+	requestsPerSec     = getEnvInt("REQUESTS_PER_SEC", defaultRequestsPerSec)
+	workerCount        = getEnvInt("SUBMISSION_WORKERS", defaultWorkerCount)
+	proofWorkerCount   = getEnvInt("PROOF_WORKERS", defaultProofWorkerCount)
+	httpClientPoolSize = getEnvInt("HTTP_CLIENT_POOL_SIZE", defaultHTTPClientPool)
+	proofRetryDelay    = getEnvDuration("PROOF_RETRY_DELAY", defaultProofRetryDelay)
+	proofInitialDelay  = getEnvDuration("PROOF_INITIAL_DELAY", defaultProofInitialDelay)
+	startupProbeWait   = getEnvDuration("STARTUP_PROBE_WAIT", defaultStartupProbeWait)
+	shardingMode       = getShardingMode()
+	shardTargets       = getEnvShardTargets()
+	enableH2C          = os.Getenv("ENABLE_H2C") != "false"
 )
 
 func getShardingMode() string {
@@ -95,6 +102,64 @@ func getEnvDuration(key string, defaultVal time.Duration) time.Duration {
 	return defaultVal
 }
 
+func waitForStartingBlock(sc *ShardClient, timeout time.Duration) (int64, error) {
+	deadline := time.Now().Add(timeout)
+	var lastErr error
+
+	for attempt := 1; ; attempt++ {
+		startingBlock, err := getStartingBlock(sc)
+		if err == nil {
+			if attempt > 1 {
+				fmt.Printf("✓ get_block_height succeeded for %s after %d attempts\n", sc.url, attempt)
+			}
+			return startingBlock, nil
+		}
+
+		lastErr = err
+		if time.Now().Add(startupProbeInterval).After(deadline) {
+			return 0, lastErr
+		}
+
+		if attempt == 1 || attempt%8 == 0 {
+			fmt.Printf("Waiting for get_block_height from %s: %v\n", sc.url, err)
+		}
+		time.Sleep(startupProbeInterval)
+	}
+}
+
+func getStartingBlock(sc *ShardClient) (int64, error) {
+	// Include shardId param so gateway proxies can route the request.
+	var blockHeightParams interface{}
+	if shardingMode == shardingModeBFT && sc.shardBits != "" {
+		blockHeightParams = map[string]interface{}{"shardId": sc.shardBits}
+	} else if sc.shardMask > 0 {
+		blockHeightParams = map[string]interface{}{"shardId": fmt.Sprintf("%d", sc.shardMask)}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	resp, err := sc.client.callWithContext(ctx, "get_block_height", blockHeightParams)
+	if err != nil {
+		return 0, err
+	}
+	if resp.Error != nil {
+		return 0, fmt.Errorf("JSON-RPC error: %s", resp.Error.Message)
+	}
+
+	var heightResp GetBlockHeightResponse
+	respBytes, _ := json.Marshal(resp.Result)
+	if err := json.Unmarshal(respBytes, &heightResp); err != nil {
+		return 0, fmt.Errorf("parse block height response: %w", err)
+	}
+
+	var startingBlockNumber int64
+	if _, err := fmt.Sscanf(heightResp.BlockNumber, "%d", &startingBlockNumber); err != nil {
+		return 0, fmt.Errorf("parse starting block number %q: %w", heightResp.BlockNumber, err)
+	}
+	return startingBlockNumber, nil
+}
+
 // getEnvShardTargets parses SHARD_TARGETS. Each comma-separated entry is
 // URL:mask where mask is interpreted by the active sharding mode (decimal
 // sentinel-int in app mode, MSB-first binary in bft-shard mode). The last colon
@@ -486,6 +551,7 @@ func commitmentWorker(ctx context.Context, shardClients []*ShardClient, metrics
 					if sm := metrics.shard(shardIdx); sm != nil {
 						sm.failedRequests.Add(1)
 					}
+					metrics.recordError(fmt.Sprintf("submit API error (code %d): %s", resp.Error.Code, resp.Error.Message))
 					if resp.Error.Message == "STATE_ID_EXISTS" {
 						atomic.AddInt64(&metrics.stateIdExistsErr, 1)
 						if sm := metrics.shard(shardIdx); sm != nil {
@@ -521,9 +587,11 @@ func commitmentWorker(ctx context.Context, shardClients []*ShardClient, metrics
 					metrics.submittedStateIDs.Store(stateIDStr, true)
 
 					if proofQueue != nil {
-						metrics.recordSubmissionTimestamp(stateIDStr)
+						submittedAt := time.Now()
+						firstProofAt := submittedAt.Add(proofInitialDelay)
+						metrics.recordSubmissionTimestamp(stateIDStr, submittedAt)
 						select {
-						case proofQueue <- proofJob{shardIdx: shardIdx, request: req}:
+						case proofQueue <- proofJob{shardIdx: shardIdx, request: req, submittedAt: submittedAt, firstProofAt: firstProofAt}:
 						default:
 							// Queue full, skip proof verification for this one
 						}
@@ -542,148 +610,204 @@ func commitmentWorker(ctx context.Context, shardClients []*ShardClient, metrics
 	}
 }
 
-// Worker function that continuously verifies inclusion proofs in a sharded setup.
-func proofVerificationWorker(ctx context.Context, shardClients []*ShardClient, metrics *Metrics, proofQueue chan proofJob, counters *RequestRateCounters) {
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case job := <-proofQueue:
-			go func(job proofJob) {
-				if job.request == nil {
-					metrics.recordError("Missing original request for proof verification")
-					atomic.AddInt64(&metrics.proofVerifyFailed, 1)
-					if sm := metrics.shard(job.shardIdx); sm != nil {
-						sm.proofVerifyFailed.Add(1)
-					}
-					return
-				}
+func sleepOrDone(ctx context.Context, d time.Duration) bool {
+	if d <= 0 {
+		return true
+	}
 
-				stateID := normalizeStateID(job.request.StateID.String())
-				shardIdx := job.shardIdx
-				time.Sleep(proofInitialDelay)
-				startTime := time.Now()
-				normalizedID := stateID
-				client := shardClients[shardIdx].proofClient // Use separate proof client pool
+	timer := time.NewTimer(d)
+	defer timer.Stop()
 
-				for attempt := 0; attempt < proofMaxRetries; attempt++ {
-					atomic.AddInt64(&metrics.proofAttempts, 1)
-					if sm := metrics.shard(shardIdx); sm != nil {
-						sm.proofAttempts.Add(1)
-					}
-					if attempt > 0 {
-						atomic.AddInt64(&metrics.proofRetries, 1)
-						if sm := metrics.shard(shardIdx); sm != nil {
-							sm.proofRetries.Add(1)
-						}
-						if counters != nil {
-							counters.IncProofRetries()
-						}
-					}
+	select {
+	case <-ctx.Done():
+		return false
+	case <-timer.C:
+		return true
+	}
+}
 
-					proofReq := GetInclusionProofRequestV2{StateID: stateID}
+func verifyProofJob(ctx context.Context, shardClients []*ShardClient, metrics *Metrics, job proofJob, counters *RequestRateCounters) {
+	if job.request == nil {
+		metrics.recordError("Missing original request for proof verification")
+		atomic.AddInt64(&metrics.proofVerifyFailed, 1)
+		if sm := metrics.shard(job.shardIdx); sm != nil {
+			sm.proofVerifyFailed.Add(1)
+		}
+		return
+	}
 
-					atomic.AddInt64(&metrics.proofActiveRequests, 1)
-					if counters != nil {
-						counters.IncProofStarted()
-					}
-					requestStart := time.Now()
+	stateID := normalizeStateID(job.request.StateID.String())
+	shardIdx := job.shardIdx
+	startTime := time.Now()
+	client := shardClients[shardIdx].proofClient // Use separate proof client pool
 
-					// Create a context with 5 second timeout for proof retrieval
-					proofCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-					resp, err := client.callWithContext(proofCtx, "get_inclusion_proof.v2", proofReq)
-					cancel()
+	for attempt := 0; attempt < proofMaxRetries; attempt++ {
+		if ctx.Err() != nil {
+			return
+		}
 
-					if counters != nil {
-						counters.IncProofCompleted()
-					}
-					requestDuration := time.Since(requestStart)
-					atomic.AddInt64(&metrics.proofActiveRequests, -1)
-
-					if err != nil {
-						if attempt >= 3 {
-							metrics.recordError(fmt.Sprintf("Network error getting proof after %d attempts: %v", attempt+1, err))
-							atomic.AddInt64(&metrics.proofFailed, 1)
-							if sm := metrics.shard(shardIdx); sm != nil {
-								sm.proofFailed.Add(1)
-							}
-							return
-						}
-						time.Sleep(proofRetryDelay)
-						continue
-					}
+		atomic.AddInt64(&metrics.proofAttempts, 1)
+		if sm := metrics.shard(shardIdx); sm != nil {
+			sm.proofAttempts.Add(1)
+		}
+		if attempt > 0 {
+			atomic.AddInt64(&metrics.proofRetries, 1)
+			if sm := metrics.shard(shardIdx); sm != nil {
+				sm.proofRetries.Add(1)
+			}
+			if counters != nil {
+				counters.IncProofRetries()
+			}
+		}
 
-					if resp.Error != nil {
-						if attempt >= proofMaxRetries-1 {
-							metrics.recordError(fmt.Sprintf("API error getting proof (code %d): %s", resp.Error.Code, resp.Error.Message))
-							atomic.AddInt64(&metrics.proofFailed, 1)
-							if sm := metrics.shard(shardIdx); sm != nil {
-								sm.proofFailed.Add(1)
-							}
-							return
-						}
-						time.Sleep(proofRetryDelay)
-						continue
-					}
+		proofReq := GetInclusionProofRequestV2{StateID: stateID}
 
-					var proofResp api.GetInclusionProofResponseV2
-					respBytes, _ := json.Marshal(resp.Result)
-					if err := json.Unmarshal(respBytes, &proofResp); err != nil {
-						metrics.recordError(fmt.Sprintf("Failed to parse proof response: %v", err))
-						atomic.AddInt64(&metrics.proofFailed, 1)
-						if sm := metrics.shard(shardIdx); sm != nil {
-							sm.proofFailed.Add(1)
-						}
-						return
-					}
+		requestStart := time.Now()
+		if attempt == 0 && !job.submittedAt.IsZero() {
+			firstStartLag := requestStart.Sub(job.submittedAt)
+			schedulerLag := time.Duration(0)
+			if !job.firstProofAt.IsZero() && requestStart.After(job.firstProofAt) {
+				schedulerLag = requestStart.Sub(job.firstProofAt)
+			}
+			metrics.addProofStartTiming(firstStartLag, schedulerLag)
+		}
 
-					if proofResp.InclusionProof == nil || proofResp.InclusionProof.CertificationData == nil || proofResp.InclusionProof.CertificationData.TransactionHash == nil {
-						time.Sleep(proofRetryDelay)
-						continue
-					}
+		atomic.AddInt64(&metrics.proofActiveRequests, 1)
+		if counters != nil {
+			counters.IncProofStarted()
+		}
 
-					atomic.AddInt64(&metrics.proofSuccess, 1)
-					if sm := metrics.shard(shardIdx); sm != nil {
-						sm.proofSuccess.Add(1)
-					}
-					metrics.recordProofSuccessAttempt(attempt)
-					metrics.addProofRequestDuration(requestDuration)
-
-					submittedAt, hasSubmission := metrics.getSubmissionTimestamp(normalizedID)
-					var totalLatency time.Duration
-					if hasSubmission {
-						totalLatency = time.Since(submittedAt)
-						metrics.clearSubmissionTimestamp(normalizedID)
-					} else {
-						totalLatency = time.Since(startTime) + proofInitialDelay
-					}
-					metrics.addProofLatency(totalLatency)
+		proofCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+		resp, err := client.callWithContext(proofCtx, "get_inclusion_proof.v2", proofReq)
+		cancel()
 
-					if err := proofverify.VerifyInclusionProofLocal(proofResp.InclusionProof, job.request); err != nil {
-						if attempt < proofMaxRetries-1 {
-							time.Sleep(proofRetryDelay)
-							continue
-						}
-						metrics.recordError(fmt.Sprintf("Proof verification failed for state ID %s: %v", stateID, err))
-						atomic.AddInt64(&metrics.proofVerifyFailed, 1)
-						if sm := metrics.shard(shardIdx); sm != nil {
-							sm.proofVerifyFailed.Add(1)
-						}
-						return
-					}
-					atomic.AddInt64(&metrics.proofVerified, 1)
-					if sm := metrics.shard(shardIdx); sm != nil {
-						sm.proofVerified.Add(1)
-					}
+		if counters != nil {
+			counters.IncProofCompleted()
+		}
+		requestDuration := time.Since(requestStart)
+		atomic.AddInt64(&metrics.proofActiveRequests, -1)
 
-					return
+		if err != nil {
+			if attempt >= 3 {
+				metrics.recordError(fmt.Sprintf("Network error getting proof after %d attempts: %v", attempt+1, err))
+				atomic.AddInt64(&metrics.proofFailed, 1)
+				if sm := metrics.shard(shardIdx); sm != nil {
+					sm.proofFailed.Add(1)
 				}
+				return
+			}
+			if !sleepOrDone(ctx, proofRetryDelay) {
+				return
+			}
+			continue
+		}
 
-				metrics.recordError(fmt.Sprintf("Timeout getting proof after %d attempts", proofMaxRetries))
+		if resp.Error != nil {
+			if attempt >= proofMaxRetries-1 {
+				metrics.recordError(fmt.Sprintf("API error getting proof (code %d): %s", resp.Error.Code, resp.Error.Message))
 				atomic.AddInt64(&metrics.proofFailed, 1)
 				if sm := metrics.shard(shardIdx); sm != nil {
 					sm.proofFailed.Add(1)
 				}
+				return
+			}
+			if !sleepOrDone(ctx, proofRetryDelay) {
+				return
+			}
+			continue
+		}
+
+		var proofResp api.GetInclusionProofResponseV2
+		respBytes, _ := json.Marshal(resp.Result)
+		if err := json.Unmarshal(respBytes, &proofResp); err != nil {
+			metrics.recordError(fmt.Sprintf("Failed to parse proof response: %v", err))
+			atomic.AddInt64(&metrics.proofFailed, 1)
+			if sm := metrics.shard(shardIdx); sm != nil {
+				sm.proofFailed.Add(1)
+			}
+			return
+		}
+
+		if proofResp.InclusionProof == nil || proofResp.InclusionProof.CertificationData == nil || proofResp.InclusionProof.CertificationData.TransactionHash == nil {
+			if !sleepOrDone(ctx, proofRetryDelay) {
+				return
+			}
+			continue
+		}
+
+		atomic.AddInt64(&metrics.proofSuccess, 1)
+		if sm := metrics.shard(shardIdx); sm != nil {
+			sm.proofSuccess.Add(1)
+		}
+		metrics.recordProofSuccessAttempt(attempt)
+		metrics.addProofRequestDuration(requestDuration)
+
+		submittedAt, hasSubmission := metrics.getSubmissionTimestamp(stateID)
+		var totalLatency time.Duration
+		if !job.submittedAt.IsZero() {
+			totalLatency = time.Since(job.submittedAt)
+			metrics.clearSubmissionTimestamp(stateID)
+		} else if hasSubmission {
+			totalLatency = time.Since(submittedAt)
+			metrics.clearSubmissionTimestamp(stateID)
+		} else {
+			totalLatency = time.Since(startTime) + proofInitialDelay
+		}
+		metrics.addProofLatency(totalLatency)
+
+		if err := proofverify.VerifyInclusionProofLocal(proofResp.InclusionProof, job.request); err != nil {
+			if attempt < proofMaxRetries-1 {
+				if !sleepOrDone(ctx, proofRetryDelay) {
+					return
+				}
+				continue
+			}
+			metrics.recordError(fmt.Sprintf("Proof verification failed for state ID %s: %v", stateID, err))
+			atomic.AddInt64(&metrics.proofVerifyFailed, 1)
+			if sm := metrics.shard(shardIdx); sm != nil {
+				sm.proofVerifyFailed.Add(1)
+			}
+			return
+		}
+
+		atomic.AddInt64(&metrics.proofVerified, 1)
+		if sm := metrics.shard(shardIdx); sm != nil {
+			sm.proofVerified.Add(1)
+		}
+		return
+	}
+
+	metrics.recordError(fmt.Sprintf("Timeout getting proof after %d attempts", proofMaxRetries))
+	atomic.AddInt64(&metrics.proofFailed, 1)
+	if sm := metrics.shard(job.shardIdx); sm != nil {
+		sm.proofFailed.Add(1)
+	}
+}
+
+// sleepUntilOrDone waits for deadline using sleepOrDone and returns its result.
+// A zero deadline requests no wait, so it returns true without sleeping.
+// The remaining wait is measured with time.Until at the moment of the call.
+func sleepUntilOrDone(ctx context.Context, deadline time.Time) bool {
+	return deadline.IsZero() || sleepOrDone(ctx, time.Until(deadline))
+}
+
+func scheduleProofJobs(ctx context.Context, proofQueue <-chan proofJob, shardClients []*ShardClient, metrics *Metrics, counters *RequestRateCounters, proofJobsWg *sync.WaitGroup) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case job, ok := <-proofQueue:
+			if !ok {
+				return
+			}
+
+			proofJobsWg.Add(1)
+			go func(job proofJob) {
+				defer proofJobsWg.Done()
+				if !sleepUntilOrDone(ctx, job.firstProofAt) {
+					return
+				}
+				verifyProofJob(ctx, shardClients, metrics, job, counters)
 			}(job)
 		}
 	}
@@ -906,6 +1030,14 @@ func parseAggregatorRoundLogs(path string, start, end time.Time) ([]aggregatorRo
 		if err != nil {
 			continue
 		}
+		finalizeStoreBlockDoc, hasFinalizeStoreBlockDoc, err := parseOptionalLogDuration(raw.FinalizeStoreBlockDoc)
+		if err != nil {
+			continue
+		}
+		finalizeStoreBlockRecords, hasFinalizeStoreBlockRecords, err := parseOptionalLogDuration(raw.FinalizeStoreBlockRecords)
+		if err != nil {
+			continue
+		}
 		finalizeStoreData, hasFinalizeStoreData, err := parseOptionalLogDuration(raw.FinalizeStoreData)
 		if err != nil {
 			continue
@@ -934,7 +1066,7 @@ func parseAggregatorRoundLogs(path string, start, end time.Time) ([]aggregatorRo
 		if err != nil {
 			continue
 		}
-		hasFinalizationBreakdown := hasFinalizeScan || hasFinalizeConvert || hasFinalizeStoreBlock || hasFinalizeStoreData || hasFinalizeStoreSmt || hasFinalizeStoreRecords || hasFinalizeLockWait || hasFinalizeSmtCommit || hasFinalizeSetFinalized || hasFinalizeAck
+		hasFinalizationBreakdown := hasFinalizeScan || hasFinalizeConvert || hasFinalizeStoreBlock || hasFinalizeStoreBlockDoc || hasFinalizeStoreBlockRecords || hasFinalizeStoreData || hasFinalizeStoreSmt || hasFinalizeStoreRecords || hasFinalizeLockWait || hasFinalizeSmtCommit || hasFinalizeSetFinalized || hasFinalizeAck
 
 		medianDur, hasMedian, err := parseOptionalLogDuration(raw.ProofReadyMedian)
 		if err != nil {
@@ -954,30 +1086,32 @@ func parseAggregatorRoundLogs(path string, start, end time.Time) ([]aggregatorRo
 		}
 
 		summaries = append(summaries, aggregatorRoundSummary{
-			Timestamp:                timestamp,
-			Block:                    raw.Block,
-			Commitments:              raw.Commitments,
-			RoundTime:                roundDur,
-			Processing:               procDur,
-			BftWait:                  bftDur,
-			Finalization:             finalDur,
-			HasFinalizationBreakdown: hasFinalizationBreakdown,
-			FinalizeScan:             finalizeScan,
-			FinalizeConvert:          finalizeConvert,
-			FinalizeStoreBlock:       finalizeStoreBlock,
-			FinalizeStoreData:        finalizeStoreData,
-			FinalizeStoreSmt:         finalizeStoreSmt,
-			FinalizeStoreRecords:     finalizeStoreRecords,
-			FinalizeLockWait:         finalizeLockWait,
-			FinalizeSmtCommit:        finalizeSmtCommit,
-			FinalizeSetFinalized:     finalizeSetFinalized,
-			FinalizeAck:              finalizeAck,
-			HasProofReady:            hasProofReady,
-			ProofMedian:              medianDur,
-			ProofP95:                 p95Dur,
-			ProofP99:                 p99Dur,
-			RedisTotal:               raw.RedisTotal,
-			RedisPending:             raw.RedisPending,
+			Timestamp:                 timestamp,
+			Block:                     raw.Block,
+			Commitments:               raw.Commitments,
+			RoundTime:                 roundDur,
+			Processing:                procDur,
+			BftWait:                   bftDur,
+			Finalization:              finalDur,
+			HasFinalizationBreakdown:  hasFinalizationBreakdown,
+			FinalizeScan:              finalizeScan,
+			FinalizeConvert:           finalizeConvert,
+			FinalizeStoreBlock:        finalizeStoreBlock,
+			FinalizeStoreBlockDoc:     finalizeStoreBlockDoc,
+			FinalizeStoreBlockRecords: finalizeStoreBlockRecords,
+			FinalizeStoreData:         finalizeStoreData,
+			FinalizeStoreSmt:          finalizeStoreSmt,
+			FinalizeStoreRecords:      finalizeStoreRecords,
+			FinalizeLockWait:          finalizeLockWait,
+			FinalizeSmtCommit:         finalizeSmtCommit,
+			FinalizeSetFinalized:      finalizeSetFinalized,
+			FinalizeAck:               finalizeAck,
+			HasProofReady:             hasProofReady,
+			ProofMedian:               medianDur,
+			ProofP95:                  p95Dur,
+			ProofP99:                  p99Dur,
+			RedisTotal:                raw.RedisTotal,
+			RedisPending:              raw.RedisPending,
 		})
 	}
 
@@ -1104,13 +1238,15 @@ func printFinalizationBreakdownSummary(label string, entries []aggregatorRoundSu
 		return
 	}
 
-	var scanSum, convertSum, storeBlockSum, storeDataSum time.Duration
+	var scanSum, convertSum, storeBlockSum, storeBlockDocSum, storeBlockRecordsSum, storeDataSum time.Duration
 	var storeSmtSum, storeRecordsSum, lockWaitSum time.Duration
 	var smtCommitSum, setFinalizedSum, ackSum time.Duration
 	for _, entry := range withBreakdown {
 		scanSum += entry.FinalizeScan
 		convertSum += entry.FinalizeConvert
 		storeBlockSum += entry.FinalizeStoreBlock
+		storeBlockDocSum += entry.FinalizeStoreBlockDoc
+		storeBlockRecordsSum += entry.FinalizeStoreBlockRecords
 		storeDataSum += entry.FinalizeStoreData
 		storeSmtSum += entry.FinalizeStoreSmt
 		storeRecordsSum += entry.FinalizeStoreRecords
@@ -1121,11 +1257,13 @@ func printFinalizationBreakdownSummary(label string, entries []aggregatorRoundSu
 	}
 
 	count := time.Duration(len(withBreakdown))
-	fmt.Printf("%s finalization breakdown: scan=%v convert=%v storeBlock=%v storeData=%v (smt=%v records=%v) lockWait=%v smtCommit=%v setFinalized=%v ack=%v (%d rounds)\n",
+	fmt.Printf("%s finalization breakdown: scan=%v convert=%v storeBlock=%v (blockDoc=%v blockRecords=%v) storeData=%v (smt=%v records=%v) lockWait=%v smtCommit=%v setFinalized=%v ack=%v (%d rounds)\n",
 		prefix,
 		(scanSum / count).Truncate(time.Millisecond),
 		(convertSum / count).Truncate(time.Millisecond),
 		(storeBlockSum / count).Truncate(time.Millisecond),
+		(storeBlockDocSum / count).Truncate(time.Millisecond),
+		(storeBlockRecordsSum / count).Truncate(time.Millisecond),
 		(storeDataSum / count).Truncate(time.Millisecond),
 		(storeSmtSum / count).Truncate(time.Millisecond),
 		(storeRecordsSum / count).Truncate(time.Millisecond),
@@ -1333,7 +1471,9 @@ func main() {
 	}
 	fmt.Printf("Duration: %v\n", testDuration)
 	fmt.Printf("Submission workers: %d\n", workerCount)
-	fmt.Printf("Proof workers: %d\n", proofWorkerCount)
+	fmt.Printf("Proof scheduling: exact per-submission timer (PROOF_WORKERS ignored, value=%d)\n", proofWorkerCount)
+	fmt.Printf("Proof initial delay: %v\n", proofInitialDelay)
+	fmt.Printf("Proof retry delay: %v\n", proofRetryDelay)
 	fmt.Printf("HTTP client pool size: %d\n", httpClientPoolSize)
 	if enableH2C {
 		fmt.Printf("H2C: enabled (HTTP/2 cleartext for plain HTTP)\n")
@@ -1357,29 +1497,9 @@ func main() {
 	// Test connectivity and get starting block number for each shard
 	for _, sc := range shardClients {
 		fmt.Printf("Testing connectivity to %s...\n", sc.url)
-		// Include shardId param so gateway proxies can route the request
-		var blockHeightParams interface{}
-		if sc.shardMask > 0 {
-			blockHeightParams = map[string]interface{}{"shardId": fmt.Sprintf("%d", sc.shardMask)}
-		}
-		resp, err := sc.client.call("get_block_height", blockHeightParams)
+		startingBlockNumber, err := waitForStartingBlock(sc, startupProbeWait)
 		if err != nil {
-			log.Fatalf("Failed to connect to aggregator at %s: %v", sc.url, err)
-		}
-
-		if resp.Error != nil {
-			log.Fatalf("Error getting block height from %s: %v", sc.url, resp.Error.Message)
-		}
-
-		var heightResp GetBlockHeightResponse
-		respBytes, _ := json.Marshal(resp.Result)
-		if err := json.Unmarshal(respBytes, &heightResp); err != nil {
-			log.Fatalf("Failed to parse block height from %s: %v", sc.url, err)
-		}
-
-		var startingBlockNumber int64
-		if _, err := fmt.Sscanf(heightResp.BlockNumber, "%d", &startingBlockNumber); err != nil {
-			log.Fatalf("Failed to parse starting block number from %s: %v", sc.url, err)
+			log.Fatalf("Failed to connect to aggregator at %s after %v: %v", sc.url, startupProbeWait, err)
 		}
 		sc.startingBlock = startingBlockNumber
 
@@ -1387,7 +1507,9 @@ func main() {
 		fmt.Printf("✓ Starting block number for %s: %d\n", sc.url, startingBlockNumber)
 	}
 
-	// Create proof verification queue (buffered channel)
+	// proofQueue receives successful submissions. Each proof job owns its
+	// submittedAt+PROOF_INITIAL_DELAY timer so first-attempt latency matches an
+	// external client polling at that exact delay.
 	proofQueue := make(chan proofJob, 10000)
 
 	// Pre-generate commitment pool to eliminate client-side crypto overhead
@@ -1525,16 +1647,15 @@ func main() {
 		}()
 	}
 
-	// Start SEPARATE proof verification workers (use proofCtx - runs longer)
-	// These workers are dedicated to proof requests, so they don't compete with submissions
-	var proofWg sync.WaitGroup
-	for i := 0; i < proofWorkerCount; i++ {
-		proofWg.Add(1)
-		go func() {
-			defer proofWg.Done()
-			proofVerificationWorker(proofCtx, shardClients, metrics, proofQueue, rateCounters)
-		}()
-	}
+	// Start proof scheduling separately from submissions. The scheduler gives
+	// each successful submission its own absolute first-proof timestamp.
+	var proofDispatchWg sync.WaitGroup
+	var proofJobsWg sync.WaitGroup
+	proofDispatchWg.Add(1)
+	go func() {
+		defer proofDispatchWg.Done()
+		scheduleProofJobs(proofCtx, proofQueue, shardClients, metrics, rateCounters, &proofJobsWg)
+	}()
 
 	perfLogCtx := proofCtx
 	plannedRequests := int64(float64(requestsPerSec) * testDuration.Seconds())
@@ -1543,67 +1664,23 @@ func main() {
 	}
 	go logClientPerfRates(perfLogCtx, metrics, rateCounters, plannedRequests, shardClients)
 
-	// Monitor when submission phase completes
-	go func() {
-		<-submitCtx.Done()
-
-		// Wait for all outstanding submissions to complete
-		fmt.Printf("\n----------------------------------------\n")
-		fmt.Printf("Submission window closed; waiting for outstanding requests to complete...\n")
-		submissionWg.Wait()
-
-		metrics.submissionEndTime = time.Now()
-		fmt.Printf("All submissions completed.\n")
-	}()
-
-	// Wait for all workers to complete (both submission and proof verification)
-	// Note: Proof workers continue running for up to 20 seconds after submissions stop
+	// Wait for all workers to complete (both submission and proof verification).
+	// Proof jobs continue running for up to proofTimeout after submissions stop.
 	fmt.Printf("\n----------------------------------------\n")
 	fmt.Printf("Commitment submissions in progress; will verify proofs after...\n")
 
-	// Monitor proof verification progress and cancel early when all proofs are verified
-	lastProofCount := int64(0)
-	ticker := time.NewTicker(1 * time.Second)
-	defer ticker.Stop()
+	wg.Wait() // Wait for submission workers to stop accepting new work
 
-	go func() {
-		noProgressCount := 0
-		for {
-			select {
-			case <-proofCtx.Done():
-				return
-			case <-ticker.C:
-				currentProofs := atomic.LoadInt64(&metrics.proofVerified)
-				proofFailed := atomic.LoadInt64(&metrics.proofFailed)
-				successful := atomic.LoadInt64(&metrics.successfulRequests)
-
-				// Check if all proofs are done (verified + failed = expected)
-				totalProcessed := currentProofs + proofFailed
-				if totalProcessed >= successful {
-					fmt.Printf("  All proofs processed (%d verified, %d failed). Finishing early...\n", currentProofs, proofFailed)
-					proofCancel() // Cancel early since we're done
-					return
-				}
-
-				// Also check if we haven't made progress in several seconds and queue is likely drained
-				if currentProofs > 0 && currentProofs == lastProofCount {
-					noProgressCount++
-					// If no progress for 5 seconds and we've verified most proofs, likely done
-					if noProgressCount >= 5 && totalProcessed >= successful*95/100 {
-						fmt.Printf("  No progress for 5s and %d%% complete. Finishing early...\n", totalProcessed*100/successful)
-						proofCancel()
-						return
-					}
-				} else {
-					noProgressCount = 0
-				}
-				lastProofCount = currentProofs
-			}
-		}
-	}()
+	fmt.Printf("\n----------------------------------------\n")
+	fmt.Printf("Submission window closed; waiting for outstanding requests to complete...\n")
+	submissionWg.Wait()
+	metrics.submissionEndTime = time.Now()
+	close(proofQueue)
+	fmt.Printf("All submissions completed.\n")
 
-	wg.Wait()      // Wait for submission workers
-	proofWg.Wait() // Wait for proof workers
+	proofDispatchWg.Wait() // Wait for scheduler to drain proofQueue
+	proofJobsWg.Wait()     // Wait for delayed proof jobs to finish request/retry/verification
+	proofCancel()
 
 	// Stop submission phase and get counts
 	fmt.Printf("\n----------------------------------------\n")
@@ -1671,6 +1748,18 @@ func main() {
 			p95Latency.Truncate(time.Millisecond),
 			p99Latency.Truncate(time.Millisecond))
 
+		firstStartMedian, firstStartP95, firstStartP99, schedulerMedian, schedulerP95, schedulerP99 := metrics.getProofStartTimingStats()
+		if firstStartMedian > 0 || firstStartP95 > 0 || firstStartP99 > 0 {
+			fmt.Printf("First proof request start lag: median %v, p95 %v, p99 %v\n",
+				firstStartMedian.Truncate(time.Millisecond),
+				firstStartP95.Truncate(time.Millisecond),
+				firstStartP99.Truncate(time.Millisecond))
+			fmt.Printf("Proof scheduler lag after target: median %v, p95 %v, p99 %v\n",
+				schedulerMedian.Truncate(time.Millisecond),
+				schedulerP95.Truncate(time.Millisecond),
+				schedulerP99.Truncate(time.Millisecond))
+		}
+
 		// Display proof request duration statistics
 		avg, min, max, p50, p95, p99 := metrics.getProofRequestStats()
 		if avg > 0 {
diff --git a/cmd/performance-test/types.go b/cmd/performance-test/types.go
index eb5daf4..3658581 100644
--- a/cmd/performance-test/types.go
+++ b/cmd/performance-test/types.go
@@ -12,6 +12,7 @@ import (
 	"net"
 	"net/http"
 	"os"
+	"sort"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -84,8 +85,11 @@ type Metrics struct {
 	proofRetries          int64
 	proofLatencies        []time.Duration
 	proofRequestDurations []time.Duration
+	firstProofStartLags   []time.Duration
+	proofSchedulerLags    []time.Duration
 	proofLatenciesMutex   sync.RWMutex
 	proofRequestDurMutex  sync.RWMutex
+	proofScheduleMutex    sync.RWMutex
 	proofSuccessAttempts  [proofMaxRetries]atomic.Int64
 	shardMetrics          []*ShardMetrics
 }
@@ -114,8 +118,10 @@ type ShardClient struct {
 }
 
 type proofJob struct {
-	shardIdx int
-	request  *api.CertificationRequest
+	shardIdx     int
+	request      *api.CertificationRequest
+	submittedAt  time.Time
+	firstProofAt time.Time
 }
 
 // RequestRateCounters tracks per-second client-side request activity.
@@ -134,56 +140,60 @@ func (rr *RequestRateCounters) IncProofCompleted()  { rr.proofCompleted.Add(1) }
 func (rr *RequestRateCounters) IncProofRetries()    { rr.proofRetries.Add(1) }
 
 type aggregatorLogRaw struct {
-	Time                 string `json:"time"`
-	Msg                  string `json:"msg"`
-	Block                string `json:"block"`
-	Commitments          int    `json:"commitments"`
-	RoundTime            string `json:"roundTime"`
-	Processing           string `json:"processing"`
-	BftWait              string `json:"bftWait"`
-	Finalization         string `json:"finalization"`
-	FinalizeScan         string `json:"finalizeScan"`
-	FinalizeConvert      string `json:"finalizeConvert"`
-	FinalizeStoreBlock   string `json:"finalizeStoreBlock"`
-	FinalizeStoreData    string `json:"finalizeStoreData"`
-	FinalizeStoreSmt     string `json:"finalizeStoreSmt"`
-	FinalizeStoreRecords string `json:"finalizeStoreRecords"`
-	FinalizeLockWait     string `json:"finalizeLockWait"`
-	FinalizeSmtCommit    string `json:"finalizeSmtCommit"`
-	FinalizeSetFinalized string `json:"finalizeSetFinalized"`
-	FinalizeAck          string `json:"finalizeAck"`
-	ProofReadyMedian     string `json:"proofReadyMedian"`
-	ProofReadyP95        string `json:"proofReadyP95"`
-	ProofReadyP99        string `json:"proofReadyP99"`
-	RedisTotal           int    `json:"redisTotal"`
-	RedisPending         int    `json:"redisPending"`
+	Time                      string `json:"time"`
+	Msg                       string `json:"msg"`
+	Block                     string `json:"block"`
+	Commitments               int    `json:"commitments"`
+	RoundTime                 string `json:"roundTime"`
+	Processing                string `json:"processing"`
+	BftWait                   string `json:"bftWait"`
+	Finalization              string `json:"finalization"`
+	FinalizeScan              string `json:"finalizeScan"`
+	FinalizeConvert           string `json:"finalizeConvert"`
+	FinalizeStoreBlock        string `json:"finalizeStoreBlock"`
+	FinalizeStoreBlockDoc     string `json:"finalizeStoreBlockDoc"`
+	FinalizeStoreBlockRecords string `json:"finalizeStoreBlockRecords"`
+	FinalizeStoreData         string `json:"finalizeStoreData"`
+	FinalizeStoreSmt          string `json:"finalizeStoreSmt"`
+	FinalizeStoreRecords      string `json:"finalizeStoreRecords"`
+	FinalizeLockWait          string `json:"finalizeLockWait"`
+	FinalizeSmtCommit         string `json:"finalizeSmtCommit"`
+	FinalizeSetFinalized      string `json:"finalizeSetFinalized"`
+	FinalizeAck               string `json:"finalizeAck"`
+	ProofReadyMedian          string `json:"proofReadyMedian"`
+	ProofReadyP95             string `json:"proofReadyP95"`
+	ProofReadyP99             string `json:"proofReadyP99"`
+	RedisTotal                int    `json:"redisTotal"`
+	RedisPending              int    `json:"redisPending"`
 }
 
 type aggregatorRoundSummary struct {
-	Timestamp                time.Time
-	Block                    string
-	Commitments              int
-	RoundTime                time.Duration
-	Processing               time.Duration
-	BftWait                  time.Duration
-	Finalization             time.Duration
-	HasFinalizationBreakdown bool
-	FinalizeScan             time.Duration
-	FinalizeConvert          time.Duration
-	FinalizeStoreBlock       time.Duration
-	FinalizeStoreData        time.Duration
-	FinalizeStoreSmt         time.Duration
-	FinalizeStoreRecords     time.Duration
-	FinalizeLockWait         time.Duration
-	FinalizeSmtCommit        time.Duration
-	FinalizeSetFinalized     time.Duration
-	FinalizeAck              time.Duration
-	HasProofReady            bool
-	ProofMedian              time.Duration
-	ProofP95                 time.Duration
-	ProofP99                 time.Duration
-	RedisTotal               int
-	RedisPending             int
+	Timestamp                 time.Time
+	Block                     string
+	Commitments               int
+	RoundTime                 time.Duration
+	Processing                time.Duration
+	BftWait                   time.Duration
+	Finalization              time.Duration
+	HasFinalizationBreakdown  bool
+	FinalizeScan              time.Duration
+	FinalizeConvert           time.Duration
+	FinalizeStoreBlock        time.Duration
+	FinalizeStoreBlockDoc     time.Duration
+	FinalizeStoreBlockRecords time.Duration
+	FinalizeStoreData         time.Duration
+	FinalizeStoreSmt          time.Duration
+	FinalizeStoreRecords      time.Duration
+	FinalizeLockWait          time.Duration
+	FinalizeSmtCommit         time.Duration
+	FinalizeSetFinalized      time.Duration
+	FinalizeAck               time.Duration
+	HasProofReady             bool
+	ProofMedian               time.Duration
+	ProofP95                  time.Duration
+	ProofP99                  time.Duration
+	RedisTotal                int
+	RedisPending              int
 }
 
 func (m *Metrics) addProofLatency(latency time.Duration) {
@@ -198,6 +208,13 @@ func (m *Metrics) addProofRequestDuration(duration time.Duration) {
 	m.proofRequestDurations = append(m.proofRequestDurations, duration)
 }
 
+// addProofStartTiming appends one first-start lag and one scheduler lag sample under proofScheduleMutex.
+func (m *Metrics) addProofStartTiming(firstStartLag, schedulerLag time.Duration) {
+	m.proofScheduleMutex.Lock()
+	defer m.proofScheduleMutex.Unlock()
+	m.firstProofStartLags, m.proofSchedulerLags = append(m.firstProofStartLags, firstStartLag), append(m.proofSchedulerLags, schedulerLag)
+}
+
 func (m *Metrics) recordProofSuccessAttempt(attempt int) {
 	if attempt >= 0 && attempt < len(m.proofSuccessAttempts) {
 		m.proofSuccessAttempts[attempt].Add(1)
@@ -230,8 +247,8 @@ func (m *Metrics) shard(idx int) *ShardMetrics {
 	return m.shardMetrics[idx]
 }
 
-func (m *Metrics) recordSubmissionTimestamp(id string) {
-	m.submissionTimes.Store(id, time.Now())
+func (m *Metrics) recordSubmissionTimestamp(id string, submittedAt time.Time) {
+	m.submissionTimes.Store(id, submittedAt)
 }
 
 func (m *Metrics) getSubmissionTimestamp(id string) (time.Time, bool) {
@@ -304,24 +321,7 @@ func (m *Metrics) getProofLatencyStats() (median, p95, p99 time.Duration) {
 	m.proofLatenciesMutex.RLock()
 	defer m.proofLatenciesMutex.RUnlock()
 
-	if len(m.proofLatencies) == 0 {
-		return 0, 0, 0
-	}
-
-	sorted := make([]time.Duration, len(m.proofLatencies))
-	copy(sorted, m.proofLatencies)
-	for i := 0; i < len(sorted); i++ {
-		for j := i + 1; j < len(sorted); j++ {
-			if sorted[i] > sorted[j] {
-				sorted[i], sorted[j] = sorted[j], sorted[i]
-			}
-		}
-	}
-
-	median = sorted[len(sorted)/2]
-	p95 = sorted[len(sorted)*95/100]
-	p99 = sorted[len(sorted)*99/100]
-	return
+	return durationPercentilesLocked(m.proofLatencies)
 }
 
 func (m *Metrics) getProofRequestStats() (avg, min, max, p50, p95, p99 time.Duration) {
@@ -359,6 +359,29 @@ func (m *Metrics) getProofRequestStats() (avg, min, max, p50, p95, p99 time.Dura
 	return avg, min, max, p50, p95, p99
 }
 
+// getProofStartTimingStats returns median/p95/p99 of the recorded first-start lags and of the scheduler lags (all zero when no samples exist).
+func (m *Metrics) getProofStartTimingStats() (startMedian, startP95, startP99, schedulerMedian, schedulerP95, schedulerP99 time.Duration) {
+	m.proofScheduleMutex.RLock()
+	defer m.proofScheduleMutex.RUnlock()
+	startMedian, startP95, startP99 = durationPercentilesLocked(m.firstProofStartLags)
+	schedulerMedian, schedulerP95, schedulerP99 = durationPercentilesLocked(m.proofSchedulerLags)
+	return
+}
+
+// durationPercentilesLocked returns the elements at sorted positions n/2, n*95/100 and n*99/100
+// of values (all zero when empty). It sorts a private copy, so values is never reordered.
+func durationPercentilesLocked(values []time.Duration) (median, p95, p99 time.Duration) {
+	n := len(values)
+	if n == 0 {
+		return 0, 0, 0
+	}
+	ordered := append(make([]time.Duration, 0, n), values...)
+	sort.Slice(ordered, func(a, b int) bool {
+		return ordered[a] < ordered[b]
+	})
+	return ordered[n/2], ordered[n*95/100], ordered[n*99/100]
+}
+
 type JSONRPCClient struct {
 	clients     []*http.Client
 	clientIndex atomic.Int64
@@ -558,6 +581,9 @@ func (c *JSONRPCClient) callWithContext(ctx context.Context, method string, para
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
 	req.Header.Set("Content-Type", "application/json")
+	if stateID := stateIDHeaderFromParams(params); stateID != "" {
+		req.Header.Set("X-State-ID", stateID)
+	}
 
 	// Add auth header if provided
 	if c.authHeader != "" {
@@ -584,3 +610,27 @@ func (c *JSONRPCClient) callWithContext(ctx context.Context, method string, para
 
 	return &response, nil
 }
+
+func stateIDHeaderFromParams(params interface{}) string { // value for the X-State-ID header, "" when none applies
+	switch p := params.(type) {
+	case *api.CertificationRequest:
+		if p != nil {
+			return normalizeStateID(p.StateID.String())
+		}
+	case api.CertificationRequest:
+		return normalizeStateID(p.StateID.String())
+	case GetInclusionProofRequestV2: // package-local request type: StateID is passed on without a String() call
+		return normalizeStateID(p.StateID)
+	case *GetInclusionProofRequestV2:
+		if p != nil {
+			return normalizeStateID(p.StateID)
+		}
+	case api.GetInclusionProofRequestV2:
+		return normalizeStateID(p.StateID.String())
+	case *api.GetInclusionProofRequestV2:
+		if p != nil {
+			return normalizeStateID(p.StateID.String())
+		}
+	}
+	return "" // nil pointer or unsupported params type
+}

From b2edccd174a978f1e42338029324afb2da69ecab Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 12:20:48 +0300
Subject: [PATCH 05/13] perf: improve bft-shard proof readiness under load

---
 internal/bft/client.go                        |   3 +-
 internal/bft/client_stub.go                   |   2 +-
 internal/bft/client_stub_test.go              |   4 +
 internal/config/config.go                     |  11 +-
 internal/round/batch_processor.go             |   5 +-
 internal/round/factory.go                     |   3 +
 internal/round/parent_round_manager.go        |  10 +
 internal/round/precollection_test.go          | 176 ++++++++++-
 internal/round/precollector.go                |   6 +
 internal/round/round_manager.go               | 273 +++++++++++++++++-
 internal/service/service.go                   |  48 +--
 internal/service/service_test.go              |   3 +
 internal/storage/redis/commitment.go          |  18 +-
 .../redis/commitment_integration_test.go      | 124 ++++++++
 14 files changed, 642 insertions(+), 44 deletions(-)

diff --git a/internal/bft/client.go b/internal/bft/client.go
index 1f2221c..bdea190 100644
--- a/internal/bft/client.go
+++ b/internal/bft/client.go
@@ -80,6 +80,7 @@ type (
 		FinalizeBlock(ctx context.Context, block *models.Block) error
 		FinalizeBlockWithRetry(ctx context.Context, block *models.Block) error
 		StartNewRound(ctx context.Context, roundNumber *api.BigInt) error
+		StartNextRoundFromPrecollector(ctx context.Context, roundNumber *api.BigInt) error
 	}
 
 	TrustBaseStore interface {
@@ -508,7 +509,7 @@ func (c *BFTClientImpl) handleUnicityCertificate(ctx context.Context, uc *types.
 	c.logger.WithContext(ctx).Info("Block finalized, starting new round",
 		"nextRoundNumber", nextRoundNumber.String())
 
-	err = c.roundManager.StartNewRound(ctx, api.NewBigInt(nextRoundNumber))
+	err = c.roundManager.StartNextRoundFromPrecollector(ctx, api.NewBigInt(nextRoundNumber))
 	if err != nil {
 		c.logger.WithContext(ctx).Error("Failed to start new round",
 			"nextRoundNumber", nextRoundNumber.String(),
diff --git a/internal/bft/client_stub.go b/internal/bft/client_stub.go
index c6cee0d..2082e40 100644
--- a/internal/bft/client_stub.go
+++ b/internal/bft/client_stub.go
@@ -83,7 +83,7 @@ func (n *BFTClientStub) CertificationRequest(ctx context.Context, block *models.
 	nextRoundNumber.Add(block.Index.Int, big.NewInt(1))
 
 	go func() {
-		if err := n.roundManager.StartNewRound(ctx, nextRoundNumber); err != nil {
+		if err := n.roundManager.StartNextRoundFromPrecollector(ctx, nextRoundNumber); err != nil {
 			n.logger.Error("Failed to start next round", "error", err.Error())
 		}
 	}()
diff --git a/internal/bft/client_stub_test.go b/internal/bft/client_stub_test.go
index a584d85..b9275e1 100644
--- a/internal/bft/client_stub_test.go
+++ b/internal/bft/client_stub_test.go
@@ -32,6 +32,10 @@ func (m *stubRoundManager) StartNewRound(ctx context.Context, roundNumber *api.B
 	return nil
 }
 
+func (m *stubRoundManager) StartNextRoundFromPrecollector(ctx context.Context, roundNumber *api.BigInt) error {
+	return m.StartNewRound(ctx, roundNumber) // the stub handles a precollector handoff exactly like a plain round start
+}
+
 func TestBFTClientStub_CertificationRequest_PopulatesSyntheticUC(t *testing.T) {
 	rm := &stubRoundManager{}
 	log, err := logger.New("warn", "json", "", false)
diff --git a/internal/config/config.go b/internal/config/config.go
index d2f76d7..14ba3f7 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -109,9 +109,11 @@ type LoggingConfig struct {
 type ProcessingConfig struct {
 	BatchLimit                 int           `mapstructure:"batch_limit"`
 	RoundDuration              time.Duration `mapstructure:"round_duration"`
+	PrecollectorGracePeriod    time.Duration `mapstructure:"precollector_grace_period"`     // Extra wait before cutting a precollected round snapshot
 	MaxCommitmentsPerRound     int           `mapstructure:"max_commitments_per_round"`     // Stop waiting once this many commitments collected
 	CollectPhaseDuration       time.Duration `mapstructure:"collect_phase_duration"`        // Non-child fixed collection window before proposing a round
 	CommitmentStreamBufferSize int           `mapstructure:"commitment_stream_buffer_size"` // Buffer between queue streamer and round collection
+	SkipDuplicateCheck         bool          `mapstructure:"skip_duplicate_check"`          // Skip finalized record lookup on submit
 }
 
 // RedisConfig holds Redis connection configuration
@@ -329,9 +331,11 @@ func Load() (*Config, error) {
 		Processing: ProcessingConfig{
 			BatchLimit:                 getEnvIntOrDefault("BATCH_LIMIT", 1000),
 			RoundDuration:              getEnvDurationOrDefault("ROUND_DURATION", "1s"),
-			MaxCommitmentsPerRound:     getEnvIntOrDefault("MAX_COMMITMENTS_PER_ROUND", 10000), // Default 10k to keep rounds under 2s
+			PrecollectorGracePeriod:    getEnvDurationOrDefault("PRECOLLECTOR_GRACE_PERIOD", "0s"),
+			MaxCommitmentsPerRound:     getEnvIntOrDefault("MAX_COMMITMENTS_PER_ROUND", 20000),
 			CollectPhaseDuration:       getEnvDurationOrDefault("COLLECT_PHASE_DURATION", "200ms"),
-			CommitmentStreamBufferSize: getEnvIntOrDefault("COMMITMENT_STREAM_BUFFER_SIZE", 10000),
+			CommitmentStreamBufferSize: getEnvIntOrDefault("COMMITMENT_STREAM_BUFFER_SIZE", 50000),
+			SkipDuplicateCheck:         getEnvBoolOrDefault("SKIP_DUPLICATE_CHECK", true),
 		},
 		Redis: RedisConfig{
 			Host:         getEnvOrDefault("REDIS_HOST", "localhost"),
@@ -446,6 +450,9 @@ func (c *Config) Validate() error {
 	if c.Processing.CollectPhaseDuration <= 0 {
 		return fmt.Errorf("COLLECT_PHASE_DURATION must be positive")
 	}
+	if c.Processing.PrecollectorGracePeriod < 0 {
+		return fmt.Errorf("PRECOLLECTOR_GRACE_PERIOD must be non-negative")
+	}
 	if c.Database.FinalizationInsertChunkSize < 0 {
 		return fmt.Errorf("MONGODB_FINALIZATION_INSERT_CHUNK_SIZE must be non-negative")
 	}
diff --git a/internal/round/batch_processor.go b/internal/round/batch_processor.go
index 130c628..f64f90c 100644
--- a/internal/round/batch_processor.go
+++ b/internal/round/batch_processor.go
@@ -61,9 +61,11 @@ func (rm *RoundManager) processMiniBatch(ctx context.Context, commitments []*mod
 			result := tryAddLeavesOneByOne(ctx, rm.logger, rm.commitmentQueue, rm.currentRound.Snapshot, leaves, validCommitments)
 			rm.currentRound.PendingLeaves = append(rm.currentRound.PendingLeaves, result.successLeaves...)
 			rm.currentRound.PendingCommitments = append(rm.currentRound.PendingCommitments, result.successCommitments...)
+			rm.markProofsPending(result.successCommitments)
 		} else {
 			rm.currentRound.PendingLeaves = append(rm.currentRound.PendingLeaves, leaves...)
 			rm.currentRound.PendingCommitments = append(rm.currentRound.PendingCommitments, validCommitments...)
+			rm.markProofsPending(validCommitments)
 		}
 	}
 
@@ -471,8 +473,9 @@ func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block)
 		return fmt.Errorf("failed to set block as finalized: %w", err)
 	}
 	setFinalizedDuration := time.Since(setFinalizedStart)
-	rm.finalizationMu.Unlock()
 	block.Finalized = true
+	rm.markProofsReady(block, stateIDs)
+	rm.finalizationMu.Unlock()
 
 	// Proofs are requestable only after the SMT snapshot is committed and the block is visible as finalized.
 	// Redis ACK is recovery bookkeeping.
diff --git a/internal/round/factory.go b/internal/round/factory.go
index ee06d19..04f0988 100644
--- a/internal/round/factory.go
+++ b/internal/round/factory.go
@@ -10,9 +10,11 @@ import (
 	"github.com/unicitynetwork/aggregator-go/internal/events"
 	"github.com/unicitynetwork/aggregator-go/internal/ha/state"
 	"github.com/unicitynetwork/aggregator-go/internal/logger"
+	"github.com/unicitynetwork/aggregator-go/internal/models"
 	"github.com/unicitynetwork/aggregator-go/internal/sharding"
 	"github.com/unicitynetwork/aggregator-go/internal/smt"
 	"github.com/unicitynetwork/aggregator-go/internal/storage/interfaces"
+	"github.com/unicitynetwork/aggregator-go/pkg/api"
 )
 
 // Manager interface for both standalone and parent round managers
@@ -25,6 +27,7 @@ type Manager interface {
 	CheckParentHealth(ctx context.Context) error
 	// FinalizationReadLock blocks during the SMT commit+finalize window to keep root/block consistent.
 	FinalizationReadLock() func()
+	GetKnownNotReadyBlock(stateID api.StateID) (*models.Block, bool)
 }
 
 // NewManager creates the appropriate round manager based on sharding mode
diff --git a/internal/round/parent_round_manager.go b/internal/round/parent_round_manager.go
index dc87da8..82d06c7 100644
--- a/internal/round/parent_round_manager.go
+++ b/internal/round/parent_round_manager.go
@@ -179,6 +179,12 @@ func (prm *ParentRoundManager) StartNewRound(ctx context.Context, roundNumber *a
 	return prm.startNewRound(ctx, roundNumber)
 }
 
+// StartNextRoundFromPrecollector satisfies the BFT RoundManager interface.
+// Parent mode keeps its existing collect behavior, so this only delegates to StartNewRound.
+func (prm *ParentRoundManager) StartNextRoundFromPrecollector(ctx context.Context, roundNumber *api.BigInt) error {
+	return prm.StartNewRound(ctx, roundNumber)
+}
+
 // startNewRound is the internal implementation
 func (prm *ParentRoundManager) startNewRound(ctx context.Context, roundNumber *api.BigInt) error {
 	prm.roundMutex.Lock()
@@ -399,6 +405,10 @@ func (prm *ParentRoundManager) FinalizationReadLock() func() {
 	return func() {}
 }
 
+func (prm *ParentRoundManager) GetKnownNotReadyBlock(stateID api.StateID) (*models.Block, bool) {
+	return nil, false // the parent manager never reports a known-not-ready block
+}
+
 // Activate starts active round processing (called when node becomes leader in HA mode)
 func (prm *ParentRoundManager) Activate(ctx context.Context) error {
 	prm.logger.WithContext(ctx).Info("Activating parent round manager")
diff --git a/internal/round/precollection_test.go b/internal/round/precollection_test.go
index 9b7019a..4140e40 100644
--- a/internal/round/precollection_test.go
+++ b/internal/round/precollection_test.go
@@ -272,7 +272,7 @@ func newTestPrecollector(t *testing.T, stream chan *models.CertificationRequest,
 	if maxPerRound <= 0 {
 		maxPerRound = 10000
 	}
-	cp := newChildPrecollector(stream, nil, log, maxPerRound)
+	cp := newChildPrecollector(stream, nil, log, maxPerRound, nil)
 	return cp, smtInstance
 }
 
@@ -569,7 +569,7 @@ func TestPreCollectionReparenting(t *testing.T) {
 
 		// Start precollector from Round N's snapshot
 		stream := make(chan *models.CertificationRequest, 100)
-		cp := newChildPrecollector(stream, nil, testLogger, 10000)
+		cp := newChildPrecollector(stream, nil, testLogger, 10000, nil)
 		cp.Start(ctx, roundNSnapshot)
 		defer cp.Stop()
 
@@ -824,6 +824,178 @@ func TestStartNewRoundWithSnapshot(t *testing.T) {
 	})
 }
 
+// TestStandalonePrecollectorGraceIncludesLateCommitment checks that a commitment streamed 50ms into a 150ms grace period lands in the round cut from the precollector.
+func TestStandalonePrecollectorGraceIncludesLateCommitment(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	cfg := config.Config{
+		Database: config.DatabaseConfig{Database: "test_standalone_precollector_grace"},
+		Processing: config.ProcessingConfig{
+			RoundDuration:           100 * time.Millisecond,
+			PrecollectorGracePeriod: 150 * time.Millisecond,
+			MaxCommitmentsPerRound:  1000,
+		},
+		Storage:  config.StorageConfig{UseRedisForCommitments: true},
+		Sharding: config.ShardingConfig{Mode: config.ShardingModeStandalone},
+		BFT: config.BFTConfig{
+			Enabled: false,
+			// Keep the precollected round in-flight while the test inspects it.
+			StubDelay: 5 * time.Second,
+		},
+	}
+	storage := testutil.SetupTestStorage(t, cfg)
+
+	testLogger := newTestLogger(t)
+	smtInstance := smt.NewThreadSafeSMT(smt.NewSparseMerkleTree(api.SHA256, api.StateTreeKeyLengthBits))
+
+	rm, err := NewRoundManager(
+		ctx,
+		&cfg,
+		testLogger,
+		storage.CommitmentQueue(),
+		storage,
+		nil,
+		state.NewSyncStateTracker(),
+		nil,
+		events.NewEventBus(testLogger),
+		smtInstance,
+		nil,
+	)
+	require.NoError(t, err)
+
+	t.Cleanup(func() {
+		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 2*time.Second)
+		defer shutdownCancel()
+		_ = rm.Stop(shutdownCtx)
+	})
+	// Install a precollector by hand; this test never calls rm.Start or rm.Activate.
+	cp := newChildPrecollector(
+		rm.commitmentStream,
+		rm.commitmentQueue,
+		rm.logger,
+		rm.config.Processing.MaxCommitmentsPerRound,
+		rm.markProofsPending,
+	)
+	rm.roundMutex.Lock()
+	rm.precollectorDisabled = false
+	rm.precollectorDone = make(chan struct{})
+	rm.precollector = cp
+	rm.roundMutex.Unlock()
+	cp.Start(ctx, smtInstance.CreateSnapshot())
+	// Deliver the commitment 50ms into the 150ms grace period.
+	lateCommitment := testutil.CreateTestCertificationRequest(t, "during_grace")
+	go func() {
+		time.Sleep(50 * time.Millisecond)
+		rm.commitmentStream <- lateCommitment
+	}()
+	// The handoff must not return before the full grace period has elapsed.
+	start := time.Now()
+	require.NoError(t, rm.StartNextRoundFromPrecollector(ctx, api.NewBigInt(big.NewInt(2))))
+	assert.GreaterOrEqual(t, time.Since(start), cfg.Processing.PrecollectorGracePeriod)
+
+	rm.roundMutex.RLock()
+	currentRound := rm.currentRound
+	var commitments []*models.CertificationRequest
+	if currentRound != nil {
+		commitments = append([]*models.CertificationRequest(nil), currentRound.Commitments...)
+	}
+	rm.roundMutex.RUnlock()
+
+	require.NotNil(t, currentRound)
+	assert.EqualValues(t, 2, currentRound.Number.Int64())
+	require.Len(t, commitments, 1)
+	assert.Equal(t, lateCommitment.StateID, commitments[0].StateID)
+}
+
+// TestStandaloneActivePrecollectorLifecycle runs a started, activated standalone manager and checks that round 2 is a precollected round carrying the commitment sent during round 1.
+func TestStandaloneActivePrecollectorLifecycle(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	cfg := config.Config{
+		Database: config.DatabaseConfig{Database: "test_standalone_active_precollector_lifecycle"},
+		Processing: config.ProcessingConfig{
+			RoundDuration:           100 * time.Millisecond,
+			PrecollectorGracePeriod: 50 * time.Millisecond,
+			MaxCommitmentsPerRound:  1000,
+			CollectPhaseDuration:    500 * time.Millisecond,
+		},
+		Storage:  config.StorageConfig{UseRedisForCommitments: true},
+		Sharding: config.ShardingConfig{Mode: config.ShardingModeStandalone},
+		BFT: config.BFTConfig{
+			Enabled:   false,
+			StubDelay: 1 * time.Second,
+		},
+	}
+	storage := testutil.SetupTestStorage(t, cfg)
+
+	testLogger := newTestLogger(t)
+	smtInstance := smt.NewThreadSafeSMT(smt.NewSparseMerkleTree(api.SHA256, api.StateTreeKeyLengthBits))
+
+	rm, err := NewRoundManager(
+		ctx,
+		&cfg,
+		testLogger,
+		storage.CommitmentQueue(),
+		storage,
+		nil,
+		state.NewSyncStateTracker(),
+		nil,
+		events.NewEventBus(testLogger),
+		smtInstance,
+		nil,
+	)
+	require.NoError(t, err)
+
+	t.Cleanup(func() {
+		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 2*time.Second)
+		defer shutdownCancel()
+		_ = rm.Stop(shutdownCtx)
+	})
+
+	require.NoError(t, rm.Start(ctx))
+	require.NoError(t, rm.Activate(ctx))
+	// Round 1 goes through the fixed collect phase; wait until its precollector has been installed.
+	require.Eventually(t, func() bool {
+		rm.roundMutex.RLock()
+		defer rm.roundMutex.RUnlock()
+		return rm.precollector != nil
+	}, 2*time.Second, 25*time.Millisecond, "round 1 should start the active precollector after fixed collect")
+	// Send while the precollector is installed so the commitment is expected in round 2.
+	lateCommitment := testutil.CreateTestCertificationRequest(t, "active_precollector_lifecycle")
+	select {
+	case rm.commitmentStream <- lateCommitment:
+	case <-time.After(time.Second):
+		t.Fatal("timed out sending commitment to active precollector")
+	}
+
+	require.Eventually(t, func() bool {
+		rm.roundMutex.RLock()
+		defer rm.roundMutex.RUnlock()
+		return rm.currentRound != nil &&
+			rm.currentRound.Number.Int64() == 2 &&
+			!rm.currentRound.ProposalTime.IsZero()
+	}, 4*time.Second, 25*time.Millisecond, "BFT stub finalization should advance to precollected round 2")
+	// Copy the round fields under the read lock before asserting on them.
+	rm.roundMutex.RLock()
+	currentRound := rm.currentRound
+	roundNumber := currentRound.Number.Int64()
+	preCollected := currentRound.PreCollected
+	proposalPrep := currentRound.ProposalTime.Sub(currentRound.StartTime)
+	commitments := append([]*models.CertificationRequest(nil), currentRound.Commitments...)
+	rm.roundMutex.RUnlock()
+
+	assert.EqualValues(t, 2, roundNumber)
+	assert.True(t, preCollected, "round 2 should come from the active precollector handoff")
+	require.Len(t, commitments, 1)
+	assert.Equal(t, lateCommitment.StateID, commitments[0].StateID)
+	assert.Less(t, proposalPrep, cfg.Processing.CollectPhaseDuration/2,
+		"precollected round should skip the fixed collect phase")
+
+	block1, err := storage.BlockStorage().GetByNumber(ctx, api.NewBigInt(big.NewInt(1)))
+	require.NoError(t, err)
+	assert.NotNil(t, block1, "round 2 should be reached through BFT stub finalization of round 1")
+}
+
 func TestPipelinedChildModeFlow(t *testing.T) {
 	t.Run("SecondRoundUsesPreCollectedData", func(t *testing.T) {
 		ctx, cancel := context.WithCancel(context.Background())
diff --git a/internal/round/precollector.go b/internal/round/precollector.go
index c28a585..b11fbe0 100644
--- a/internal/round/precollector.go
+++ b/internal/round/precollector.go
@@ -31,6 +31,7 @@ type childPrecollector struct {
 	commitmentQueue  interfaces.CommitmentQueue
 	logger           *logger.Logger
 	maxPerRound      int
+	markProofPending func([]*models.CertificationRequest)
 
 	advanceCh chan advanceRequest
 	stopCh    chan struct{}
@@ -42,6 +43,7 @@ func newChildPrecollector(
 	queue interfaces.CommitmentQueue,
 	log *logger.Logger,
 	maxPerRound int,
+	markProofPending func([]*models.CertificationRequest),
 ) *childPrecollector {
 	if maxPerRound <= 0 {
 		maxPerRound = 10000
@@ -51,6 +53,7 @@ func newChildPrecollector(
 		commitmentQueue:  queue,
 		logger:           log,
 		maxPerRound:      maxPerRound,
+		markProofPending: markProofPending,
 		advanceCh:        make(chan advanceRequest),
 		stopCh:           make(chan struct{}),
 		doneCh:           make(chan struct{}),
@@ -102,6 +105,9 @@ func (cp *childPrecollector) run(ctx context.Context, baseSnapshot *smt.ThreadSa
 			return
 		}
 		added, addedLeaves := cp.addBatch(ctx, snapshot, pending)
+		if cp.markProofPending != nil {
+			cp.markProofPending(added)
+		}
 		commitments = append(commitments, added...)
 		leaves = append(leaves, addedLeaves...)
 		count += len(added)
diff --git a/internal/round/round_manager.go b/internal/round/round_manager.go
index 8c28924..66e6e61 100644
--- a/internal/round/round_manager.go
+++ b/internal/round/round_manager.go
@@ -67,6 +67,9 @@ type Round struct {
 	// PendingCommitments contains only commitments whose leaves were successfully added
 	// This is used for creating aggregator records (must match PendingLeaves)
 	PendingCommitments []*models.CertificationRequest
+	// PreCollected is true when this round starts from a precollector snapshot.
+	// These rounds skip the fixed collect window.
+	PreCollected bool
 	// Timing metrics for this round
 	ProcessingTime     time.Duration
 	ProposalTime       time.Time     // When block was proposed to BFT
@@ -105,6 +108,10 @@ type RoundManager struct {
 	finalizationMu sync.RWMutex
 	wg             sync.WaitGroup
 
+	proofCacheMu          sync.RWMutex
+	proofPending          map[string]struct{}
+	latestProofReadyBlock *models.Block
+
 	// Round duration (configurable, default 1 second)
 	roundDuration time.Duration
 
@@ -113,6 +120,7 @@ type RoundManager struct {
 	streamMutex      sync.RWMutex
 	lastFetchedID    string             // Cursor for MongoDB pagination
 	prefetchCancel   context.CancelFunc // Cancel function for running streamer/prefetcher
+	prefetchDone     chan struct{}
 
 	// Adaptive throughput tracking
 	avgProcessingRate float64 // commitments per millisecond
@@ -126,6 +134,7 @@ type RoundManager struct {
 	// Both fields are protected by roundMutex.
 	precollector         *childPrecollector
 	precollectorDisabled bool
+	precollectorDone     chan struct{}
 
 	// Child mode tracks the newest parent UC already accepted for finalization.
 	// This prevents empty rounds from immediately reusing an older parent proof.
@@ -180,9 +189,10 @@ func NewRoundManager(
 		eventBus:            eventBus,
 		roundDuration:       cfg.Processing.RoundDuration,                                        // Configurable round duration (default 1s)
 		commitmentStream:    make(chan *models.CertificationRequest, commitmentStreamBufferSize), // Buffer for queue streamer
-		avgProcessingRate:   1.0,                                                                 // Initial estimate: 1 commitment per ms
-		avgFinalizationTime: 200 * time.Millisecond,                                              // Initial estimate (conservative)
-		avgSMTUpdateTime:    5 * time.Millisecond,                                                // Initial estimate per batch
+		proofPending:        make(map[string]struct{}),
+		avgProcessingRate:   1.0,                    // Initial estimate: 1 commitment per ms
+		avgFinalizationTime: 200 * time.Millisecond, // Initial estimate (conservative)
+		avgSMTUpdateTime:    5 * time.Millisecond,   // Initial estimate per batch
 	}
 
 	if rm.storage != nil && rm.storage.SmtStorage() != nil {
@@ -328,6 +338,46 @@ func (rm *RoundManager) FinalizationReadLock() func() {
 	return func() { rm.finalizationMu.RUnlock() }
 }
 
+// GetKnownNotReadyBlock returns the latest proof-ready block and true while stateID is still marked proof-pending; otherwise, or before any block was marked ready, (nil, false).
+func (rm *RoundManager) GetKnownNotReadyBlock(stateID api.StateID) (*models.Block, bool) {
+	rm.proofCacheMu.RLock()
+	defer rm.proofCacheMu.RUnlock()
+	if _, ok := rm.proofPending[stateID.String()]; !ok || rm.latestProofReadyBlock == nil {
+		return nil, false
+	}
+	return rm.latestProofReadyBlock, true
+}
+
+// markProofsPending marks the state ID of every non-nil commitment as proof-pending; an empty slice returns without taking the lock.
+func (rm *RoundManager) markProofsPending(commitments []*models.CertificationRequest) {
+	if len(commitments) == 0 {
+		return
+	}
+	rm.proofCacheMu.Lock()
+	defer rm.proofCacheMu.Unlock()
+	for _, commitment := range commitments {
+		if commitment != nil {
+			rm.proofPending[commitment.StateID.String()] = struct{}{}
+		}
+	}
+}
+
+// markProofsReady records block as the latest proof-ready block and removes the proof-pending marker of every ID in stateIDs.
+func (rm *RoundManager) markProofsReady(block *models.Block, stateIDs []api.StateID) {
+	rm.proofCacheMu.Lock()
+	defer rm.proofCacheMu.Unlock()
+	rm.latestProofReadyBlock = block
+	for _, stateID := range stateIDs {
+		delete(rm.proofPending, stateID.String())
+	}
+}
+
+func (rm *RoundManager) clearProofPending() {
+	rm.proofCacheMu.Lock()
+	defer rm.proofCacheMu.Unlock()
+	rm.proofPending = make(map[string]struct{}) // swap in an empty map: every pending marker is forgotten
+}
+
 // GetStats returns round manager statistics
 func (rm *RoundManager) GetStats() map[string]interface{} {
 	rm.roundMutex.RLock()
@@ -354,12 +404,13 @@ func (rm *RoundManager) GetStats() map[string]interface{} {
 
 // StartNewRound starts a new round for processing commitments (delegates to unified function)
 func (rm *RoundManager) StartNewRound(ctx context.Context, roundNumber *api.BigInt) error {
+	rm.discardActivePrecollector(ctx) // stop any active precollector first; no-op when active precollection is off
 	return rm.StartNewRoundWithSnapshot(ctx, roundNumber, nil, nil, nil)
 }
 
 // StartNewRoundWithSnapshot starts a new round, optionally with pre-collected data.
-// If snapshot/commitments are nil (first round), an empty snapshot is created.
-// All rounds are processed the same way - no separate collect phase.
+// If snapshot is nil, a fresh snapshot is taken from the SMT and, outside child
+// mode, the fixed collect path is used.
 // ErrDeactivated is returned by StartNewRoundWithSnapshot when the round manager
 // has been deactivated. The check and the decision not to start are atomic under roundMutex.
 var ErrDeactivated = fmt.Errorf("round manager deactivated")
@@ -379,7 +430,7 @@ func (rm *RoundManager) StartNewRoundWithSnapshot(
 
 	if rm.precollectorDisabled {
 		rm.roundMutex.Unlock()
-		rm.logger.WithContext(ctx).Info("Skipping round start — deactivated",
+		rm.logger.WithContext(ctx).Info("Skipping round start - deactivated",
 			"roundNumber", roundNumber.String())
 		return ErrDeactivated
 	}
@@ -391,6 +442,7 @@ func (rm *RoundManager) StartNewRoundWithSnapshot(
 			"previousRoundAge", time.Since(rm.currentRound.StartTime).String())
 	}
 
+	preCollected := snapshot != nil
 	if snapshot == nil {
 		snapshot = rm.smt.CreateSnapshot()
 	}
@@ -409,6 +461,7 @@ func (rm *RoundManager) StartNewRoundWithSnapshot(
 		Snapshot:           snapshot,
 		PendingLeaves:      leaves,
 		PendingCommitments: commitments, // In child mode, commitments are already filtered by pre-collection
+		PreCollected:       preCollected,
 	}
 
 	// Start precollector on the first child-mode round so it begins collecting
@@ -419,6 +472,7 @@ func (rm *RoundManager) StartNewRoundWithSnapshot(
 			rm.commitmentQueue,
 			rm.logger,
 			rm.config.Processing.MaxCommitmentsPerRound,
+			rm.markProofsPending,
 		)
 		rm.precollector = cp
 		cp.Start(ctx, snapshot)
@@ -459,9 +513,10 @@ func (rm *RoundManager) processRound(ctx context.Context) error {
 		return fmt.Errorf("no current round to process")
 	}
 	roundNumber := rm.currentRound.Number
+	preCollected := rm.currentRound.PreCollected
 	rm.roundMutex.Unlock()
 
-	if !rm.config.Sharding.Mode.IsChild() {
+	if !preCollected && !rm.config.Sharding.Mode.IsChild() {
 		collectDuration := rm.config.Processing.CollectPhaseDuration
 		if collectDuration <= 0 {
 			collectDuration = 200 * time.Millisecond
@@ -493,6 +548,9 @@ func (rm *RoundManager) processRound(ctx context.Context) error {
 		}
 		rm.roundMutex.Unlock()
 	}
+
+	rm.startActivePrecollectorIfNeeded(ctx)
+
 	rm.roundMutex.Lock()
 	commitmentCount := len(rm.currentRound.Commitments)
 	var rootHash api.HexBytes
@@ -749,6 +807,9 @@ func (rm *RoundManager) Activate(ctx context.Context) error {
 
 	rm.roundMutex.Lock()
 	rm.precollectorDisabled = false
+	if rm.precollectorDone == nil {
+		rm.precollectorDone = make(chan struct{})
+	}
 	rm.roundMutex.Unlock()
 
 	if rm.config.HA.Enabled {
@@ -862,6 +923,10 @@ func (rm *RoundManager) Deactivate(ctx context.Context) error {
 	var cp *childPrecollector
 	rm.roundMutex.Lock()
 	rm.precollectorDisabled = true
+	if rm.precollectorDone != nil {
+		close(rm.precollectorDone)
+		rm.precollectorDone = nil
+	}
 	cp = rm.precollector
 	rm.precollector = nil
 	rm.roundMutex.Unlock()
@@ -869,6 +934,7 @@ func (rm *RoundManager) Deactivate(ctx context.Context) error {
 	if cp != nil {
 		cp.Stop()
 	}
+	rm.clearProofPending()
 	rm.stopCommitmentPrefetcher()
 	if rm.bftClient != nil {
 		rm.bftClient.Stop()
@@ -877,6 +943,175 @@ func (rm *RoundManager) Deactivate(ctx context.Context) error {
 	return nil
 }
 
+// usesActivePrecollector reports whether next-round precollection is enabled for this
+// manager. All of the following must hold: a positive PrecollectorGracePeriod,
+// Redis-backed commitment storage, and standalone or bft-shard sharding mode.
+func (rm *RoundManager) usesActivePrecollector() bool {
+	gracePeriod := rm.config.Processing.PrecollectorGracePeriod
+	if gracePeriod <= 0 || !rm.config.Storage.UseRedisForCommitments {
+		return false
+	}
+
+	switch rm.config.Sharding.Mode {
+	case config.ShardingModeStandalone, config.ShardingModeBFTShard:
+		return true
+	}
+	return false
+}
+
+// discardActivePrecollector detaches the active precollector and, if there was one, stops it and clears
+// the proof-pending markers. It does nothing when active precollection is not in use.
+func (rm *RoundManager) discardActivePrecollector(ctx context.Context) {
+	if !rm.usesActivePrecollector() {
+		return
+	}
+	var cp *childPrecollector
+	rm.roundMutex.Lock()
+	cp = rm.precollector
+	rm.precollector = nil
+	rm.roundMutex.Unlock()
+	if cp != nil {
+		cp.Stop()
+		rm.clearProofPending()
+		if cs, ok := rm.commitmentQueue.(*redis.CommitmentStorage); ok {
+			restartPrefetcher := rm.stopCommitmentPrefetcherAndWait()
+			rm.drainCommitmentStream() // discard whatever is still buffered in commitmentStream
+			cs.ResetPendingSweep()
+			if restartPrefetcher {
+				rm.startCommitmentPrefetcher(ctx)
+			}
+		}
+	}
+}
+
+// drainCommitmentStream discards everything currently buffered in commitmentStream without
+// blocking and returns how many items were dropped. It stops early if the channel is closed.
+func (rm *RoundManager) drainCommitmentStream() int {
+	for dropped := 0; ; dropped++ {
+		select {
+		case _, open := <-rm.commitmentStream:
+			if !open {
+				return dropped
+			}
+		default:
+			return dropped
+		}
+	}
+}
+
+// startActivePrecollectorIfNeeded starts the next-round precollector for
+// standalone/bft-shard only after processRound has finished the fixed collect
+// window, so it is the sole reader of commitmentStream while BFT is pending.
+func (rm *RoundManager) startActivePrecollectorIfNeeded(ctx context.Context) {
+	if !rm.usesActivePrecollector() {
+		return
+	}
+	// Nothing to do if a precollector already exists, the manager is deactivated, or there is no round snapshot.
+	rm.roundMutex.Lock()
+	if rm.precollector != nil ||
+		rm.precollectorDisabled ||
+		rm.currentRound == nil ||
+		rm.currentRound.Snapshot == nil {
+		rm.roundMutex.Unlock()
+		return
+	}
+	// The new precollector is started from the current round's snapshot.
+	snapshot := rm.currentRound.Snapshot
+	cp := newChildPrecollector(
+		rm.commitmentStream,
+		rm.commitmentQueue,
+		rm.logger,
+		rm.config.Processing.MaxCommitmentsPerRound,
+		rm.markProofsPending,
+	)
+	rm.precollector = cp
+	rm.roundMutex.Unlock()
+	// NOTE(review): Start runs after roundMutex is released, so Deactivate could Stop this precollector first; confirm that is safe.
+	cp.Start(ctx, snapshot)
+}
+
+// StartNextRoundFromPrecollector starts the next standalone/bft-shard round from the
+// active precollector snapshot, falling back to the fixed collect path when precollection
+// is off or no precollector exists. A deactivated manager starts nothing and returns nil.
+func (rm *RoundManager) StartNextRoundFromPrecollector(ctx context.Context, roundNumber *api.BigInt) error {
+	if !rm.usesActivePrecollector() {
+		return rm.StartNewRound(ctx, roundNumber)
+	}
+
+	rm.roundMutex.RLock()
+	cpExists := rm.precollector != nil
+	precollectorDone := rm.precollectorDone
+	precollectorDisabled := rm.precollectorDisabled
+	rm.roundMutex.RUnlock()
+
+	if precollectorDisabled {
+		return nil
+	}
+	// Wait out the grace period before the precollected round is cut; deactivation during the wait is not an error.
+	if cpExists {
+		if err := rm.waitBeforePrecollectorHandoff(ctx, precollectorDone); err != nil {
+			if errors.Is(err, ErrDeactivated) {
+				return nil
+			}
+			return err
+		}
+	}
+	// Re-read under the lock: the manager may have been deactivated or the precollector discarded meanwhile.
+	rm.roundMutex.RLock()
+	cp := rm.precollector
+	precollectorDisabled = rm.precollectorDisabled
+	rm.roundMutex.RUnlock()
+
+	if precollectorDisabled {
+		return nil
+	}
+	if cp == nil {
+		return rm.StartNewRound(ctx, roundNumber)
+	}
+
+	preResult, err := cp.AdvanceRound()
+	if err != nil {
+		rm.roundMutex.RLock()
+		precollectorDisabled = rm.precollectorDisabled
+		rm.roundMutex.RUnlock()
+		if precollectorDisabled {
+			return nil
+		}
+
+		rm.logger.WithContext(ctx).Warn("Failed to advance precollector, falling back to fixed collect round",
+			"error", err.Error())
+		return rm.StartNewRound(ctx, roundNumber)
+	}
+
+	preResult.snapshot.SetCommitTarget(rm.smt)
+	if err := rm.StartNewRoundWithSnapshot(ctx, roundNumber, preResult.snapshot, preResult.commitments, preResult.leaves); err != nil {
+		if errors.Is(err, ErrDeactivated) {
+			return nil
+		}
+		return err
+	}
+	return nil
+}
+
+// waitBeforePrecollectorHandoff blocks for PrecollectorGracePeriod and returns nil, or returns early with ctx.Err()
+// if ctx is done or ErrDeactivated if done becomes readable. A non-positive period returns nil at once.
+func (rm *RoundManager) waitBeforePrecollectorHandoff(ctx context.Context, done <-chan struct{}) error {
+	grace := rm.config.Processing.PrecollectorGracePeriod
+	if grace <= 0 {
+		return nil
+	}
+	timer := time.NewTimer(grace)
+	defer timer.Stop()
+	select {
+	case <-timer.C:
+		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-done:
+		return ErrDeactivated
+	}
+}
+
 func (rm *RoundManager) startCommitmentPrefetcher(ctx context.Context) {
 	rm.streamMutex.Lock()
 	defer rm.streamMutex.Unlock()
@@ -887,7 +1122,9 @@ func (rm *RoundManager) startCommitmentPrefetcher(ctx context.Context) {
 	}
 
 	prefetcherCtx, cancel := context.WithCancel(ctx)
+	done := make(chan struct{})
 	rm.prefetchCancel = cancel
+	rm.prefetchDone = done
 
 	if rm.config.Storage.UseRedisForCommitments {
 		if cs, ok := rm.commitmentQueue.(*redis.CommitmentStorage); ok {
@@ -895,25 +1132,39 @@ func (rm *RoundManager) startCommitmentPrefetcher(ctx context.Context) {
 		}
 		rm.logger.WithContext(ctx).Info("Starting Redis commitment streamer")
 		rm.wg.Go(func() {
+			defer close(done)
 			rm.redisCommitmentStreamer(prefetcherCtx)
 		})
 	} else {
 		rm.logger.WithContext(ctx).Info("Starting MongoDB commitment prefetcher")
 		rm.lastFetchedID = ""
 		rm.wg.Go(func() {
+			defer close(done)
 			rm.commitmentPrefetcher(prefetcherCtx)
 		})
 	}
 }
 
 func (rm *RoundManager) stopCommitmentPrefetcher() {
-	rm.streamMutex.Lock()
-	defer rm.streamMutex.Unlock()
+	rm.stopCommitmentPrefetcherAndWait() // bool result (whether a prefetcher was running) is intentionally discarded
+}
 
+func (rm *RoundManager) stopCommitmentPrefetcherAndWait() bool { // cancels the prefetcher and blocks until its done channel closes; returns false if none was running
+	rm.streamMutex.Lock()
 	if rm.prefetchCancel == nil {
+		rm.streamMutex.Unlock()
 		rm.logger.Warn("stopCommitmentPrefetcher called but no prefetcher running")
-		return
+		return false
 	}
-	rm.prefetchCancel()
+	cancel := rm.prefetchCancel // copy both handles to locals so they stay usable after the fields are cleared
+	done := rm.prefetchDone
 	rm.prefetchCancel = nil
+	rm.prefetchDone = nil
+	rm.streamMutex.Unlock() // release before blocking so streamMutex is not held during the wait below
+
+	cancel()
+	if done != nil {
+		<-done // startCommitmentPrefetcher's goroutines close this channel via defer when they return
+	}
+	return true
 }
diff --git a/internal/service/service.go b/internal/service/service.go
index f5912e7..5152c29 100644
--- a/internal/service/service.go
+++ b/internal/service/service.go
@@ -181,16 +181,18 @@ func (as *AggregatorService) CertificationRequest(ctx context.Context, req *api.
 		}, nil
 	}
 
-	// Check if certificationRequest already processed
-	existingRecord, err := as.storage.AggregatorRecordStorage().GetByStateID(ctx, req.StateID)
-	if err != nil {
-		return nil, fmt.Errorf("failed to check existing aggregator record: %w", err)
-	}
+	if !as.config.Processing.SkipDuplicateCheck {
+		// Check if certificationRequest already processed
+		existingRecord, err := as.storage.AggregatorRecordStorage().GetByStateID(ctx, req.StateID)
+		if err != nil {
+			return nil, fmt.Errorf("failed to check existing aggregator record: %w", err)
+		}
 
-	if existingRecord != nil {
-		return &api.CertificationResponse{
-			Status: "STATE_ID_EXISTS",
-		}, nil
+		if existingRecord != nil {
+			return &api.CertificationResponse{
+				Status: "STATE_ID_EXISTS",
+			}, nil
+		}
 	}
 
 	// Store certificationRequest
@@ -233,6 +235,14 @@ func (as *AggregatorService) GetInclusionProofV2(ctx context.Context, req *api.G
 		return nil, fmt.Errorf("unexpected SMT key length: got %d bits, want %d", keyLen, api.StateTreeKeyLengthBits)
 	}
 
+	if block, ok := as.roundManager.GetKnownNotReadyBlock(req.StateID); ok {
+		responseBlockNumber, err := proofBundleBlockNumber(as.config.Sharding.Mode, block)
+		if err != nil {
+			return nil, err
+		}
+		return emptyInclusionProofResponse(responseBlockNumber, block), nil
+	}
+
 	// Bind the UC via the block whose stored rootHash matches the current
 	// raw 32-byte SMT root (which also lives in UC.IR.h).
 	rootHashRaw := api.HexBytes(smtInstance.GetRootHashRaw())
@@ -255,14 +265,7 @@ func (as *AggregatorService) GetInclusionProofV2(ctx context.Context, req *api.G
 	if record == nil || record.BlockNumber.Cmp(block.Index.Int) > 0 {
 		// Non-inclusion is not implemented yet. Return an empty v2 proof
 		// payload so verifiers short-circuit with ErrExclusionNotImpl.
-		return &api.GetInclusionProofResponseV2{
-			BlockNumber: responseBlockNumber,
-			InclusionProof: &api.InclusionProofV2{
-				CertificationData:  nil,
-				CertificateBytes:   nil,
-				UnicityCertificate: types.RawCBOR(block.UnicityCertificate),
-			},
-		}, nil
+		return emptyInclusionProofResponse(responseBlockNumber, block), nil
 	}
 	if record.Version != 2 {
 		return nil, fmt.Errorf("invalid aggregator record version got %d expected 2", record.Version)
@@ -302,6 +305,17 @@ func (as *AggregatorService) GetInclusionProofV2(ctx context.Context, req *api.G
 	}, nil
 }
 
+func emptyInclusionProofResponse(blockNumber uint64, block *models.Block) *api.GetInclusionProofResponseV2 { // builds a v2 response carrying only the block's unicity certificate; block is dereferenced, so it must be non-nil
+	return &api.GetInclusionProofResponseV2{
+		BlockNumber: blockNumber,
+		InclusionProof: &api.InclusionProofV2{
+			CertificationData:  nil, // proof fields stay nil: this is the empty payload used where no inclusion proof is returned
+			CertificateBytes:   nil,
+			UnicityCertificate: types.RawCBOR(block.UnicityCertificate),
+		},
+	}
+}
+
 func proofBundleBlockNumber(mode config.ShardingMode, block *models.Block) (uint64, error) {
 	if block == nil {
 		return 0, fmt.Errorf("missing block for proof bundle")
diff --git a/internal/service/service_test.go b/internal/service/service_test.go
index f54cce6..b28470b 100644
--- a/internal/service/service_test.go
+++ b/internal/service/service_test.go
@@ -612,6 +612,9 @@ func (s *stubRoundManager) Deactivate(context.Context) error {
 func (s *stubRoundManager) GetSMT() *smt.ThreadSafeSMT              { return s.smt }
 func (s *stubRoundManager) CheckParentHealth(context.Context) error { return nil }
 func (s *stubRoundManager) FinalizationReadLock() func()            { return func() {} }
+func (s *stubRoundManager) GetKnownNotReadyBlock(api.StateID) (*models.Block, bool) {
+	return nil, false
+}
 
 func newAggregatorServiceForTest(t *testing.T, shardingCfg config.ShardingConfig, baseTree *smt.SparseMerkleTree) *AggregatorService {
 	t.Helper()
diff --git a/internal/storage/redis/commitment.go b/internal/storage/redis/commitment.go
index 31c908a..7a392ad 100644
--- a/internal/storage/redis/commitment.go
+++ b/internal/storage/redis/commitment.go
@@ -94,9 +94,8 @@ func NewCommitmentStorage(client *redis.Client, streamName string, serverID stri
 	return cs
 }
 
-// ResetPendingSweep re-enables the pending-entry sweep for the next
-// StreamCertificationRequests invocation. Call before (re)starting the
-// streamer so a restarted consumer reclaims entries left unacked.
+// ResetPendingSweep re-enables the pending-entry sweep. A running streamer will
+// notice this before its next new-message read and reclaim entries left unacked.
 func (cs *CommitmentStorage) ResetPendingSweep() {
 	cs.pendingExhausted.Store(false)
 }
@@ -715,12 +714,6 @@ func (cs *CommitmentStorage) StreamCertificationRequests(ctx context.Context, co
 	windowStart := time.Now()
 	countThisWindow := 0
 
-	if !cs.pendingExhausted.Load() {
-		if err := cs.drainPendingForConsumer(ctx, commitmentChan); err != nil {
-			return err
-		}
-	}
-
 	for {
 		select {
 		case <-ctx.Done():
@@ -728,6 +721,13 @@ func (cs *CommitmentStorage) StreamCertificationRequests(ctx context.Context, co
 		case <-cs.stopChan:
 			return nil
 		default:
+			if !cs.pendingExhausted.Load() {
+				if err := cs.drainPendingForConsumer(ctx, commitmentChan); err != nil {
+					return err
+				}
+				continue
+			}
+
 			streams := cs.client.XReadGroup(ctx, &redis.XReadGroupArgs{
 				Group:    consumerGroup,
 				Consumer: cs.consumerID,
diff --git a/internal/storage/redis/commitment_integration_test.go b/internal/storage/redis/commitment_integration_test.go
index f9e54fb..f0844d1 100644
--- a/internal/storage/redis/commitment_integration_test.go
+++ b/internal/storage/redis/commitment_integration_test.go
@@ -952,6 +952,130 @@ Collect2:
 	}
 }
 
+// TestPendingExhausted_ResetWhileStreamerRunning: after a reset, the already
+// running streamer reclaims previously-unacked entries without being restarted.
+func (suite *RedisTestSuite) TestPendingExhausted_ResetWhileStreamerRunning() {
+	ctx := suite.ctx
+	t := suite.T()
+
+	commitments := []*models.CertificationRequest{
+		createTestCommitment(),
+		createTestCommitment(),
+		createTestCommitment(),
+	}
+	require.NoError(t, suite.storage.StoreBatch(ctx, commitments))
+
+	streamCtx, cancel := context.WithCancel(ctx)
+	defer cancel() // overlaps with the cleanup defer below; calling cancel twice is harmless
+	ch := make(chan *models.CertificationRequest, 10) // capacity 10 exceeds the 3 commitments stored above
+	done := make(chan struct{})
+	go func() {
+		_ = suite.storage.StreamCertificationRequests(streamCtx, ch) // return value ignored: the test only inspects what arrives on ch
+		close(done)
+	}()
+	defer func() { // stop the streamer and wait for its goroutine to exit before the test returns
+		cancel()
+		<-done
+	}()
+
+	firstDelivery := make(map[string]struct{}, len(commitments)) // set of state IDs seen on the first pass
+	firstDeadline := time.After(2 * time.Second)
+CollectFirst:
+	for len(firstDelivery) < len(commitments) {
+		select {
+		case c := <-ch:
+			firstDelivery[c.StateID.String()] = struct{}{}
+		case <-firstDeadline:
+			break CollectFirst
+		}
+	}
+	require.Len(t, firstDelivery, len(commitments), "streamer must deliver all stored commitments first")
+
+	suite.storage.ResetPendingSweep() // the test never acks anything, so the first-pass entries are expected to be replayed
+
+	redelivered := make(map[string]struct{}, len(commitments))
+	secondDeadline := time.After(2 * time.Second)
+CollectSecond:
+	for len(redelivered) < len(commitments) {
+		select {
+		case c := <-ch:
+			redelivered[c.StateID.String()] = struct{}{}
+		case <-secondDeadline:
+			break CollectSecond
+		}
+	}
+
+	require.Len(t, redelivered, len(commitments),
+		"running streamer must redeliver all unacked PEL entries after reset")
+	for id := range firstDelivery { // every ID from the first pass must reappear in the replay
+		_, found := redelivered[id]
+		assert.True(t, found, "pending commitment not redelivered after live reset (stateID=%s)", id)
+	}
+}
+
+// TestPendingExhausted_ResetAfterDiscardingLocalBuffer covers the precollector
+// discard path: entries already delivered into the local channel are stale when
+// Redis PEL is about to become the replay source again.
+func (suite *RedisTestSuite) TestPendingExhausted_ResetAfterDiscardingLocalBuffer() {
+	ctx := suite.ctx
+	t := suite.T()
+
+	commitments := []*models.CertificationRequest{
+		createTestCommitment(),
+		createTestCommitment(),
+		createTestCommitment(),
+	}
+	require.NoError(t, suite.storage.StoreBatch(ctx, commitments))
+
+	streamCtx, cancel := context.WithCancel(ctx)
+	defer cancel() // overlaps with the cleanup defer below; calling cancel twice is harmless
+	ch := make(chan *models.CertificationRequest, len(commitments)) // buffer holds exactly the stored commitments
+	done := make(chan struct{})
+	go func() {
+		_ = suite.storage.StreamCertificationRequests(streamCtx, ch) // return value ignored: the test only inspects what arrives on ch
+		close(done)
+	}()
+	defer func() { // stop the streamer and wait for its goroutine to exit before the test returns
+		cancel()
+		<-done
+	}()
+
+	require.Eventually(t, func() bool { // wait until every stored entry is sitting unread in the channel buffer
+		return len(ch) == len(commitments)
+	}, 2*time.Second, 25*time.Millisecond, "streamer should fill the local channel")
+
+	discarded := make(map[string]struct{}, len(commitments))
+	for len(discarded) < len(commitments) {
+		select {
+		case c := <-ch:
+			discarded[c.StateID.String()] = struct{}{}
+		default: // non-blocking receive: the buffer was observed full above, so an empty channel here is a failure
+			t.Fatalf("local buffer drained early: got %d/%d", len(discarded), len(commitments))
+		}
+	}
+	require.Len(t, discarded, len(commitments))
+
+	suite.storage.ResetPendingSweep()
+
+	redelivered := make(map[string]int, len(commitments)) // per-ID delivery count, used to detect duplicates
+	deadline := time.After(2 * time.Second)
+CollectReplay:
+	for len(redelivered) < len(commitments) { // stops as soon as every ID has been seen at least once
+		select {
+		case c := <-ch:
+			redelivered[c.StateID.String()]++
+		case <-deadline:
+			break CollectReplay
+		}
+	}
+
+	require.Len(t, redelivered, len(commitments),
+		"running streamer must replay all discarded local-buffer entries from PEL")
+	for id := range discarded {
+		assert.Equal(t, 1, redelivered[id], "stateID=%s should be redelivered exactly once", id) // NOTE(review): only catches duplicates received before the collect loop exits
+	}
+}
+
 // TestPendingSweep_DeliversEachEntryExactlyOnce: the pending sweep delivers
 // each PEL entry to commitmentChan exactly once, regardless of ack timing.
 func (suite *RedisTestSuite) TestPendingSweep_DeliversEachEntryExactlyOnce() {

From 048f4d7d18892d189161997e5c1da25424c656ff Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 12:24:58 +0300
Subject: [PATCH 06/13] update compose files

---
 Makefile                 |  16 ++++--
 bft-sharding-compose.yml | 103 ++++++++++++++++++++++++++++++++-------
 docker-compose.yml       |   2 +-
 ha-compose.yml           |   6 ++-
 sharding-compose.yml     |   2 +-
 sharding-ha-compose.yml  |  10 +++-
 6 files changed, 113 insertions(+), 26 deletions(-)

diff --git a/Makefile b/Makefile
index befb4d8..e766b2f 100644
--- a/Makefile
+++ b/Makefile
@@ -165,18 +165,24 @@ docker-run-bft-sh-clean:
 	@echo "Rebuilding fixed 2-shard BFT services with clean state as current user..."
 	@docker compose -f bft-sharding-compose.yml down
 	@rm -rf ./data/bft-sharding ./logs/bft-shard0 ./logs/bft-shard1
-	@mkdir -p ./data/bft-sharding/genesis ./data/bft-sharding/genesis-root ./data/bft-sharding/mongodb_data ./data/bft-sharding/redis_data && chmod -R 777 ./data/bft-sharding
+	@mkdir -p ./data/bft-sharding/genesis ./data/bft-sharding/genesis-root ./data/bft-sharding/mongodb_shard0_data ./data/bft-sharding/mongodb_shard1_data ./data/bft-sharding/redis_data
+	@rm -rf ./data/bft-sharding/genesis/root
+	@chmod -R 777 ./data/bft-sharding
 	@mkdir -p ./logs/bft-shard0 ./logs/bft-shard1 && chmod -R 777 ./logs/bft-shard0 ./logs/bft-shard1
-	@USER_UID=$$(id -u) USER_GID=$$(id -g) LOG_LEVEL=debug docker compose -f bft-sharding-compose.yml up --force-recreate -d --build
+	@docker network inspect aggregator-go_default >/dev/null 2>&1 || docker network create aggregator-go_default >/dev/null
+	@USER_UID=$$(id -u) USER_GID=$$(id -g) LOG_LEVEL=$${LOG_LEVEL:-info} docker compose -f bft-sharding-compose.yml up --force-recreate -d --build
 	@echo "Fixed 2-shard BFT services rebuilt with user UID=$$(id -u):$$(id -g)"
 
 docker-run-bft-sh-clean-keep-tb:
 	@echo "Rebuilding fixed 2-shard BFT services with clean DB/Redis state but preserving BFT genesis as current user..."
 	@docker compose -f bft-sharding-compose.yml down
-	@rm -rf ./data/bft-sharding/mongodb_data ./data/bft-sharding/redis_data
-	@mkdir -p ./data/bft-sharding/genesis ./data/bft-sharding/genesis-root ./data/bft-sharding/mongodb_data ./data/bft-sharding/redis_data && chmod -R 777 ./data/bft-sharding
+	@rm -rf ./data/bft-sharding/mongodb_data ./data/bft-sharding/mongodb_shard0_data ./data/bft-sharding/mongodb_shard1_data ./data/bft-sharding/redis_data
+	@mkdir -p ./data/bft-sharding/genesis ./data/bft-sharding/genesis-root ./data/bft-sharding/mongodb_shard0_data ./data/bft-sharding/mongodb_shard1_data ./data/bft-sharding/redis_data
+	@rm -rf ./data/bft-sharding/genesis/root
+	@chmod -R 777 ./data/bft-sharding
 	@mkdir -p ./logs/bft-shard0 ./logs/bft-shard1 && chmod -R 777 ./logs/bft-shard0 ./logs/bft-shard1
-	@USER_UID=$$(id -u) USER_GID=$$(id -g) LOG_LEVEL=debug docker compose -f bft-sharding-compose.yml up --force-recreate -d --build
+	@docker network inspect aggregator-go_default >/dev/null 2>&1 || docker network create aggregator-go_default >/dev/null
+	@USER_UID=$$(id -u) USER_GID=$$(id -g) LOG_LEVEL=$${LOG_LEVEL:-info} docker compose -f bft-sharding-compose.yml up --force-recreate -d --build
 	@echo "Fixed 2-shard BFT services rebuilt with user UID=$$(id -u):$$(id -g)"
 
 docker-restart-sh-ha:
diff --git a/bft-sharding-compose.yml b/bft-sharding-compose.yml
index 184c190..2658fa3 100644
--- a/bft-sharding-compose.yml
+++ b/bft-sharding-compose.yml
@@ -1,3 +1,35 @@
+# Example 4k/s perf run against this fixed 2-shard compose stack from the repo root:
+#   LOG_LEVEL=info make docker-run-bft-sh-clean
+#
+#   SHARDING_MODE=bft-shard \
+#   SHARD_TARGETS="http://localhost:3001:0,http://localhost:3002:1" \
+#   AGGREGATOR_LOG_PATHS="shard0=logs/bft-shard0/aggregator.log,shard1=logs/bft-shard1/aggregator.log" \
+#   REQUESTS_PER_SEC=4000 \
+#   TEST_DURATION=30s \
+#   SUBMISSION_WORKERS=300 \
+#   PROOF_WORKERS=300 \
+#   HTTP_CLIENT_POOL_SIZE=48 \
+#   PROOF_INITIAL_DELAY=2s \
+#   PROOF_RETRY_DELAY=1s \
+#   go run ./cmd/performance-test
+#
+# Example with faster BFT root cadence for lower proof latency, 1k/s single active shard:
+#
+#   LOG_LEVEL=info ROOT_BLOCK_RATE=400 PRECOLLECTOR_GRACE_PERIOD=100ms \
+#     make docker-run-bft-sh-clean
+#
+#   SHARDING_MODE=bft-shard \
+#   SHARD_TARGETS="http://localhost:3001:0" \
+#   AGGREGATOR_LOG_PATHS="shard0=logs/bft-shard0/aggregator.log" \
+#   REQUESTS_PER_SEC=1000 \
+#   TEST_DURATION=30s \
+#   SUBMISSION_WORKERS=300 \
+#   PROOF_WORKERS=300 \
+#   HTTP_CLIENT_POOL_SIZE=48 \
+#   PROOF_INITIAL_DELAY=1200ms \
+#   PROOF_RETRY_DELAY=1s \
+#   go run ./cmd/performance-test
+
 x-bft-base: &bft-base
   platform: linux/amd64
   user: "${USER_UID:-1001}:${USER_GID:-1001}"
@@ -12,34 +44,33 @@ services:
     ports:
       - "8002:8002"
     volumes:
-      - ./data/bft-sharding/genesis-root:/genesis/root
+      - ./data/bft-sharding/genesis-root:/genesis-root
       - ./data/bft-sharding/genesis:/genesis
     healthcheck:
-      test: ["CMD", "nc", "-zv", "localhost", "8000"]
+      test: ["CMD", "nc", "-zv", "localhost", "8002"]
       interval: 5s
       timeout: 3s
       retries: 12
     entrypoint: ["/busybox/sh", "-c"]
     command:
       - |
-        if [ -f /genesis/root/node-info.json ] && [ -f /genesis/trust-base.json ] && [ -f /genesis/root/trust-base-signed.json ]; then
+        if [ -f /genesis-root/node-info.json ] && [ -f /genesis/trust-base.json ] && [ -f /genesis-root/trust-base-signed.json ]; then
           echo "Genesis files already exist, skipping initialization."
         else
           echo "Creating root genesis..." &&
-          ubft root-node init --home /genesis/root -g &&
+          ubft root-node init --home /genesis-root -g &&
           echo "Creating root trust base..." &&
-          ubft trust-base generate --home /genesis --network-id 3 --node-info /genesis/root/node-info.json &&
+          ubft trust-base generate --home /genesis --network-id 3 --node-info /genesis-root/node-info.json &&
           echo "Signing root trust base..." &&
-          ubft trust-base sign --home /genesis/root --trust-base /genesis/trust-base.json
+          ubft trust-base sign --home /genesis-root --trust-base /genesis/trust-base.json
         fi
         echo "Starting root node..." &&
-        exec ubft root-node run --home /genesis/root --address "/ip4/0.0.0.0/tcp/8000" --trust-base /genesis/trust-base.json --rpc-server-address "0.0.0.0:8002"
+        exec ubft root-node run --home /genesis-root --address "/ip4/0.0.0.0/tcp/8000" --trust-base /genesis/trust-base.json --rpc-server-address "0.0.0.0:8002" --block-rate "${ROOT_BLOCK_RATE:-900}"
 
   bft-shard-genesis-gen:
     <<: *bft-base
     container_name: aggregator-bft-shard-genesis-gen
     volumes:
-      - ./data/bft-sharding/genesis-root:/genesis/root
       - ./data/bft-sharding/genesis:/genesis
     depends_on:
       bft-root:
@@ -58,7 +89,7 @@ services:
           ubft shard-node init --home /genesis/shard_1 --generate &&
           ubft shard-conf generate --home /genesis/shard_1 --t2-timeout 5000 --network-id 3 --partition-id 7 --partition-type-id 7 --shard-id 0xC0 --epoch-start 10 --node-info=/genesis/shard_1/node-info.json
         fi
-        chmod -R 755 /genesis/root /genesis/shard_0 /genesis/shard_1
+        chmod -R 755 /genesis/shard_0 /genesis/shard_1
         chmod 644 /genesis/trust-base.json /genesis/shard_0/shard-conf-7_0.json /genesis/shard_1/shard-conf-7_0.json
         ls -l /genesis/
         ls -l /genesis/shard_0
@@ -98,18 +129,35 @@ services:
           done
         done
 
-  mongodb:
+  mongodb-shard0:
     image: mongo:7.0
-    container_name: aggregator-bft-mongodb
+    container_name: aggregator-bft-mongodb-shard0
     user: "${USER_UID:-1001}:${USER_GID:-1001}"
     restart: unless-stopped
     ports:
       - "27017:27017"
     volumes:
-      - ./data/bft-sharding/mongodb_data:/data/db
+      - ./data/bft-sharding/mongodb_shard0_data:/data/db
+    command: ["--replSet", "rs0", "--bind_ip_all", "--noauth"]
+    healthcheck:
+      test: ["CMD", "mongosh", "--eval", "try { rs.status().ok } catch(e) { rs.initiate({_id:'rs0',members:[{_id:0,host:'mongodb-shard0:27017'}]}) }"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
+      start_period: 10s
+
+  mongodb-shard1:
+    image: mongo:7.0
+    container_name: aggregator-bft-mongodb-shard1
+    user: "${USER_UID:-1001}:${USER_GID:-1001}"
+    restart: unless-stopped
+    ports:
+      - "27018:27017"
+    volumes:
+      - ./data/bft-sharding/mongodb_shard1_data:/data/db
     command: ["--replSet", "rs0", "--bind_ip_all", "--noauth"]
     healthcheck:
-      test: ["CMD", "mongosh", "--eval", "try { rs.status().ok } catch(e) { rs.initiate({_id:'rs0',members:[{_id:0,host:'mongodb:27017'}]}) }"]
+      test: ["CMD", "mongosh", "--eval", "try { rs.status().ok } catch(e) { rs.initiate({_id:'rs0',members:[{_id:0,host:'mongodb-shard1:27017'}]}) }"]
       interval: 5s
       timeout: 5s
       retries: 10
@@ -136,6 +184,7 @@ services:
       context: .
       dockerfile: Dockerfile
     container_name: aggregator-bft-shard0
+    user: "${USER_UID:-1001}:${USER_GID:-1001}"
     restart: unless-stopped
     ports:
       - "3001:3000"
@@ -145,21 +194,25 @@ services:
     depends_on:
       upload-configurations:
         condition: service_completed_successfully
-      mongodb:
+      mongodb-shard0:
         condition: service_healthy
       redis:
         condition: service_healthy
     environment: &aggregator-bft-env
       PORT: "3000"
       HOST: "0.0.0.0"
-      CONCURRENCY_LIMIT: "5000"
+      CONCURRENCY_LIMIT: "10000"
       ENABLE_DOCS: "true"
       ENABLE_CORS: "true"
 
-      MONGODB_URI: "mongodb://mongodb:27017/aggregator?replicaSet=rs0&directConnection=true"
+      MONGODB_URI: "mongodb://mongodb-shard0:27017/aggregator?replicaSet=rs0&directConnection=true"
       MONGODB_DATABASE: "aggregator_bft_shard_0"
       MONGODB_CONNECT_TIMEOUT: "10s"
       MONGODB_SERVER_SELECTION_TIMEOUT: "5s"
+      MONGODB_MAX_POOL_SIZE: "100"
+      MONGODB_MIN_POOL_SIZE: "5"
+      MONGODB_FINALIZATION_INSERT_CHUNK_SIZE: "1000"
+      MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS: "16"
 
       REDIS_HOST: "redis"
       REDIS_PORT: "6379"
@@ -172,6 +225,9 @@ services:
       REDIS_STREAM_NAME: "commitments:bft-shard:0"
       REDIS_FLUSH_INTERVAL: "50ms"
       REDIS_MAX_BATCH_SIZE: "2000"
+      REDIS_READ_BATCH_SIZE: "2000"
+      REDIS_READ_BLOCK_TIMEOUT: "100ms"
+      REDIS_ACK_BATCH_SIZE: "10000"
 
       DISABLE_HIGH_AVAILABILITY: "true"
 
@@ -181,7 +237,10 @@ services:
       LOG_FILE_PATH: "/app/logs/aggregator.log"
 
       BATCH_LIMIT: "1000"
-      MAX_COMMITMENTS_PER_ROUND: "10000"
+      MAX_COMMITMENTS_PER_ROUND: "20000"
+      PRECOLLECTOR_GRACE_PERIOD: "${PRECOLLECTOR_GRACE_PERIOD:-150ms}"
+      COMMITMENT_STREAM_BUFFER_SIZE: "50000"
+      SKIP_DUPLICATE_CHECK: "true"
 
       SHARDING_MODE: "bft-shard"
 
@@ -223,8 +282,16 @@ services:
     volumes:
       - ./data/bft-sharding/genesis:/app/bft-config
       - ./logs/bft-shard1:/app/logs
+    depends_on:
+      upload-configurations:
+        condition: service_completed_successfully
+      mongodb-shard1:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
     environment:
       <<: *aggregator-bft-env
+      MONGODB_URI: "mongodb://mongodb-shard1:27017/aggregator?replicaSet=rs0&directConnection=true"
       MONGODB_DATABASE: "aggregator_bft_shard_1"
       REDIS_STREAM_NAME: "commitments:bft-shard:1"
       SIGNING_KEY_FILE: "/app/bft-config/shard_1/keys.json"
@@ -234,3 +301,5 @@ services:
 
 networks:
   default:
+    name: aggregator-go_default
+    external: true
diff --git a/docker-compose.yml b/docker-compose.yml
index c727b24..94e0caf 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,7 +13,7 @@ services:
       - ./data/genesis-root:/genesis/root
       - ./data/genesis:/genesis
     healthcheck:
-      test: ["CMD", "nc", "-zv", "bft-root", "8000"]
+      test: ["CMD", "nc", "-zv", "bft-root", "8002"]
       interval: 5s
     networks:
       - default
diff --git a/ha-compose.yml b/ha-compose.yml
index cb55255..bbe3aa1 100644
--- a/ha-compose.yml
+++ b/ha-compose.yml
@@ -12,7 +12,7 @@ services:
       - ./data/genesis-root:/genesis/root
       - ./data/genesis:/genesis
     healthcheck:
-      test: ["CMD", "nc", "-zv", "bft-root", "8000"]
+      test: ["CMD", "nc", "-zv", "bft-root", "8002"]
       interval: 5s
     networks:
       - default
@@ -177,6 +177,10 @@ services:
     ports:
       - "3000:3000"
       - "8404:8404" # HAProxy stats page
+    ulimits:
+      nofile:
+        soft: 1048576
+        hard: 1048576
     networks:
       - default
     volumes:
diff --git a/sharding-compose.yml b/sharding-compose.yml
index b2a7e84..6f03339 100644
--- a/sharding-compose.yml
+++ b/sharding-compose.yml
@@ -13,7 +13,7 @@ services:
       - ./data/genesis-root:/genesis/root
       - ./data/genesis:/genesis
     healthcheck:
-      test: [ "CMD", "nc", "-zv", "bft-root", "8000" ]
+      test: [ "CMD", "nc", "-zv", "bft-root", "8002" ]
       interval: 5s
     networks:
       - default
diff --git a/sharding-ha-compose.yml b/sharding-ha-compose.yml
index d957c29..5e4ddf6 100644
--- a/sharding-ha-compose.yml
+++ b/sharding-ha-compose.yml
@@ -11,7 +11,7 @@ services:
       - ./data/genesis-root:/genesis/root
       - ./data/genesis:/genesis
     healthcheck:
-      test: ["CMD", "nc", "-zv", "bft-root", "8000"]
+      test: ["CMD", "nc", "-zv", "bft-root", "8002"]
       interval: 5s
     networks:
       - default
@@ -222,6 +222,10 @@ services:
     restart: unless-stopped
     ports:
       - "3001:3000" # External port 3001 maps to internal HAProxy port 3000
+    ulimits:
+      nofile:
+        soft: 1048576
+        hard: 1048576
     networks:
       - default
     volumes:
@@ -251,6 +255,10 @@ services:
     restart: unless-stopped
     ports:
       - "3002:3000" # External port 3002 maps to internal HAProxy port 3000
+    ulimits:
+      nofile:
+        soft: 1048576
+        hard: 1048576
     networks:
       - default
     volumes:

From 06632e42b07137f4502fddc4a2478dc355b1253c Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 12:27:45 +0300
Subject: [PATCH 07/13] docs: add aggregator performance results

---
 docs/aggregator-performance.md | 110 +++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 docs/aggregator-performance.md

diff --git a/docs/aggregator-performance.md b/docs/aggregator-performance.md
new file mode 100644
index 0000000..2460706
--- /dev/null
+++ b/docs/aggregator-performance.md
@@ -0,0 +1,110 @@
+# Aggregator Performance Results
+
+This document records measured aggregator performance for the BFT-shard throughput work.
+
+## Single-Shard Default Configuration
+
+The default-cadence single-shard matrix uses the relevant `bft-sharding-compose.yml` settings below. Local helper scripts may be used to reset the stack and collect resource artifacts.
+
+| Setting | Value |
+|---|---:|
+| Test duration | `30s` |
+| `SUBMISSION_WORKERS` | `300` |
+| `HTTP_CLIENT_POOL_SIZE` | `48` |
+| `PRECOLLECTOR_GRACE_PERIOD` | `150ms` |
+| `MONGODB_FINALIZATION_INSERT_CHUNK_SIZE` | `1000` |
+| `MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS` | `16` |
+| `SKIP_DUPLICATE_CHECK` | `true` |
+| `COMMITMENT_STREAM_BUFFER_SIZE` | `50000` |
+| `MAX_COMMITMENTS_PER_ROUND` | `20000` |
+| `REDIS_ACK_BATCH_SIZE` | `10000` |
+| `CONCURRENCY_LIMIT` | `10000` |
+| `ROOT_BLOCK_RATE` | `900` |
+| `PROOF_INITIAL_DELAY` | `2s` |
+| `PROOF_RETRY_DELAY` | `1s` |
+| Host CPU | AMD Ryzen 9 5900XT, 16 cores / 32 threads |
+
+## Single-Shard Scaling Matrix
+
+These runs increase target load on one active shard using the default BFT cadence.
+
+| Target RPS | Submitted | Proofs verified | Client proof p50 | Client proof p95 | Server proofReady p50 | Server proofReady p95 | BFT wait | Finalization | Commitments / round | Redis pending max | Host CPU busy avg/max | Mongo CPU avg | Aggregator CPU avg | Result |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
+| 1,000 | 29,879 / 29,879 | 29,879 / 29,879 | 2.015s | 3.017s | 1.938s | 2.373s | 1.183s | 24ms | 1,270 | 1,275 | 9.1% / 23.4% | 48% | 56% | pass |
+| 2,000 | 59,838 / 59,838 | 59,838 / 59,838 | 2.019s | 3.019s | 1.905s | 2.438s | 1.175s | 32ms | 2,432 | 2,700 | 17.6% / 41.8% | 91% | 119% | pass |
+| 4,000 | 119,980 / 119,980 | 119,980 / 119,980 | 2.026s | 3.032s | 1.958s | 2.534s | 1.152s | 55ms | 5,074 | 5,100 | 30.8% / 76.9% | 156% | 237% | pass |
+| 6,000 | 179,999 / 179,999 | 179,999 / 179,999 | 2.026s | 3.033s | 1.984s | 2.539s | 1.121s | 75ms | 7,500 | 7,500 | 38.7% / 83.0% | 192% | 376% | pass |
+| 7,000 | 209,966 / 209,966 | 209,966 / 209,966 | 2.037s | 3.058s | 1.935s | 2.524s | 1.098s | 100ms | 9,172 | 8,742 | 39.7% / 83.1% | 206% | 408% | pass |
+| 8,000 | 239,663 / 239,663 | 239,663 / 239,663 | 2.075s | 3.151s | 1.909s | 2.524s | 1.075s | 117ms | 10,557 | 9,933 | 41.8% / 86.2% | 213% | 453% | pass |
+| 9,000 | 268,145 / 268,145 | 268,145 / 268,145 | 2.127s | 3.263s | 1.916s | 2.535s | 1.064s | 143ms | 12,237 | 12,265 | 43.3% / 96.2% | 221% | 490% | pass |
+| 10,000 | 293,520 / 293,520 | 293,520 / 293,520 | 3.013s | 3.593s | 2.099s | 2.785s | 1.088s | 257ms | 14,573 | 14,746 | 45.2% / 97.1% | 235% | 539% | borderline |
+
+## Multi-Shard Scaling
+
+These runs use the default BFT cadence and split load across active shard targets.
+
+On one machine, adding shards mostly redistributes work; it does not add CPU, disk, or network capacity. More shards reduce commitments per round per shard and lower finalization/Mongo pressure, but higher throughput scaling should be measured with shards spread across separate machines or with the perf client moved off the aggregator host.
+
+| Active shards | Mongo groups | Target RPS | Achieved RPS | Submitted | Client proof p50 | Client proof p95 | Server proofReady p50 | Server proofReady p95 | BFT wait | Finalization | Commitments / round / shard | Host CPU busy | Aggregator CPU avg | Mongo CPU avg | Perf tool CPU avg | Result |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
+| 2 | 2 | 8,000 | 7,974 | 239,245 / 239,245 | 2.048s | 3.127s | 1.884s | 2.454s | 1.123s | 82ms | 4,871 | 37.5% / 89.8% | 317% total | 203% total | 608% | pass |
+| 2 | 2 | 9,000 | 8,850 | 265,964 / 265,964 | 2.098s | 3.264s | 1.923s | 2.525s | 1.112s | 91ms | 5,630 | 38.0% / 90.3% | 345% total | 204% total | 626% | pass |
+| 2 | 2 | 10,000 | 9,523 | 286,420 / 286,420 | 2.172s | 3.509s | 1.880s | 2.524s | 1.061s | 144ms | 6,459 | 41.6% / 98.1% | 374% total | 204% total | 680% | borderline |
+| 4 | 2 | 9,000 | 8,958 | 269,375 / 269,375 | 2.035s | 2.593s | 1.914s | 2.468s | 1.113s | 87ms | 2,796 | 38.5% / 95.0% | 431% total | 187% total | 638% | pass |
+| 4 | 2 | 10,000 | 9,685 | 291,580 / 291,580 | 2.142s | 3.038s | 1.946s | 2.519s | 1.089s | 127ms | 3,092 | 38.3% / 98.4% | 454% total | 181% total | 654% | borderline |
+| 8 | 2 | 10,000 | 9,646 | 292,200 / 292,200 | 2.099s | 2.784s | 1.922s | 2.485s | 1.094s | 107ms | 1,549 | 37.3% / 98.2% | 437% total | 155% total | 649% | borderline |
+
+## Low-Latency BFT Cadence
+
+These runs test whether a faster BFT root cadence (a lower `ROOT_BLOCK_RATE`) can move proof latency closer to 1s. Keep this separate from the default `ROOT_BLOCK_RATE=900` throughput matrix because it changes consensus timing.
+
+Current low-latency settings:
+
+| Setting | Value |
+|---|---:|
+| `ROOT_BLOCK_RATE` | `400` |
+| `PRECOLLECTOR_GRACE_PERIOD` | `100ms` |
+| `MONGODB_FINALIZATION_INSERT_CHUNK_SIZE` | `1000` |
+| `MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS` | `16` |
+| `SKIP_DUPLICATE_CHECK` | `true` |
+| `COMMITMENT_STREAM_BUFFER_SIZE` | `50000` |
+| `MAX_COMMITMENTS_PER_ROUND` | `20000` |
+| `REDIS_ACK_BATCH_SIZE` | `10000` |
+| `CONCURRENCY_LIMIT` | `10000` |
+| Mongo layout | `MONGO_GROUPS=2` |
+| `PROOF_INITIAL_DELAY` | `1.1s` |
+
+| Target RPS | Submitted | Proofs verified | Client proof p50 | Client proof p95 | Server proofReady p50 | Server proofReady p95 | BFT wait | Finalization | Commitments / round | Host CPU busy | Aggregator CPU avg | Mongo CPU avg | Result |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
+| 1,000 | 29,701 / 29,701 | 29,701 / 29,701 | 1.113s | 1.118s | 992ms | 1.021s | 489ms | 16ms | 605 | 9.5% | 59% | 46% | pass |
+| 4,000 | 119,700 / 119,700 | 119,700 / 119,700 | 1.115s | 1.133s | 868ms | 1.103s | 462ms | 45ms | 2,395 | 30.2% | 238% | 163% | pass |
+| 6,000 | 179,701 / 179,701 | 179,701 / 179,701 | 1.115s | 1.611s | 894ms | 1.132s | 438ms | 62ms | 3,327 | 39.7% | 376% | 217% | pass |
+| 7,000 | 209,794 / 209,794 | 209,794 / 209,794 | 1.118s | 1.635s | 931ms | 1.228s | 480ms | 70ms | 4,486 | 40.4% | 414% | 205% | pass |
+| 8,000 | 239,807 / 239,807 | 239,807 / 239,807 | 1.129s | 1.677s | 975ms | 1.275s | 490ms | 86ms | 5,225 | 40.8% | 453% | 213% | pass |
+| 9,000 | 269,241 / 269,241 | 269,241 / 269,241 | 1.197s | 1.779s | 1.050s | 1.382s | 537ms | 103ms | 6,464 | 42.6% | 503% | 210% | boundary |
+
+## Gateway Throughput
+
+Gateway tests use the fixed 2-shard `bft-sharding-compose.yml` stack. Request path: gateway -> HAProxy -> active shard leader.
+
+| Shards | Target RPS | Achieved RPS | Submitted | Proofs verified | Host CPU busy avg/max | Gateway CPU avg | Perf client CPU avg | Aggregator CPU total avg | Mongo CPU total avg | HAProxy CPU total avg | Shard proofReady p50 | Client proof p50 | BFT wait | Finalization | Result |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
+| 2 | 4,000 | 3,992.79 | 119,981 / 119,981 | 119,981 / 119,981 | 40.5% / 75.7% | 183% | 329% | 445% | 150% | 37% | 1.949s | 2.018s | 1.154s | 47ms | pass |
+| 2 | 6,000 | 5,988.67 | 179,981 / 179,981 | 179,981 / 179,981 | 47.3% / 83.2% | 228% | 458% | 494% | 169% | 43% | 1.875s | 2.027s | 1.137s | 69ms | pass |
+| 2 | 7,000 | 6,970.45 | 209,844 / 209,844 | 209,844 / 209,844 | 48.4% / 93.1% | 216% | 494% | 512% | 175% | 46% | 1.895s | 2.058s | 1.120s | 85ms | pass |
+| 2 | 8,000 | 7,876.84 | 238,321 / 238,321 | 238,321 / 238,321 | 53.0% / 98.9% | 216% | 562% | 562% | 210% | 57% | 2.301s | 3.069s | 1.113s | 371ms | boundary |
+
+The 8k gateway run hit the latency ceiling rather than a correctness or connection ceiling: all submissions and proofs completed, but proof request duration, finalization time, aggregator CPU, Mongo CPU, perf-client CPU, and host CPU all rose together. This points to whole-stack pressure on the single test machine, with proof polling and larger finalization batches amplifying backend load once the system is near saturation.
+
+### Gateway Mock Backend Ceiling
+
+This isolates the gateway path by replacing real aggregators with lightweight mock backends. It keeps gateway -> HAProxy -> backend routing and delayed proof polling, but removes BFT, MongoDB, Redis, and proof verification.
+
+| Target RPS | Submitted | Submit rate | Host CPU busy avg/max | Gateway CPU avg | Load client CPU avg | Mock backend CPU total avg | HAProxy CPU total avg | Proof p50 | Proof p95 | Proof request p50 | Proof request p95 | Result |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
+| 15,000 | 449,600 | ~14,987/s | 73.7% / 88.5% | 639% | 451% | 563% | 137% | 2.015s | 2.031s | 13ms | 26ms | clean |
+| 18,000 | 539,116 | ~17,971/s | 77.4% / 97.4% | 732% | 479% | 616% | 157% | 2.017s | 2.050s | 15ms | 42ms | clean |
+| 19,000 | 563,907 | ~18,797/s | 78.5% / 98.6% | 709% | 481% | 619% | 159% | 2.019s | 2.505s | 16ms | 230ms | boundary |
+| 20,000 | 561,367 | ~18,712/s | 83.5% / 99.1% | 737% | 514% | 711% | 172% | 2.608s | 4.088s | 377ms | 879ms | overloaded |
+
+The mock test shows the gateway path can exceed the real aggregator-stack ceiling on this host. The clean single-machine mock ceiling is around 18k/s; above that, proof request latency rises as gateway, mock backends, HAProxy, and the load client compete for the same CPU.

From f0da959ef4a83fc1839a042929605d814176ab02 Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 15:39:58 +0300
Subject: [PATCH 08/13] chore: remove unused block record state lookup

---
 internal/round/precollection_test.go      |  7 +++++--
 internal/storage/interfaces/interfaces.go |  3 ---
 internal/storage/mongodb/block_records.go | 24 -----------------------
 3 files changed, 5 insertions(+), 29 deletions(-)

diff --git a/internal/round/precollection_test.go b/internal/round/precollection_test.go
index 4140e40..c9d9f56 100644
--- a/internal/round/precollection_test.go
+++ b/internal/round/precollection_test.go
@@ -241,9 +241,12 @@ func waitForStateBlockNumber(
 
 	var blockNumber *api.BigInt
 	require.Eventually(t, func() bool {
-		var err error
-		blockNumber, err = storage.BlockRecordsStorage().GetByStateID(ctx, stateID)
+		record, err := storage.AggregatorRecordStorage().GetByStateID(ctx, stateID)
 		require.NoError(t, err)
+		if record == nil {
+			return false
+		}
+		blockNumber = record.BlockNumber
 		return blockNumber != nil
 	}, timeout, 25*time.Millisecond)
 
diff --git a/internal/storage/interfaces/interfaces.go b/internal/storage/interfaces/interfaces.go
index d0db054..c1beffe 100644
--- a/internal/storage/interfaces/interfaces.go
+++ b/internal/storage/interfaces/interfaces.go
@@ -159,9 +159,6 @@ type BlockRecordsStorage interface {
 	// GetByBlockNumber retrieves block records by block number
 	GetByBlockNumber(ctx context.Context, blockNumber *api.BigInt) (*models.BlockRecords, error)
 
-	// GetByStateID retrieves the block number for a state ID
-	GetByStateID(ctx context.Context, stateID api.StateID) (*api.BigInt, error)
-
 	// Count returns the total number of block records
 	Count(ctx context.Context) (int64, error)
 
diff --git a/internal/storage/mongodb/block_records.go b/internal/storage/mongodb/block_records.go
index bd952e3..0991afd 100644
--- a/internal/storage/mongodb/block_records.go
+++ b/internal/storage/mongodb/block_records.go
@@ -67,30 +67,6 @@ func (brs *BlockRecordsStorage) GetByBlockNumber(ctx context.Context, blockNumbe
 	return blockRecords, nil
 }
 
-// GetByStateID retrieves the block number for a state ID.
-func (brs *BlockRecordsStorage) GetByStateID(ctx context.Context, stateID api.StateID) (*api.BigInt, error) {
-	filter := bson.M{"stateIds": stateID.String()}
-	opts := options.FindOne().SetProjection(bson.M{"blockNumber": 1})
-
-	var result struct {
-		BlockNumber primitive.Decimal128 `bson:"blockNumber"`
-	}
-
-	err := brs.collection.FindOne(ctx, filter, opts).Decode(&result)
-	if err != nil {
-		if errors.Is(err, mongo.ErrNoDocuments) {
-			return nil, nil
-		}
-		return nil, fmt.Errorf("failed to get block number by state ID: %w", err)
-	}
-
-	blockNumber, err := api.NewBigIntFromString(result.BlockNumber.String())
-	if err != nil {
-		return nil, fmt.Errorf("failed to parse block number: %w", err)
-	}
-	return blockNumber, nil
-}
-
 // Count returns the total number of block records
 func (brs *BlockRecordsStorage) Count(ctx context.Context) (int64, error) {
 	count, err := brs.collection.CountDocuments(ctx, bson.M{})

From 929aea43c043d8b994f18ccb7f1cddc9e99780e8 Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 15:59:15 +0300
Subject: [PATCH 09/13] feat: add leader-only health check

---
 internal/gateway/handlers_rest.go      | 35 ++++++++++++++++++++++++++
 internal/gateway/handlers_rest_test.go |  9 +++++++
 internal/gateway/server.go             |  1 +
 scripts/haproxy.cfg                    |  2 +-
 scripts/sharding/haproxy-shard1.cfg    |  2 +-
 scripts/sharding/haproxy-shard2.cfg    |  2 +-
 6 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/internal/gateway/handlers_rest.go b/internal/gateway/handlers_rest.go
index 7c9e9c3..8d4d153 100644
--- a/internal/gateway/handlers_rest.go
+++ b/internal/gateway/handlers_rest.go
@@ -35,6 +35,41 @@ func (s *Server) handleHealth(c *gin.Context) {
 	c.JSON(http.StatusOK, status)
 }
 
+// handleLeaderHealth handles readiness checks for leader-only traffic.
+func (s *Server) handleLeaderHealth(c *gin.Context) {
+	ctx := c.Request.Context()
+
+	status, err := s.service.GetHealthStatus(ctx)
+	if err != nil {
+		s.logger.WithContext(ctx).Error("Failed to get leader health status", "error", err.Error())
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Internal server error"})
+		return
+	}
+
+	if status.Status == api.HealthStatusUnhealthy {
+		c.JSON(http.StatusServiceUnavailable, status)
+		return
+	}
+
+	if !isLeaderHealthRole(status.Role) {
+		status.Status = api.HealthStatusUnhealthy
+		status.AddDetail("leader", "false")
+		c.JSON(http.StatusServiceUnavailable, status)
+		return
+	}
+
+	c.JSON(http.StatusOK, status)
+}
+
+func isLeaderHealthRole(role string) bool {
+	switch role {
+	case "leader", "standalone", "parent-leader", "parent-standalone":
+		return true
+	default:
+		return false
+	}
+}
+
 // handleDocs handles the API documentation endpoint
 func (s *Server) handleDocs(c *gin.Context) {
 	html := GenerateDocsHTML()
diff --git a/internal/gateway/handlers_rest_test.go b/internal/gateway/handlers_rest_test.go
index d249326..68dc519 100644
--- a/internal/gateway/handlers_rest_test.go
+++ b/internal/gateway/handlers_rest_test.go
@@ -93,6 +93,15 @@ func Test_GetTrustBases(t *testing.T) {
 	})
 }
 
+func TestIsLeaderHealthRole(t *testing.T) {
+	for _, role := range []string{"leader", "standalone", "parent-leader", "parent-standalone"} {
+		require.True(t, isLeaderHealthRole(role), role)
+	}
+	for _, role := range []string{"follower", "parent-follower", ""} {
+		require.False(t, isLeaderHealthRole(role), role)
+	}
+}
+
 func doRequest(t *testing.T, hf gin.HandlerFunc, method, path string) (*http.Response, []byte) {
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(method, path, nil)
diff --git a/internal/gateway/server.go b/internal/gateway/server.go
index 2f7ee98..393260c 100644
--- a/internal/gateway/server.go
+++ b/internal/gateway/server.go
@@ -118,6 +118,7 @@ func NewServer(cfg *config.Config, logger *logger.Logger, service Service) *Serv
 func (s *Server) setupRoutes() {
 	// Health and metrics endpoints
 	s.router.GET("/health", s.handleHealth)
+	s.router.GET("/health/leader", s.handleLeaderHealth)
 	s.router.GET("/metrics", gin.WrapH(promhttp.Handler()))
 	s.router.PUT("/api/v1/trustbases", s.handlePutTrustBase)
 	s.router.GET("/api/v1/trustbases", getTrustBaseHandler(s.logger, s.service))
diff --git a/scripts/haproxy.cfg b/scripts/haproxy.cfg
index 42944fa..a02c602 100644
--- a/scripts/haproxy.cfg
+++ b/scripts/haproxy.cfg
@@ -65,7 +65,7 @@ backend aggregator_backend
     stick on hdr_ip(X-Forwarded-For)
 
     # Health check configuration
-    option httpchk GET /health
+    option httpchk GET /health/leader
     http-check expect status 200
 
     # Cleanup connections when backend is marked down
diff --git a/scripts/sharding/haproxy-shard1.cfg b/scripts/sharding/haproxy-shard1.cfg
index e477832..3ab9a09 100644
--- a/scripts/sharding/haproxy-shard1.cfg
+++ b/scripts/sharding/haproxy-shard1.cfg
@@ -65,7 +65,7 @@ backend shard1_backend
     stick on hdr_ip(X-Forwarded-For)
 
     # Health check configuration
-    option httpchk GET /health
+    option httpchk GET /health/leader
     http-check expect status 200
 
     # Cleanup connections when backend is marked down
diff --git a/scripts/sharding/haproxy-shard2.cfg b/scripts/sharding/haproxy-shard2.cfg
index e8429a6..a085d04 100644
--- a/scripts/sharding/haproxy-shard2.cfg
+++ b/scripts/sharding/haproxy-shard2.cfg
@@ -65,7 +65,7 @@ backend shard2_backend
     stick on hdr_ip(X-Forwarded-For)
 
     # Health check configuration
-    option httpchk GET /health
+    option httpchk GET /health/leader
     http-check expect status 200
 
     # Cleanup connections when backend is marked down

From 7e4a0d7ad928f6715107dbf914f5cb0d7972b29e Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 17:17:57 +0300
Subject: [PATCH 10/13] fix: avoid stale proof readiness cache

---
 internal/round/batch_processor.go             |  56 ++++---
 internal/round/finalize_duplicate_test.go     |  12 +-
 internal/round/leaf_add.go                    |  57 +++++++
 internal/round/precollection_test.go          | 149 ++++++++++++++++++
 internal/round/precollector.go                |  57 +------
 internal/round/round_manager.go               |  18 ++-
 .../round/smt_persistence_integration_test.go |   2 +-
 internal/smt/thread_safe_smt_snapshot.go      |  45 ++++++
 internal/smt/thread_safe_smt_snapshot_test.go |  38 +++++
 9 files changed, 349 insertions(+), 85 deletions(-)
 create mode 100644 internal/round/leaf_add.go

diff --git a/internal/round/batch_processor.go b/internal/round/batch_processor.go
index f64f90c..a146be5 100644
--- a/internal/round/batch_processor.go
+++ b/internal/round/batch_processor.go
@@ -21,9 +21,10 @@ import (
 
 // processMiniBatch processes a small batch of commitments into the SMT for efficiency
 // NOTE: The caller is expected to hold rm.roundMutex when calling this function
-func (rm *RoundManager) processMiniBatch(ctx context.Context, commitments []*models.CertificationRequest) error {
+// and to ACK the returned dropped entries only after releasing rm.roundMutex.
+func (rm *RoundManager) processMiniBatch(ctx context.Context, commitments []*models.CertificationRequest) ([]interfaces.CertificationRequestAck, error) {
 	if len(commitments) == 0 {
-		return nil
+		return nil, nil
 	}
 
 	// Convert commitments to SMT leaves, tracking valid commitments
@@ -55,28 +56,15 @@ func (rm *RoundManager) processMiniBatch(ctx context.Context, commitments []*mod
 	// Add leaves to the current round's SMT snapshot
 	if rm.currentRound != nil && rm.currentRound.Snapshot != nil {
 		smtStart := time.Now()
-		_, err := rm.currentRound.Snapshot.AddLeaves(leaves)
+		addedCommitments, addedLeaves, dropped := addCommitmentLeaves(ctx, rm.logger, rm.currentRound.Snapshot, leaves, validCommitments)
 		metrics.SMTAddLeavesDuration.Observe(time.Since(smtStart).Seconds())
-		if err != nil {
-			result := tryAddLeavesOneByOne(ctx, rm.logger, rm.commitmentQueue, rm.currentRound.Snapshot, leaves, validCommitments)
-			rm.currentRound.PendingLeaves = append(rm.currentRound.PendingLeaves, result.successLeaves...)
-			rm.currentRound.PendingCommitments = append(rm.currentRound.PendingCommitments, result.successCommitments...)
-			rm.markProofsPending(result.successCommitments)
-		} else {
-			rm.currentRound.PendingLeaves = append(rm.currentRound.PendingLeaves, leaves...)
-			rm.currentRound.PendingCommitments = append(rm.currentRound.PendingCommitments, validCommitments...)
-			rm.markProofsPending(validCommitments)
-		}
+		rm.currentRound.PendingLeaves = append(rm.currentRound.PendingLeaves, addedLeaves...)
+		rm.currentRound.PendingCommitments = append(rm.currentRound.PendingCommitments, addedCommitments...)
+		rm.markProofsPending(addedCommitments)
+		return dropped, nil
 	}
 
-	return nil
-}
-
-// leafAddResult holds results of adding leaves one-by-one to an SMT snapshot.
-type leafAddResult struct {
-	successLeaves      []*smt.Leaf
-	successCommitments []*models.CertificationRequest
-	rejected           []interfaces.CertificationRequestAck
+	return nil, nil
 }
 
 // ProposeBlock creates and proposes a new block with the given data.
@@ -372,10 +360,13 @@ func (rm *RoundManager) FinalizeBlockWithRetry(ctx context.Context, block *model
 		} else if len(unfinalizedBlocks) > 0 {
 			rm.logger.Info("Found unfinalized block, attempting recovery",
 				"blockNumber", unfinalizedBlocks[0].Index.String())
-			_, recoverErr := RecoverUnfinalizedBlock(ctx, rm.logger, rm.storage, rm.commitmentQueue)
+			recoveryResult, recoverErr := RecoverUnfinalizedBlock(ctx, rm.logger, rm.storage, rm.commitmentQueue)
 			if recoverErr != nil {
 				return fmt.Errorf("recovery failed: %w", recoverErr)
 			}
+			if recoveryResult != nil && recoveryResult.Recovered {
+				rm.reconcileRecoveredFinalization(recoveryResult.BlockNumber)
+			}
 			rm.logger.Info("Recovery completed successfully")
 			return nil
 		}
@@ -388,6 +379,27 @@ func (rm *RoundManager) FinalizeBlockWithRetry(ctx context.Context, block *model
 	return fmt.Errorf("FinalizeBlock failed after %d attempts", maxFinalizeRetries)
 }
 
+func (rm *RoundManager) reconcileRecoveredFinalization(blockNumber *api.BigInt) {
+	var snapshot *smt.ThreadSafeSmtSnapshot
+
+	rm.roundMutex.RLock()
+	if blockNumber != nil &&
+		rm.currentRound != nil &&
+		rm.currentRound.Number != nil &&
+		rm.currentRound.Snapshot != nil &&
+		rm.currentRound.Number.Cmp(blockNumber.Int) == 0 {
+		snapshot = rm.currentRound.Snapshot
+	}
+	rm.roundMutex.RUnlock()
+
+	rm.finalizationMu.Lock()
+	if snapshot != nil {
+		snapshot.Commit(rm.smt)
+	}
+	rm.clearProofPending()
+	rm.finalizationMu.Unlock()
+}
+
 // FinalizeBlock creates and persists a new block with the given data
 func (rm *RoundManager) FinalizeBlock(ctx context.Context, block *models.Block) error {
 	if err := rm.validateBlockForMode(block); err != nil {
diff --git a/internal/round/finalize_duplicate_test.go b/internal/round/finalize_duplicate_test.go
index d0ada1a..fa8ee76 100644
--- a/internal/round/finalize_duplicate_test.go
+++ b/internal/round/finalize_duplicate_test.go
@@ -81,7 +81,7 @@ func (s *FinalizeDuplicateTestSuite) Test1_DuplicateRecovery() {
 
 	// Process commitments to populate PendingLeaves
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, commitments)
+	_, err = rm.processMiniBatch(ctx, commitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err)
 
@@ -163,7 +163,7 @@ func (s *FinalizeDuplicateTestSuite) Test2_NoDuplicates() {
 	}
 
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, commitments)
+	_, err = rm.processMiniBatch(ctx, commitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err)
 
@@ -214,7 +214,7 @@ func (s *FinalizeDuplicateTestSuite) Test3_AllDuplicates() {
 	}
 
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, commitments)
+	_, err = rm.processMiniBatch(ctx, commitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err)
 
@@ -285,7 +285,7 @@ func (s *FinalizeDuplicateTestSuite) Test4_DuplicateBlock() {
 	}
 
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, commitments)
+	_, err = rm.processMiniBatch(ctx, commitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err)
 
@@ -366,7 +366,7 @@ func (s *FinalizeDuplicateTestSuite) Test5_DuplicateBlockAlreadyFinalized() {
 	}
 
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, commitments)
+	_, err = rm.processMiniBatch(ctx, commitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err)
 
@@ -454,7 +454,7 @@ func (s *FinalizeDuplicateTestSuite) Test6_BlockRecordsMatchPendingCommitmentsOn
 	}
 
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, commitments)
+	_, err = rm.processMiniBatch(ctx, commitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err)
 
diff --git a/internal/round/leaf_add.go b/internal/round/leaf_add.go
new file mode 100644
index 0000000..216dbcc
--- /dev/null
+++ b/internal/round/leaf_add.go
@@ -0,0 +1,57 @@
+package round
+
+import (
+	"context"
+
+	"github.com/unicitynetwork/aggregator-go/internal/logger"
+	"github.com/unicitynetwork/aggregator-go/internal/models"
+	"github.com/unicitynetwork/aggregator-go/internal/smt"
+	"github.com/unicitynetwork/aggregator-go/internal/storage/interfaces"
+)
+
+func addCommitmentLeaves(
+	ctx context.Context,
+	log *logger.Logger,
+	snapshot *smt.ThreadSafeSmtSnapshot,
+	leaves []*smt.Leaf,
+	commitments []*models.CertificationRequest,
+) ([]*models.CertificationRequest, []*smt.Leaf, []interfaces.CertificationRequestAck) {
+	result := snapshot.AddLeavesClassified(leaves)
+
+	addedCommitments := make([]*models.CertificationRequest, 0, len(result.AddedIndexes))
+	addedLeaves := make([]*smt.Leaf, 0, len(result.AddedIndexes))
+	for _, idx := range result.AddedIndexes {
+		addedCommitments = append(addedCommitments, commitments[idx])
+		addedLeaves = append(addedLeaves, leaves[idx])
+	}
+
+	dropped := make([]interfaces.CertificationRequestAck, 0, len(result.DuplicateIndexes)+len(result.Rejected))
+	for _, idx := range result.DuplicateIndexes {
+		dropped = append(dropped, interfaces.CertificationRequestAck{
+			StateID:  commitments[idx].StateID,
+			StreamID: commitments[idx].StreamID,
+		})
+	}
+	for _, rejected := range result.Rejected {
+		log.WithContext(ctx).Warn("Rejected commitment leaf",
+			"path", leaves[rejected.Index].Path.String(),
+			"error", rejected.Err.Error())
+		dropped = append(dropped, interfaces.CertificationRequestAck{
+			StateID:  commitments[rejected.Index].StateID,
+			StreamID: commitments[rejected.Index].StreamID,
+		})
+	}
+
+	return addedCommitments, addedLeaves, dropped
+}
+
+func ackDroppedCommitments(ctx context.Context, log *logger.Logger, queue interfaces.CommitmentQueue, dropped []interfaces.CertificationRequestAck) {
+	if len(dropped) == 0 || queue == nil {
+		return
+	}
+	if err := queue.MarkProcessed(ctx, dropped); err != nil {
+		log.WithContext(ctx).Error("Failed to mark dropped commitments as processed",
+			"count", len(dropped),
+			"error", err.Error())
+	}
+}
diff --git a/internal/round/precollection_test.go b/internal/round/precollection_test.go
index c9d9f56..b5f289c 100644
--- a/internal/round/precollection_test.go
+++ b/internal/round/precollection_test.go
@@ -279,6 +279,155 @@ func newTestPrecollector(t *testing.T, stream chan *models.CertificationRequest,
 	return cp, smtInstance
 }
 
+func TestProcessMiniBatch_SkipsExistingDuplicateWithoutProofPending(t *testing.T) {
+	ctx := context.Background()
+	cfg := &config.Config{
+		Processing: config.ProcessingConfig{
+			RoundDuration: time.Second,
+			BatchLimit:    1000,
+		},
+		Sharding: config.ShardingConfig{
+			Mode: config.ShardingModeStandalone,
+		},
+	}
+	testLogger := newTestLogger(t)
+	smtInstance := smt.NewThreadSafeSMT(smt.NewSparseMerkleTree(api.SHA256, api.StateTreeKeyLengthBits))
+	rm, err := NewRoundManager(ctx, cfg, testLogger, nil, nil, nil, state.NewSyncStateTracker(), nil, events.NewEventBus(testLogger), smtInstance, nil)
+	require.NoError(t, err)
+
+	existing := testutil.CreateTestCertificationRequest(t, "existing_duplicate")
+	initialSnapshot := smtInstance.CreateSnapshot()
+	_, err = initialSnapshot.AddLeaves([]*smt.Leaf{getLeafFromCommitment(t, existing)})
+	require.NoError(t, err)
+	initialSnapshot.Commit(smtInstance)
+
+	newCommitment := testutil.CreateTestCertificationRequest(t, "new_commitment")
+	duplicate := *existing
+	duplicate.StreamID = "duplicate-stream-id"
+
+	rm.currentRound = &Round{
+		Number:   api.NewBigInt(big.NewInt(1)),
+		State:    RoundStateProcessing,
+		Snapshot: smtInstance.CreateSnapshot(),
+	}
+
+	rm.roundMutex.Lock()
+	_, err = rm.processMiniBatch(ctx, []*models.CertificationRequest{&duplicate, newCommitment})
+	rm.roundMutex.Unlock()
+	require.NoError(t, err)
+
+	require.Len(t, rm.currentRound.PendingCommitments, 1)
+	require.Equal(t, newCommitment.StateID, rm.currentRound.PendingCommitments[0].StateID)
+	require.Len(t, rm.currentRound.PendingLeaves, 1)
+
+	rm.proofCacheMu.RLock()
+	_, duplicatePending := rm.proofPending[existing.StateID.String()]
+	_, newPending := rm.proofPending[newCommitment.StateID.String()]
+	rm.proofCacheMu.RUnlock()
+	require.False(t, duplicatePending)
+	require.True(t, newPending)
+}
+
+func TestReconcileRecoveredFinalization_CommitsMatchingSnapshotAndClearsProofPending(t *testing.T) {
+	ctx := context.Background()
+	cfg := &config.Config{
+		Processing: config.ProcessingConfig{
+			RoundDuration: time.Second,
+			BatchLimit:    1000,
+		},
+		Sharding: config.ShardingConfig{
+			Mode: config.ShardingModeStandalone,
+		},
+	}
+	testLogger := newTestLogger(t)
+	smtInstance := smt.NewThreadSafeSMT(smt.NewSparseMerkleTree(api.SHA256, api.StateTreeKeyLengthBits))
+	rm, err := NewRoundManager(ctx, cfg, testLogger, nil, nil, nil, state.NewSyncStateTracker(), nil, events.NewEventBus(testLogger), smtInstance, nil)
+	require.NoError(t, err)
+
+	commitment := testutil.CreateTestCertificationRequest(t, "recovered_finalization")
+	blockNumber := api.NewBigInt(big.NewInt(12))
+	rm.currentRound = &Round{
+		Number:   blockNumber,
+		State:    RoundStateProcessing,
+		Snapshot: smtInstance.CreateSnapshot(),
+	}
+
+	rm.roundMutex.Lock()
+	_, err = rm.processMiniBatch(ctx, []*models.CertificationRequest{commitment})
+	rm.roundMutex.Unlock()
+	require.NoError(t, err)
+
+	key, err := commitment.StateID.GetTreeKey()
+	require.NoError(t, err)
+	_, err = smtInstance.GetInclusionCert(key)
+	require.Error(t, err)
+
+	rm.proofCacheMu.RLock()
+	_, pendingBefore := rm.proofPending[commitment.StateID.String()]
+	rm.proofCacheMu.RUnlock()
+	require.True(t, pendingBefore)
+
+	rm.reconcileRecoveredFinalization(blockNumber)
+
+	_, err = smtInstance.GetInclusionCert(key)
+	require.NoError(t, err)
+
+	rm.proofCacheMu.RLock()
+	_, pendingAfter := rm.proofPending[commitment.StateID.String()]
+	rm.proofCacheMu.RUnlock()
+	require.False(t, pendingAfter)
+}
+
+func TestReconcileRecoveredFinalization_MismatchedBlockClearsProofPendingOnly(t *testing.T) {
+	ctx := context.Background()
+	cfg := &config.Config{
+		Processing: config.ProcessingConfig{
+			RoundDuration: time.Second,
+			BatchLimit:    1000,
+		},
+		Sharding: config.ShardingConfig{
+			Mode: config.ShardingModeStandalone,
+		},
+	}
+	testLogger := newTestLogger(t)
+	smtInstance := smt.NewThreadSafeSMT(smt.NewSparseMerkleTree(api.SHA256, api.StateTreeKeyLengthBits))
+	rm, err := NewRoundManager(ctx, cfg, testLogger, nil, nil, nil, state.NewSyncStateTracker(), nil, events.NewEventBus(testLogger), smtInstance, nil)
+	require.NoError(t, err)
+
+	commitment := testutil.CreateTestCertificationRequest(t, "mismatched_recovered_finalization")
+	currentBlockNumber := api.NewBigInt(big.NewInt(20))
+	recoveredBlockNumber := api.NewBigInt(big.NewInt(19))
+	rm.currentRound = &Round{
+		Number:   currentBlockNumber,
+		State:    RoundStateProcessing,
+		Snapshot: smtInstance.CreateSnapshot(),
+	}
+
+	originalRoot := smtInstance.GetRootHash()
+	rm.roundMutex.Lock()
+	_, err = rm.processMiniBatch(ctx, []*models.CertificationRequest{commitment})
+	rm.roundMutex.Unlock()
+	require.NoError(t, err)
+
+	rm.proofCacheMu.RLock()
+	_, pendingBefore := rm.proofPending[commitment.StateID.String()]
+	rm.proofCacheMu.RUnlock()
+	require.True(t, pendingBefore)
+
+	rm.reconcileRecoveredFinalization(recoveredBlockNumber)
+
+	require.Equal(t, originalRoot, smtInstance.GetRootHash())
+	key, err := commitment.StateID.GetTreeKey()
+	require.NoError(t, err)
+	_, err = smtInstance.GetInclusionCert(key)
+	require.Error(t, err)
+
+	rm.proofCacheMu.RLock()
+	_, pendingAfter := rm.proofPending[commitment.StateID.String()]
+	rm.proofCacheMu.RUnlock()
+	require.False(t, pendingAfter)
+}
+
 func TestDrainBufferedCommitments_StopsAtRoundBoundary(t *testing.T) {
 	stream := make(chan *models.CertificationRequest, 8)
 	pending := make([]*models.CertificationRequest, 0, miniBatchSize)
diff --git a/internal/round/precollector.go b/internal/round/precollector.go
index b11fbe0..6b95966 100644
--- a/internal/round/precollector.go
+++ b/internal/round/precollector.go
@@ -2,7 +2,6 @@ package round
 
 import (
 	"context"
-	"errors"
 	"fmt"
 
 	"github.com/unicitynetwork/aggregator-go/internal/logger"
@@ -215,57 +214,7 @@ func (cp *childPrecollector) addBatch(
 		return nil, nil
 	}
 
-	if _, err := snapshot.AddLeaves(leavesToAdd); err != nil {
-		result := tryAddLeavesOneByOne(ctx, cp.logger, cp.commitmentQueue, snapshot, leavesToAdd, valid)
-		return result.successCommitments, result.successLeaves
-	}
-
-	return valid, leavesToAdd
-}
-
-// tryAddLeavesOneByOne adds leaves one-by-one to a snapshot and returns results.
-// Package-level function usable by both standalone processMiniBatch and childPrecollector.
-func tryAddLeavesOneByOne(
-	ctx context.Context,
-	log *logger.Logger,
-	queue interfaces.CommitmentQueue,
-	snapshot *smt.ThreadSafeSmtSnapshot,
-	leaves []*smt.Leaf,
-	commitments []*models.CertificationRequest,
-) leafAddResult {
-	result := leafAddResult{
-		successLeaves:      make([]*smt.Leaf, 0, len(leaves)),
-		successCommitments: make([]*models.CertificationRequest, 0, len(commitments)),
-		rejected:           nil,
-	}
-
-	for i, leaf := range leaves {
-		if err := snapshot.AddLeaf(leaf.Path, leaf.Value); err != nil {
-			if errors.Is(err, smt.ErrDuplicateLeaf) {
-				result.successLeaves = append(result.successLeaves, leaf)
-				result.successCommitments = append(result.successCommitments, commitments[i])
-				continue
-			}
-			log.WithContext(ctx).Warn("Rejected conflicting leaf",
-				"path", leaf.Path.String(),
-				"error", err.Error())
-			result.rejected = append(result.rejected, interfaces.CertificationRequestAck{
-				StateID:  commitments[i].StateID,
-				StreamID: commitments[i].StreamID,
-			})
-			continue
-		}
-		result.successLeaves = append(result.successLeaves, leaf)
-		result.successCommitments = append(result.successCommitments, commitments[i])
-	}
-
-	if len(result.rejected) > 0 && queue != nil {
-		if err := queue.MarkProcessed(ctx, result.rejected); err != nil {
-			log.WithContext(ctx).Error("Failed to mark rejected commitments as processed",
-				"count", len(result.rejected),
-				"error", err.Error())
-		}
-	}
-
-	return result
+	added, addedLeaves, dropped := addCommitmentLeaves(ctx, cp.logger, snapshot, leavesToAdd, valid)
+	ackDroppedCommitments(ctx, cp.logger, cp.commitmentQueue, dropped)
+	return added, addedLeaves
 }
diff --git a/internal/round/round_manager.go b/internal/round/round_manager.go
index 66e6e61..cf45a56 100644
--- a/internal/round/round_manager.go
+++ b/internal/round/round_manager.go
@@ -528,11 +528,18 @@ func (rm *RoundManager) processRound(ctx context.Context) error {
 			case commitment := <-rm.commitmentStream:
 				rm.roundMutex.Lock()
 				rm.currentRound.Commitments = append(rm.currentRound.Commitments, commitment)
+				var dropped []interfaces.CertificationRequestAck
 				if len(rm.currentRound.Commitments)%100 == 0 {
 					batch := rm.currentRound.Commitments[len(rm.currentRound.Commitments)-100:]
-					rm.processMiniBatch(ctx, batch)
+					var err error
+					dropped, err = rm.processMiniBatch(ctx, batch)
+					if err != nil {
+						rm.roundMutex.Unlock()
+						return err
+					}
 				}
 				rm.roundMutex.Unlock()
+				ackDroppedCommitments(ctx, rm.logger, rm.commitmentQueue, dropped)
 			case <-ctx.Done():
 				return ctx.Err()
 			default:
@@ -542,11 +549,18 @@ func (rm *RoundManager) processRound(ctx context.Context) error {
 
 		rm.roundMutex.Lock()
 		remaining := len(rm.currentRound.Commitments) % 100
+		var dropped []interfaces.CertificationRequestAck
 		if remaining > 0 {
 			batch := rm.currentRound.Commitments[len(rm.currentRound.Commitments)-remaining:]
-			rm.processMiniBatch(ctx, batch)
+			var err error
+			dropped, err = rm.processMiniBatch(ctx, batch)
+			if err != nil {
+				rm.roundMutex.Unlock()
+				return err
+			}
 		}
 		rm.roundMutex.Unlock()
+		ackDroppedCommitments(ctx, rm.logger, rm.commitmentQueue, dropped)
 	}
 
 	rm.startActivePrecollectorIfNeeded(ctx)
diff --git a/internal/round/smt_persistence_integration_test.go b/internal/round/smt_persistence_integration_test.go
index e7cd120..768378d 100644
--- a/internal/round/smt_persistence_integration_test.go
+++ b/internal/round/smt_persistence_integration_test.go
@@ -210,7 +210,7 @@ func TestCompleteWorkflowWithRestart(t *testing.T) {
 
 	// Process the commitments (processMiniBatch assumes caller holds mutex)
 	rm.roundMutex.Lock()
-	err = rm.processMiniBatch(ctx, testCommitments)
+	_, err = rm.processMiniBatch(ctx, testCommitments)
 	rm.roundMutex.Unlock()
 	require.NoError(t, err, "processMiniBatch should succeed")
 
diff --git a/internal/smt/thread_safe_smt_snapshot.go b/internal/smt/thread_safe_smt_snapshot.go
index 7b482f3..4fd121f 100644
--- a/internal/smt/thread_safe_smt_snapshot.go
+++ b/internal/smt/thread_safe_smt_snapshot.go
@@ -1,6 +1,7 @@
 package smt
 
 import (
+	"errors"
 	"fmt"
 	"math/big"
 	"sync"
@@ -16,6 +17,20 @@ type ThreadSafeSmtSnapshot struct {
 	rwMux    sync.RWMutex // RWMutex allows multiple readers but exclusive writers
 }
 
+// AddLeavesClassifiedResult reports input indexes for each add outcome.
+// Duplicates are normal no-ops; rejected leaves carry the add error.
+type AddLeavesClassifiedResult struct {
+	AddedIndexes     []int          // positions in the input slice of leaves that were added
+	DuplicateIndexes []int          // positions whose add returned ErrDuplicateLeaf
+	Rejected         []RejectedLeaf // positions whose add failed with any other error, with that error
+}
+
+// RejectedLeaf records a leaf that could not be added to the snapshot.
+type RejectedLeaf struct {
+	Index int   // position of the leaf in the input slice
+	Err   error // error returned by the failed add
+}
+
 // NewThreadSafeSmtSnapshot creates a new thread-safe SMT snapshot wrapper
 func NewThreadSafeSmtSnapshot(snapshot *SmtSnapshot) *ThreadSafeSmtSnapshot {
 	// Prime hash caches before publishing the snapshot wrapper so concurrent
@@ -45,6 +60,36 @@ func (tss *ThreadSafeSmtSnapshot) addLeavesUnsafe(leaves []*Leaf) (string, error
 	return tss.snapshot.GetRootHashHex(), nil
 }
 
+// AddLeavesClassified adds leaves one by one, classifying each input index as added, duplicate (ErrDuplicateLeaf) or rejected (any other error) instead of aborting the batch.
+func (tss *ThreadSafeSmtSnapshot) AddLeavesClassified(leaves []*Leaf) AddLeavesClassifiedResult {
+	result := AddLeavesClassifiedResult{
+		AddedIndexes:     make([]int, 0, len(leaves)),
+		DuplicateIndexes: make([]int, 0),
+		Rejected:         make([]RejectedLeaf, 0),
+	}
+	if len(leaves) == 0 {
+		return result // nothing to add; return before taking the write lock
+	}
+
+	tss.rwMux.Lock() // exclusive lock is held for the whole batch
+	defer tss.rwMux.Unlock()
+
+	for i, leaf := range leaves {
+		if err := tss.snapshot.AddLeaf(leaf.Path, leaf.Value); err != nil {
+			if errors.Is(err, ErrDuplicateLeaf) {
+				result.DuplicateIndexes = append(result.DuplicateIndexes, i)
+				continue
+			}
+			result.Rejected = append(result.Rejected, RejectedLeaf{Index: i, Err: err})
+			continue
+		}
+		result.AddedIndexes = append(result.AddedIndexes, i)
+	}
+
+	tss.snapshot.ensureHashes() // NOTE(review): presumably refreshes cached hashes after the adds; confirm against ensureHashes
+	return result
+}
+
 // AddLeaf adds a single leaf to the snapshot
 // This operation is exclusive and blocks all other operations on this snapshot
 func (tss *ThreadSafeSmtSnapshot) AddLeaf(path *big.Int, value []byte) error {
diff --git a/internal/smt/thread_safe_smt_snapshot_test.go b/internal/smt/thread_safe_smt_snapshot_test.go
index 7083381..d579542 100644
--- a/internal/smt/thread_safe_smt_snapshot_test.go
+++ b/internal/smt/thread_safe_smt_snapshot_test.go
@@ -83,6 +83,44 @@ func TestThreadSafeSMTSnapshot(t *testing.T) {
 		}
 	})
 
+	t.Run("ClassifiedBatchSkipsDuplicatesAndRejectsConflicts", func(t *testing.T) {
+		smtInstance := NewSparseMerkleTree(api.SHA256, 2)
+		threadSafeSMT := NewThreadSafeSMT(smtInstance)
+
+		initialSnapshot := threadSafeSMT.CreateSnapshot()
+		initial := []*Leaf{
+			NewLeaf(big.NewInt(0b100), []byte{1}),
+			NewLeaf(big.NewInt(0b101), []byte{2}),
+		}
+		initialResult := initialSnapshot.AddLeavesClassified(initial)
+		require.Equal(t, []int{0, 1}, initialResult.AddedIndexes)
+		require.Empty(t, initialResult.DuplicateIndexes)
+		require.Empty(t, initialResult.Rejected)
+		initialSnapshot.Commit(threadSafeSMT)
+
+		snapshot := threadSafeSMT.CreateSnapshot()
+		batch := []*Leaf{
+			NewLeaf(big.NewInt(0b100), []byte{1}),
+			NewLeaf(big.NewInt(0b100), []byte{9}),
+			NewLeaf(big.NewInt(0b111), []byte{3}),
+		}
+		result := snapshot.AddLeavesClassified(batch)
+
+		require.Equal(t, []int{2}, result.AddedIndexes)
+		require.Equal(t, []int{0}, result.DuplicateIndexes)
+		require.Len(t, result.Rejected, 1)
+		require.Equal(t, 1, result.Rejected[0].Index)
+		require.ErrorIs(t, result.Rejected[0].Err, ErrLeafModification)
+
+		snapshot.Commit(threadSafeSMT)
+		unchanged, err := threadSafeSMT.GetLeaf(big.NewInt(0b100))
+		require.NoError(t, err)
+		require.Equal(t, []byte{1}, unchanged.Value)
+		added, err := threadSafeSMT.GetLeaf(big.NewInt(0b111))
+		require.NoError(t, err)
+		require.Equal(t, []byte{3}, added.Value)
+	})
+
 	t.Run("ConcurrentSnapshots", func(t *testing.T) {
 		// Create ThreadSafeSMT instance with initial data
 		smtInstance := NewSparseMerkleTree(api.SHA256, 2)

From 84389bf11511b8362ac6228b7412400a61b88184 Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Mon, 11 May 2026 19:29:19 +0300
Subject: [PATCH 11/13] test: handle perf response marshal errors

---
 cmd/performance-test/main.go | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/cmd/performance-test/main.go b/cmd/performance-test/main.go
index d4b9307..ef27326 100644
--- a/cmd/performance-test/main.go
+++ b/cmd/performance-test/main.go
@@ -148,7 +148,10 @@ func getStartingBlock(sc *ShardClient) (int64, error) {
 	}
 
 	var heightResp GetBlockHeightResponse
-	respBytes, _ := json.Marshal(resp.Result)
+	respBytes, err := json.Marshal(resp.Result)
+	if err != nil {
+		return 0, fmt.Errorf("marshal block height response: %w", err)
+	}
 	if err := json.Unmarshal(respBytes, &heightResp); err != nil {
 		return 0, fmt.Errorf("parse block height response: %w", err)
 	}
@@ -565,7 +568,15 @@ func commitmentWorker(ctx context.Context, shardClients []*ShardClient, metrics
 
 				// Parse response
 				var submitResp api.CertificationResponse
-				respBytes, _ := json.Marshal(resp.Result)
+				respBytes, err := json.Marshal(resp.Result)
+				if err != nil {
+					atomic.AddInt64(&metrics.failedRequests, 1)
+					if sm := metrics.shard(shardIdx); sm != nil {
+						sm.failedRequests.Add(1)
+					}
+					metrics.recordError(fmt.Sprintf("submit failed: marshal response result: %v", err))
+					return
+				}
 				if err := json.Unmarshal(respBytes, &submitResp); err != nil {
 					atomic.AddInt64(&metrics.failedRequests, 1)
 					return
@@ -718,7 +729,15 @@ func verifyProofJob(ctx context.Context, shardClients []*ShardClient, metrics *M
 		}
 
 		var proofResp api.GetInclusionProofResponseV2
-		respBytes, _ := json.Marshal(resp.Result)
+		respBytes, err := json.Marshal(resp.Result)
+		if err != nil {
+			metrics.recordError(fmt.Sprintf("Failed to marshal proof response: %v", err))
+			atomic.AddInt64(&metrics.proofFailed, 1)
+			if sm := metrics.shard(shardIdx); sm != nil {
+				sm.proofFailed.Add(1)
+			}
+			return
+		}
 		if err := json.Unmarshal(respBytes, &proofResp); err != nil {
 			metrics.recordError(fmt.Sprintf("Failed to parse proof response: %v", err))
 			atomic.AddInt64(&metrics.proofFailed, 1)

From 1ac68dc6eab2b2da6fb9ccf929e43e010c82916b Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Tue, 12 May 2026 11:13:09 +0300
Subject: [PATCH 12/13] update docs

---
 docs/aggregator-performance.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/aggregator-performance.md b/docs/aggregator-performance.md
index 2460706..a6a0a52 100644
--- a/docs/aggregator-performance.md
+++ b/docs/aggregator-performance.md
@@ -83,6 +83,15 @@ Current low-latency settings:
 | 8,000 | 239,807 / 239,807 | 239,807 / 239,807 | 1.129s | 1.677s | 975ms | 1.275s | 490ms | 86ms | 5,225 | 40.8% | 453% | 213% | pass |
 | 9,000 | 269,241 / 269,241 | 269,241 / 269,241 | 1.197s | 1.779s | 1.050s | 1.382s | 537ms | 103ms | 6,464 | 42.6% | 503% | 210% | boundary |
 
+### Lower BFT Cadence Variant
+
+This variant uses `ROOT_BLOCK_RATE=350`, `PRECOLLECTOR_GRACE_PERIOD=75ms`, `PROOF_INITIAL_DELAY=1s`, and `PROOF_RETRY_DELAY=500ms`. At 8k/s, most proofs are ready on the first 1s poll, while the remaining tail succeeds on the second poll.
+
+| Target RPS | Submitted | Proofs verified | Attempt 1 proofs | Client proof p50 | Client proof p95 | Server proofReady p50 | Server proofReady p95 | BFT wait | Finalization | Commitments / round | Host CPU busy | Aggregator CPU avg | Mongo CPU avg | Result |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
+| 4,000 | 119,704 / 119,704 | 119,704 / 119,704 | 119,200 / 119,704 (99.6%) | 1.013s | 1.027s | 782ms | 998ms | 415ms | 40ms | 2,090 | 30.1% / 77.1% | 196% | 160% | pass |
+| 8,000 | 239,843 / 239,843 | 239,843 / 239,843 | 189,159 / 239,843 (78.9%) | 1.024s | 1.570s | 885ms | 1.161s | 442ms | 89ms | 4,818 | 41.3% / 82.4% | 404% | 208% | pass |
+
 ## Gateway Throughput
 
 Gateway tests use the fixed 2-shard `bft-sharding-compose.yml` stack. Request path: gateway -> HAProxy -> active shard leader.

From b6bc68cbc6a8b88f028eca9705eb97b8dc4c51c0 Mon Sep 17 00:00:00 2001
From: jait91 <johannesait91@gmail.com>
Date: Fri, 15 May 2026 11:26:30 +0300
Subject: [PATCH 13/13] PR fixes

---
 bft-sharding-compose.yml                      |  6 ++++++
 docs/aggregator-performance.md                |  8 ++++----
 internal/gateway/handlers_rest.go             |  2 +-
 internal/gateway/handlers_rest_test.go        |  5 +++--
 internal/round/round_manager.go               |  9 ++-------
 internal/service/parent_service.go            |  6 +++---
 internal/service/service.go                   |  9 ++++++---
 internal/storage/mongodb/aggregator_record.go |  2 ++
 internal/storage/mongodb/batch_insert.go      |  3 +++
 pkg/api/types.go                              | 10 ++++++++++
 10 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/bft-sharding-compose.yml b/bft-sharding-compose.yml
index 2658fa3..7544349 100644
--- a/bft-sharding-compose.yml
+++ b/bft-sharding-compose.yml
@@ -1,3 +1,8 @@
+# This stack uses the shared external Docker network `aggregator-go_default` so
+# gateway/proxy containers can join the same network. The Makefile targets create
+# it automatically; direct docker compose users can create it with:
+#   docker network create aggregator-go_default
+#
 # Example 4k/s perf run against this fixed 2-shard compose stack from the repo root:
 #   LOG_LEVEL=info make docker-run-bft-sh-clean
 #
@@ -64,6 +69,7 @@ services:
           echo "Signing root trust base..." &&
           ubft trust-base sign --home /genesis-root --trust-base /genesis/trust-base.json
         fi
+        # ROOT_BLOCK_RATE is in milliseconds.
         echo "Starting root node..." &&
         exec ubft root-node run --home /genesis-root --address "/ip4/0.0.0.0/tcp/8000" --trust-base /genesis/trust-base.json --rpc-server-address "0.0.0.0:8002" --block-rate "${ROOT_BLOCK_RATE:-900}"
 
diff --git a/docs/aggregator-performance.md b/docs/aggregator-performance.md
index a6a0a52..7d04e70 100644
--- a/docs/aggregator-performance.md
+++ b/docs/aggregator-performance.md
@@ -19,7 +19,7 @@ The default-cadence single-shard matrix uses the relevant `bft-sharding-compose.
 | `MAX_COMMITMENTS_PER_ROUND` | `20000` |
 | `REDIS_ACK_BATCH_SIZE` | `10000` |
 | `CONCURRENCY_LIMIT` | `10000` |
-| `ROOT_BLOCK_RATE` | `900` |
+| `ROOT_BLOCK_RATE` (ms) | `900` |
 | `PROOF_INITIAL_DELAY` | `2s` |
 | `PROOF_RETRY_DELAY` | `1s` |
 | Host CPU | AMD Ryzen 9 5900XT, 16 cores / 32 threads |
@@ -56,13 +56,13 @@ On one machine, adding shards mostly redistributes work; it does not add CPU, di
 
 ## Low-Latency BFT Cadence
 
-These runs test whether lower BFT cadence can move proof latency closer to 1s. Keep this separate from the default `ROOT_BLOCK_RATE=900` throughput matrix because it changes consensus timing.
+These runs test whether lower BFT cadence can move proof latency closer to 1s. Keep this separate from the default-cadence throughput matrix (`ROOT_BLOCK_RATE=900`, in milliseconds) because it changes consensus timing.
 
 Current low-latency settings:
 
 | Setting | Value |
 |---|---:|
-| `ROOT_BLOCK_RATE` | `400` |
+| `ROOT_BLOCK_RATE` (ms) | `400` |
 | `PRECOLLECTOR_GRACE_PERIOD` | `100ms` |
 | `MONGODB_FINALIZATION_INSERT_CHUNK_SIZE` | `1000` |
 | `MONGODB_FINALIZATION_INSERT_CHUNK_WORKERS` | `16` |
@@ -85,7 +85,7 @@ Current low-latency settings:
 
 ### Lower BFT Cadence Variant
 
-This variant uses `ROOT_BLOCK_RATE=350`, `PRECOLLECTOR_GRACE_PERIOD=75ms`, `PROOF_INITIAL_DELAY=1s`, and `PROOF_RETRY_DELAY=500ms`. At 8k/s, most proofs are ready on the first 1s poll, while the remaining tail succeeds on the second poll.
+This variant uses `ROOT_BLOCK_RATE=350` (ms), `PRECOLLECTOR_GRACE_PERIOD=75ms`, `PROOF_INITIAL_DELAY=1s`, and `PROOF_RETRY_DELAY=500ms`. At 8k/s, most proofs are ready on the first 1s poll, while the remaining tail succeeds on the second poll.
 
 | Target RPS | Submitted | Proofs verified | Attempt 1 proofs | Client proof p50 | Client proof p95 | Server proofReady p50 | Server proofReady p95 | BFT wait | Finalization | Commitments / round | Host CPU busy | Aggregator CPU avg | Mongo CPU avg | Result |
 |---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
diff --git a/internal/gateway/handlers_rest.go b/internal/gateway/handlers_rest.go
index 8d4d153..24f664f 100644
--- a/internal/gateway/handlers_rest.go
+++ b/internal/gateway/handlers_rest.go
@@ -63,7 +63,7 @@ func (s *Server) handleLeaderHealth(c *gin.Context) {
 
 func isLeaderHealthRole(role string) bool {
 	switch role {
-	case "leader", "standalone", "parent-leader", "parent-standalone":
+	case api.HealthRoleLeader, api.HealthRoleStandalone, api.HealthRoleParentLeader, api.HealthRoleParentStandalone:
 		return true
 	default:
 		return false
diff --git a/internal/gateway/handlers_rest_test.go b/internal/gateway/handlers_rest_test.go
index 68dc519..276638f 100644
--- a/internal/gateway/handlers_rest_test.go
+++ b/internal/gateway/handlers_rest_test.go
@@ -16,6 +16,7 @@ import (
 	"github.com/unicitynetwork/bft-go-base/types"
 
 	"github.com/unicitynetwork/aggregator-go/internal/logger"
+	"github.com/unicitynetwork/aggregator-go/pkg/api"
 )
 
 func Test_GetTrustBases(t *testing.T) {
@@ -94,10 +95,10 @@ func Test_GetTrustBases(t *testing.T) {
 }
 
 func TestIsLeaderHealthRole(t *testing.T) {
-	for _, role := range []string{"leader", "standalone", "parent-leader", "parent-standalone"} {
+	for _, role := range []string{api.HealthRoleLeader, api.HealthRoleStandalone, api.HealthRoleParentLeader, api.HealthRoleParentStandalone} {
 		require.True(t, isLeaderHealthRole(role), role)
 	}
-	for _, role := range []string{"follower", "parent-follower", ""} {
+	for _, role := range []string{api.HealthRoleFollower, api.HealthRoleParentFollower, ""} {
 		require.False(t, isLeaderHealthRole(role), role)
 	}
 }
diff --git a/internal/round/round_manager.go b/internal/round/round_manager.go
index cf45a56..44b87e9 100644
--- a/internal/round/round_manager.go
+++ b/internal/round/round_manager.go
@@ -173,11 +173,6 @@ func NewRoundManager(
 	threadSafeSmt *smt.ThreadSafeSMT,
 	trustBaseProvider interfaces.TrustBaseProvider,
 ) (*RoundManager, error) {
-	commitmentStreamBufferSize := cfg.Processing.CommitmentStreamBufferSize
-	if commitmentStreamBufferSize <= 0 {
-		commitmentStreamBufferSize = 10000
-	}
-
 	rm := &RoundManager{
 		config:              cfg,
 		logger:              logger,
@@ -187,8 +182,8 @@ func NewRoundManager(
 		rootClient:          rootAggregatorClient,
 		stateTracker:        stateTracker,
 		eventBus:            eventBus,
-		roundDuration:       cfg.Processing.RoundDuration,                                        // Configurable round duration (default 1s)
-		commitmentStream:    make(chan *models.CertificationRequest, commitmentStreamBufferSize), // Buffer for queue streamer
+		roundDuration:       cfg.Processing.RoundDuration,                                                       // Configurable round duration (default 1s)
+		commitmentStream:    make(chan *models.CertificationRequest, cfg.Processing.CommitmentStreamBufferSize), // Buffer for queue streamer
 		proofPending:        make(map[string]struct{}),
 		avgProcessingRate:   1.0,                    // Initial estimate: 1 commitment per ms
 		avgFinalizationTime: 200 * time.Millisecond, // Initial estimate (conservative)
diff --git a/internal/service/parent_service.go b/internal/service/parent_service.go
index 0b5fa8a..809737f 100644
--- a/internal/service/parent_service.go
+++ b/internal/service/parent_service.go
@@ -289,12 +289,12 @@ func (pas *ParentAggregatorService) GetHealthStatus(ctx context.Context) (*api.H
 		}
 
 		if isLeader {
-			role = "parent-leader"
+			role = api.HealthRoleParentLeader
 		} else {
-			role = "parent-follower"
+			role = api.HealthRoleParentFollower
 		}
 	} else {
-		role = "parent-standalone"
+		role = api.HealthRoleParentStandalone
 		isLeader = true
 	}
 
diff --git a/internal/service/service.go b/internal/service/service.go
index 5152c29..97963f2 100644
--- a/internal/service/service.go
+++ b/internal/service/service.go
@@ -235,6 +235,9 @@ func (as *AggregatorService) GetInclusionProofV2(ctx context.Context, req *api.G
 		return nil, fmt.Errorf("unexpected SMT key length: got %d bits, want %d", keyLen, api.StateTreeKeyLengthBits)
 	}
 
+	// Known-pending requests return the latest finalized UC with an empty proof.
+	// This is only a cheap "not ready" response; it does not identify the block
+	// where the pending state will eventually finalize.
 	if block, ok := as.roundManager.GetKnownNotReadyBlock(req.StateID); ok {
 		responseBlockNumber, err := proofBundleBlockNumber(as.config.Sharding.Mode, block)
 		if err != nil {
@@ -435,12 +438,12 @@ func (as *AggregatorService) GetHealthStatus(ctx context.Context) (*api.HealthSt
 		}
 
 		if isLeader {
-			role = "leader"
+			role = api.HealthRoleLeader
 		} else {
-			role = "follower"
+			role = api.HealthRoleFollower
 		}
 	} else {
-		role = "standalone"
+		role = api.HealthRoleStandalone
 	}
 
 	sharding := buildShardingHealth(as.config)
diff --git a/internal/storage/mongodb/aggregator_record.go b/internal/storage/mongodb/aggregator_record.go
index 51df8f5..3f11fc0 100644
--- a/internal/storage/mongodb/aggregator_record.go
+++ b/internal/storage/mongodb/aggregator_record.go
@@ -134,6 +134,8 @@ func (ars *AggregatorRecordStorage) GetByBlockNumber(ctx context.Context, blockN
 		return nil, fmt.Errorf("cursor error: %w", err)
 	}
 
+	// Preserve get_block_records ordering after dropping the write-heavy
+	// {blockNumber, leafIndex} Mongo index.
 	sort.SliceStable(records, func(i, j int) bool {
 		left := records[i].LeafIndex
 		right := records[j].LeafIndex
diff --git a/internal/storage/mongodb/batch_insert.go b/internal/storage/mongodb/batch_insert.go
index ae55d8c..0385609 100644
--- a/internal/storage/mongodb/batch_insert.go
+++ b/internal/storage/mongodb/batch_insert.go
@@ -13,6 +13,9 @@ type finalizationInsertOptions struct {
 	workers   int
 }
 
+// insertManyFinalizationBatch is used for idempotent finalization writes. On
+// non-duplicate errors, some chunks may already be inserted; callers must
+// tolerate retry/recovery, with duplicate-key errors ignored on replay.
 func insertManyFinalizationBatch(
 	ctx context.Context,
 	collection *mongo.Collection,
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 6adc3d7..07e2b51 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -251,6 +251,16 @@ const (
 	HealthStatusDegraded  = "degraded"
 )
 
+// Health role values returned by the health endpoint; AggregatorService reports the plain roles and ParentAggregatorService the Parent* ones.
+const (
+	HealthRoleLeader           = "leader"
+	HealthRoleFollower         = "follower"
+	HealthRoleStandalone       = "standalone"
+	HealthRoleParentLeader     = "parent-leader"
+	HealthRoleParentFollower   = "parent-follower"
+	HealthRoleParentStandalone = "parent-standalone"
+)
+
 // SubmitShardRootRequest represents the submit_shard_root JSON-RPC request
 type SubmitShardRootRequest struct {
 	ShardID  ShardID  `json:"shardId"`