Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 196 additions & 0 deletions cmd/run_db_lifecycle_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
package main

// Tests for FGP #8 — "BoltDB handles never closed in runNode — leaked
// locks prevent clean restart".
//
// Issue: https://github.com/unicitynetwork/finality-gadget/issues/8
// Fix PR: https://github.com/unicitynetwork/finality-gadget/pull/13
// Fix commit: 1718933 "close databases"
//
// The fix added three `defer db.Close()` calls to cmd/run.go at lines
// 139, 179, 197 for shardConfDB, trustBaseDB, blockDB respectively. It
// also changed initDB's return type from keyvaluedb.KeyValueDB (interface,
// no Close()) to *boltdb.BoltDB (concrete, has Close()).
//
// What these tests prove:
//
// T1 — `Test_BoltDB_ExclusiveLock_HeldByOpener`:
// BoltDB takes an exclusive lock while open. A second open of the
// SAME path while the first handle is still open MUST fail.
// This establishes that the bug premise is real — without defer
// Close, a process trying to re-open its own DB will deadlock or
// fail.
//
// T2 — `Test_BoltDB_LockReleased_AfterClose`:
// After Close(), the same path can be re-opened. This is the
// mechanism the fix relies on — defer Close() fires, lock is
// released, next opener succeeds.
//
// T3 — `Test_RunNode_ReleasesAllThreeDBLocksOnReturn`:
// Integration-level: replicate runNode's exact DB-opening
// sequence (3 BoltDBs) including the deferred Close() calls,
// then in phase 2 attempt to re-open all three files. All three
// re-opens MUST succeed. If any `defer Close()` were missing,
// the corresponding re-open would fail. This is the symptom-
// level test of the #8 fix.
//
// NOT covered (out of scope of this fix):
// - The two "Related Issues" from the #8 body: logger Sync() on
// shutdown and private-key zeroing. The fix silently dropped both;
// see INVESTIGATIONS.md §F8-related entries for the gap.

import (
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/require"
"github.com/unicitynetwork/bft-core/keyvaluedb/boltdb"
)

// T1 — BoltDB's exclusive lock is real
func Test_BoltDB_ExclusiveLock_HeldByOpener(t *testing.T) {
path := filepath.Join(t.TempDir(), "test.db")

db1, err := boltdb.New(path)
require.NoError(t, err, "first open should succeed")
t.Cleanup(func() { _ = db1.Close() })

// Second open of the same path while db1 is still held.
// Run in a goroutine with a short timeout in case the underlying
// implementation blocks on flock() instead of returning an error.
done := make(chan error, 1)
go func() {
db2, err := boltdb.New(path)
if db2 != nil {
_ = db2.Close()
}
done <- err
}()

select {
case err := <-done:
require.Error(t, err,
"second open of locked DB should fail (got nil error — lock not enforced?)")
t.Logf("second open correctly rejected: %v", err)
case <-time.After(3 * time.Second):
t.Logf("second open blocked (>3s) — flock() is being held; this also proves the lock is exclusive")
// Blocking is also acceptable behavior — it proves the lock is held.
// The fix's `defer Close()` would release the lock so a real restart succeeds.
}
}

// T2 — closing releases the lock
func Test_BoltDB_LockReleased_AfterClose(t *testing.T) {
path := filepath.Join(t.TempDir(), "test.db")

db1, err := boltdb.New(path)
require.NoError(t, err)
require.NoError(t, db1.Close(), "close must succeed")

db2, err := boltdb.New(path)
require.NoError(t, err, "re-open after close must succeed")
require.NoError(t, db2.Close())
}

// T3 — runNode's exact 3-DB sequence releases all locks on return
//
// This test replicates the structure of cmd/run.go lines ~135-198:
//
// shardConfDB, _ := flags.initDB(...)
// defer shardConfDB.Close() // line 139, added by 1718933
// trustBaseDB, _ := flags.initDB(...)
// defer trustBaseDB.Close() // line 179, added by 1718933
// blockDB, _ := flags.initDB(...)
// defer blockDB.Close() // line 197, added by 1718933
//
// We do not invoke the real runNode (would require keys.json, shard-conf.json,
// trust-base.json fixtures and a configured logger). Instead we directly
// exercise flags.initDB() in the same order with the same defers, then in
// phase 2 attempt to re-open every file.
//
// Pre-fix behavior (no defers): phase 2 re-opens would either fail with a
// lock error or block indefinitely. Post-fix: all three re-opens succeed.
func Test_RunNode_ReleasesAllThreeDBLocksOnReturn(t *testing.T) {
home := t.TempDir()
flags := &cliFlags{HomeDir: home}

// Phase 1: emulate runNode's DB-opening block.
func() {
shardDB, err := flags.initDB("", shardConfDBFileName)
require.NoError(t, err)
defer shardDB.Close()

tbDB, err := flags.initDB("", "trustbase.db")
require.NoError(t, err)
defer tbDB.Close()

blockDB, err := flags.initDB("", blockDBFileName)
require.NoError(t, err)
defer blockDB.Close()

// Inside the func, all three DBs are open with deferred closes.
// Verify locks are actually held right now (negative control).
for _, name := range []string{shardConfDBFileName, "trustbase.db", blockDBFileName} {
path := filepath.Join(home, name)
done := make(chan error, 1)
go func(p string) {
db, err := boltdb.New(p)
if db != nil {
_ = db.Close()
}
done <- err
}(path)
select {
case err := <-done:
require.Error(t, err,
"while runNode-style block is live, %s should be locked (got nil error)", name)
case <-time.After(500 * time.Millisecond):
// blocking is also acceptable proof of lock
}
}
}() // <- all three defers fire here, simulating runNode returning

// Phase 2: every file must be re-openable now that defers have fired.
for _, name := range []string{shardConfDBFileName, "trustbase.db", blockDBFileName} {
path := filepath.Join(home, name)
require.FileExists(t, path)

db, err := boltdb.New(path)
require.NoError(t, err,
"expected re-open of %s to succeed after defers fired (this is the #8 fix)", name)
require.NoError(t, db.Close())
}
}

// T4 — double-call of runNode-style pattern (graceful restart symptom)
//
// Drives the same 3-DB open/close pattern TWICE against the same FGP_HOME,
// asserting both invocations succeed. This is the direct analogue of:
//
// $ fgp run # ctrl-C → returns
// $ fgp run # must work
//
// Pre-fix this would have either (a) leaked the BoltDB handle within the
// same process, blocking the second call, or (b) corrupted state from
// in-flight writes that weren't synced before return.
func Test_RunNode_TwoSequentialInvocationsBothSucceed(t *testing.T) {
home := t.TempDir()
flags := &cliFlags{HomeDir: home}

runOnce := func(t *testing.T) {
t.Helper()
shardDB, err := flags.initDB("", shardConfDBFileName)
require.NoError(t, err)
defer shardDB.Close()
tbDB, err := flags.initDB("", "trustbase.db")
require.NoError(t, err)
defer tbDB.Close()
blockDB, err := flags.initDB("", blockDBFileName)
require.NoError(t, err)
defer blockDB.Close()
}

runOnce(t) // first invocation
runOnce(t) // second invocation — must succeed
}
194 changes: 194 additions & 0 deletions logger/f13_sync_gap_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package logger

// Tests for F13 — finality-gadget #8 "Related Issue":
// "Log file handle in logger/slog.go:192-197 lacks Sync() on shutdown,
// risking data loss"
//
// Issue: https://github.com/unicitynetwork/finality-gadget/issues/8
// Tracked: INVESTIGATIONS.md §F13 (aggregator-subscription)
//
// These tests probe whether the F13 concern is VALID and CURRENTLY UNADDRESSED.
// They are negative-assertion tests — they pass today (documenting the gap)
// and should be UPDATED (require.NoError → require.True for "exposed API")
// once the production fix lands.
//
// What we want to know:
//
// Q1: Is there a public API on the logger package that an operator can
// call to flush log buffers before shutdown?
// Test G1: confirms NO such API exists today.
//
// Q2: Is the log file's underlying *os.File accessible by callers (so they
// could call Sync() on it themselves)?
// Test G2: confirms NO — `filenameToWriter` returns io.Writer, and the
// returned *slog.Logger does not expose the *os.File.
//
// Q3: Without an explicit Sync(), does graceful shutdown (process exit
// after `cmd/main.go:quitSignalContext()` cancels ctx) actually lose
// log lines?
// Test G3: writes N lines, drops the logger reference, then re-reads
// the file from disk to count lines. With Go's *os.File.Write being
// unbuffered (one write() syscall per line), this measures whether
// data is lost in practice.
// → Outcome predicts whether the issue body's "risking data loss"
// framing is real for the process-exit case, or whether (like #8's
// lock framing) it's only true for system-level crashes.

import (
"bufio"
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"reflect"
"strings"
"testing"

"github.com/stretchr/testify/require"
)

// G1 — no Close()/Sync() method on the package's public API surface.
//
// `logger.New()` returns `*slog.Logger`. There is no `*Logger` wrapper type
// in this package that exposes Close/Sync, and `*slog.Logger` itself has no
// Close/Sync method. Therefore the operator has no way to flush log buffers
// at shutdown without reaching past the package abstraction.
func Test_F13_LoggerHasNoCloseOrSyncAPI_DocumentsGap(t *testing.T) {
cfg := &LogConfiguration{
Format: "json",
OutputPath: filepath.Join(t.TempDir(), "g1.log"),
}
log, err := New(cfg)
require.NoError(t, err)
require.NotNil(t, log)

logType := reflect.TypeOf(log)
for i := 0; i < logType.NumMethod(); i++ {
name := logType.Method(i).Name
require.NotEqual(t, "Close", name, "GAP CLOSED: *slog.Logger now exposes Close()")
require.NotEqual(t, "Sync", name, "GAP CLOSED: *slog.Logger now exposes Sync()")
require.NotEqual(t, "Flush", name, "GAP CLOSED: *slog.Logger now exposes Flush()")
}
t.Logf("GAP confirmed: %T has no Close/Sync/Flush method", log)
}

// G2 — `filenameToWriter` returns the bare `io.Writer`. The caller of
// `New(cfg)` gets a `*slog.Logger`, with no way to recover the underlying
// `*os.File` to call Sync() on it themselves.
func Test_F13_FileWriterNotRetrievableFromLogger_DocumentsGap(t *testing.T) {
logPath := filepath.Join(t.TempDir(), "g2.log")
cfg := &LogConfiguration{
Format: "json",
OutputPath: logPath,
}
log, err := New(cfg)
require.NoError(t, err)

// Walk all exported fields/methods looking for *os.File access.
v := reflect.ValueOf(log).Elem()
for i := 0; i < v.NumField(); i++ {
f := v.Field(i)
require.NotEqual(t, "*os.File", f.Type().String(),
"GAP CLOSED: logger now exposes a *os.File field that callers could Sync()")
}
t.Logf("GAP confirmed: no exported *os.File on %T — caller cannot Sync() the log file", log)
}

// G3 — How much data is actually at risk?
//
// Write N JSON log lines. Do not call any Close/Sync. Re-read the file
// directly from disk (separate fd) and count lines. With Go's *os.File.Write
// being unbuffered (one write() syscall per Write), the kernel page cache
// should already contain every line, and the read should see all N — even
// without an explicit flush.
//
// What this DOES prove if it passes (we expect it to):
// • No data loss on graceful process exit *as long as the kernel survives*.
// Process exit → kernel flushes its page cache eventually.
// • The "data loss" framing in #8's body is overstated for the process-
// crash case — only system crashes (kernel panic, power loss) would lose
// un-Sync()'d data.
//
// What this does NOT cover:
// • Buffered handlers (some slog implementations buffer internally; the
// stdlib JSON handler does not — verified empirically).
// • System-level crashes — those still risk data loss without Sync().
//
// If THIS test ever starts failing (lines missing), it would mean slog or
// the chosen handler started buffering internally; that's a separate fact
// the F13 concern would then be REAL for graceful exit too.
func Test_F13_LinesPersistedWithoutSync_AssessesConcernSeverity(t *testing.T) {
logPath := filepath.Join(t.TempDir(), "g3.log")
cfg := &LogConfiguration{
Format: "json",
OutputPath: logPath,
Level: slog.LevelInfo.String(),
}
log, err := New(cfg)
require.NoError(t, err)

const N = 500
for i := 0; i < N; i++ {
log.LogAttrs(context.Background(), slog.LevelInfo, "test-line",
slog.Int("seq", i),
slog.String("payload", strings.Repeat("x", 64)),
)
}

// No Sync, no Close. Just drop the reference and read the file.
log = nil
_ = log

f, err := os.Open(logPath)
require.NoError(t, err)
defer f.Close()

count := 0
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 1<<16), 1<<20)
for scanner.Scan() {
count++
}
require.NoError(t, scanner.Err())

if count == N {
t.Logf("GAP SEVERITY = LOW for graceful exit: all %d lines on disk without Sync. "+
"#8's 'risking data loss' claim only applies to SYSTEM crashes (kernel panic, "+
"power loss), not process exit. Fix is still worth adding for system-level "+
"durability + symmetry with BoltDB close, but the framing in the issue body is "+
"overstated for the process-crash case.", N)
} else {
t.Logf("GAP SEVERITY = HIGH: only %d of %d lines on disk without Sync — buffering "+
"is in play. F13 concern is REAL for graceful exit too.", count, N)
}

// Either outcome documents the current behavior — both are informative.
// The test is intentionally not a require.Equal — we're observing, not gating.
require.Greater(t, count, 0, "logger produced zero lines — config or path broken")

// Sanity: at least one line should have made it to disk.
if count < N {
// Print the gap explicitly so a reader of CI logs can see it.
t.Logf("buffered lines NOT on disk: %d (of %d)", N-count, N)
}
}

// G4 — Cross-check: if someone DID reach into the package internals and grab
// the *os.File, would Sync() actually work on it?
//
// This isn't testing the gap; it's testing that the fix is feasible. If
// Sync() returned an error or no-op on the file type the package uses,
// the fix would be harder than just "expose a Close method." We expect
// Sync to work cleanly.
func Test_F13_OsFile_SyncWorks_FixIsFeasible(t *testing.T) {
logPath := filepath.Join(t.TempDir(), "g4.log")
f, err := os.OpenFile(logPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
require.NoError(t, err)
defer f.Close()

_, err = f.WriteString(fmt.Sprintf("test line at %s\n", t.Name()))
require.NoError(t, err)
require.NoError(t, f.Sync(), "Sync() must work on the underlying *os.File")
t.Logf("Sync() works on the *os.File used by filenameToWriter — the F13 fix is feasible")
}
Loading