diff --git a/.changelog/6311.feature.md b/.changelog/6311.feature.md
new file mode 100644
index 00000000000..d091b6e0bd0
--- /dev/null
+++ b/.changelog/6311.feature.md
@@ -0,0 +1,8 @@
+go/oasis-node: Add new command for compacting consensus databases
+
+A new experimental command `oasis-node storage compact-experimental`
+was added.
+
+The command triggers manual compaction of all the consensus databases.
+This way node operators can forcefully release disk space when pruning is
+enabled late.
diff --git a/docs/oasis-node/cli.md b/docs/oasis-node/cli.md
index 61cfb14faf1..464540bf148 100644
--- a/docs/oasis-node/cli.md
+++ b/docs/oasis-node/cli.md
@@ -331,3 +331,36 @@ response:
 ```
 oasis1qqncl383h8458mr9cytatygctzwsx02n4c5f8ed7
 ```
+
+## storage
+
+### compact-experimental
+
+Run
+
+```sh
+oasis-node storage compact-experimental --config /path/to/config/file
+```
+
+to trigger manual compaction of consensus database instances:
+
+```sh
+{"caller":"storage.go:310","level":"info","module":"cmd/storage", \
+"msg":"Starting database compactions. This may take a while...", \
+"ts":"2025-10-08T09:18:22.185451554Z"}
+```
+
+If pruning was not enabled from the start, or was only recently made more
+aggressive, disk usage may stay the same even after successful pruning.
+
+This is due to the LSM-tree storage design that BadgerDB uses. Concretely,
+deleting a key only marks it as ready to be deleted (a tombstone entry). The
+actual removal of the stale data happens later, during compaction.
+
+During normal operation, compaction happens in the background. However, BadgerDB
+is intentionally lazy, trading disk space for write throughput among other
+things. Therefore, when pruning is enabled late, it is expected that disk space
+may stay constant or not be reclaimed for a very long time.
+
+This command gives operators manual control to release disk space during
+maintenance periods.
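For readers who want to see what the command does to each database under the hood, below is a minimal standalone sketch (not part of the changeset): it opens a single CometBFT BadgerDB instance directly with `badger.Open` and forces a full compaction with `Flatten`, which is the per-database operation the new command performs. The path is illustrative, and the node must be stopped while the database directory is open.

```go
package main

import (
	"log"

	badger "github.com/dgraph-io/badger/v4"
)

func main() {
	// Illustrative path; the node keeps its CometBFT databases under
	// <datadir>/consensus/data/*.db. The production code opens them via the
	// OpenBadger helper introduced below, which also configures compression
	// and caching.
	db, err := badger.Open(badger.DefaultOptions("/node/data/consensus/data/blockstore.badger.db"))
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	// Flatten forces a full compaction, merging all LSM levels into one and
	// dropping stale entries (e.g. tombstones left behind by pruning).
	if err := db.Flatten(1); err != nil {
		log.Fatalf("compaction failed: %v", err)
	}
}
```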
diff --git a/go/consensus/cometbft/db/badger/badger.go b/go/consensus/cometbft/db/badger/badger.go
index 71695c95d28..45abc9a8e15 100644
--- a/go/consensus/cometbft/db/badger/badger.go
+++ b/go/consensus/cometbft/db/badger/badger.go
@@ -63,13 +63,7 @@ func New(fn string, noSuffix bool) (dbm.DB, error) {
 	logger := baseLogger.With("path", fn)
 
-	opts := badger.DefaultOptions(fn) // This may benefit from LSMOnlyOptions.
-	opts = opts.WithLogger(cmnBadger.NewLogAdapter(logger))
-	opts = opts.WithSyncWrites(false)
-	opts = opts.WithCompression(options.Snappy)
-	opts = opts.WithBlockCacheSize(64 * 1024 * 1024)
-
-	db, err := badger.Open(opts)
+	db, err := OpenBadger(fn, logger)
 	if err != nil {
 		return nil, fmt.Errorf("cometbft/db/badger: failed to open database: %w", err)
 	}
 
@@ -86,6 +80,17 @@ func New(fn string, noSuffix bool) (dbm.DB, error) {
 	return impl, nil
 }
 
+// OpenBadger opens the BadgerDB instance that is used to construct an instance
+// implementing the CometBFT DB interface.
+func OpenBadger(path string, logger *logging.Logger) (*badger.DB, error) {
+	opts := badger.DefaultOptions(path) // This may benefit from LSMOnlyOptions.
+	opts = opts.WithLogger(cmnBadger.NewLogAdapter(logger))
+	opts = opts.WithSyncWrites(false)
+	opts = opts.WithCompression(options.Snappy)
+	opts = opts.WithBlockCacheSize(64 * 1024 * 1024)
+	return badger.Open(opts)
+}
+
 func (d *badgerDBImpl) Get(key []byte) ([]byte, error) {
 	k := toDBKey(key)
 
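Extracting `OpenBadger` means the consensus database instances can now be opened outside of `New`, which is what the new command relies on. As a quick illustration (a sketch, not part of the changeset; the path is made up), the helper can also be used to inspect a database's on-disk footprint via BadgerDB's `Size`:

```go
package main

import (
	"fmt"
	"log"

	"github.com/oasisprotocol/oasis-core/go/common/logging"
	cmtBadger "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/db/badger"
)

func main() {
	logger := logging.GetLogger("example")

	// Illustrative path to one of the node's CometBFT databases.
	db, err := cmtBadger.OpenBadger("/node/data/consensus/data/blockstore.badger.db", logger)
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	// Size reports the on-disk size of the LSM tree and the value log.
	lsm, vlog := db.Size()
	fmt.Printf("lsm: %d bytes, value log: %d bytes\n", lsm, vlog)
}
```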
diff --git a/go/oasis-node/cmd/storage/storage.go b/go/oasis-node/cmd/storage/storage.go
index 323597ea6aa..baccd32a3ef 100644
--- a/go/oasis-node/cmd/storage/storage.go
+++ b/go/oasis-node/cmd/storage/storage.go
@@ -5,16 +5,22 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"io/fs"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"
 
+	badgerDB "github.com/dgraph-io/badger/v4"
 	"github.com/spf13/cobra"
 
 	"github.com/oasisprotocol/oasis-core/go/common"
 	"github.com/oasisprotocol/oasis-core/go/common/crypto/hash"
 	"github.com/oasisprotocol/oasis-core/go/common/logging"
 	"github.com/oasisprotocol/oasis-core/go/config"
+	"github.com/oasisprotocol/oasis-core/go/consensus/cometbft/abci"
+	cmtCommon "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/common"
+	cmtDBProvider "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/db/badger"
 	cmdCommon "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common"
 	roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
 	"github.com/oasisprotocol/oasis-core/go/runtime/bundle"
@@ -53,6 +59,17 @@ var (
 		RunE: doRenameNs,
 	}
 
+	storageCompactCmd = &cobra.Command{
+		Use:   "compact-experimental",
+		Args:  cobra.NoArgs,
+		Short: "EXPERIMENTAL: trigger compaction for all consensus databases",
+		Long: `EXPERIMENTAL: Optimize storage for all consensus databases by manually compacting the underlying storage engines.
+
+WARNING: Ensure you have at least as much free disk space as the size of your largest database.
+`,
+		RunE: doDBCompactions,
+	}
+
 	logger = logging.GetLogger("cmd/storage")
 
 	pretty = cmdCommon.Isatty(1)
@@ -283,6 +300,113 @@ func doRenameNs(_ *cobra.Command, args []string) error {
 	return nil
 }
 
+func doDBCompactions(_ *cobra.Command, args []string) error {
+	if err := cmdCommon.Init(); err != nil {
+		cmdCommon.EarlyLogAndExit(err)
+	}
+
+	dataDir := cmdCommon.DataDir()
+
+	logger.Info("Starting database compactions. This may take a while...")
+
+	// Compact CometBFT-managed databases: block store, evidence and state (NOT application state).
+	if err := compactCometDBs(dataDir); err != nil {
+		return fmt.Errorf("failed to compact CometBFT managed databases: %w", err)
+	}
+
+	if err := compactConsensusNodeDB(dataDir); err != nil {
+		return fmt.Errorf("failed to compact consensus NodeDB: %w", err)
+	}
+
+	return nil
+}
+
+func compactCometDBs(dataDir string) error {
+	paths, err := findCometDBs(dataDir)
+	if err != nil {
+		return fmt.Errorf("failed to find database instances: %w", err)
+	}
+	for _, path := range paths {
+		if err := compactCometDB(path); err != nil {
+			return fmt.Errorf("failed to compact %s: %w", path, err)
+		}
+	}
+	return nil
+}
+
+func compactCometDB(path string) error {
+	logger := logger.With("path", path)
+	db, err := cmtDBProvider.OpenBadger(path, logger)
+	if err != nil {
+		return fmt.Errorf("failed to open BadgerDB: %w", err)
+	}
+	defer db.Close()
+
+	if err := flattenBadgerDB(db, logger); err != nil {
+		return fmt.Errorf("failed to compact %s: %w", path, err)
+	}
+
+	return nil
+}
+
+func findCometDBs(dataDir string) ([]string, error) {
+	dir := filepath.Join(dataDir, "consensus", "data")
+
+	var dbDirs []string
+	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		if d.IsDir() && strings.HasSuffix(d.Name(), ".db") {
+			dbDirs = append(dbDirs, path)
+		}
+		return nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to walk dir %s: %w", dir, err)
+	}
+
+	if len(dbDirs) == 0 {
+		return nil, fmt.Errorf("no database instances found in %s", dir)
+	}
+
+	return dbDirs, nil
+}
+
+func flattenBadgerDB(db *badgerDB.DB, logger *logging.Logger) error {
+	logger.Info("compacting")
+
+	if err := db.Flatten(1); err != nil {
+		return fmt.Errorf("failed to flatten db: %w", err)
+	}
+
+	logger.Info("compaction completed")
+
+	return nil
+}
+
+func compactConsensusNodeDB(dataDir string) error {
+	ldb, ndb, _, err := abci.InitStateStorage(
+		&abci.ApplicationConfig{
+			DataDir:             filepath.Join(dataDir, cmtCommon.StateDir),
+			StorageBackend:      config.GlobalConfig.Storage.Backend,
+			MemoryOnlyStorage:   false,
+			ReadOnlyStorage:     false,
+			DisableCheckpointer: true,
+		},
+	)
+	if err != nil {
+		return fmt.Errorf("failed to initialize ABCI storage backend: %w", err)
+	}
+
+	// Close the resources. Both Close and Cleanup currently only close the NodeDB;
+	// close both here to prevent resource leaks in case this changes in the future.
+	defer ndb.Close()
+	defer ldb.Cleanup()
+
+	return ndb.Compact()
+}
+
 // Register registers the client sub-command and all of its children.
 func Register(parentCmd *cobra.Command) {
 	storageMigrateCmd.Flags().AddFlagSet(bundle.Flags)
@@ -290,5 +414,6 @@ func Register(parentCmd *cobra.Command) {
 	storageCmd.AddCommand(storageMigrateCmd)
 	storageCmd.AddCommand(storageCheckCmd)
 	storageCmd.AddCommand(storageRenameNsCmd)
+	storageCmd.AddCommand(storageCompactCmd)
 	parentCmd.AddCommand(storageCmd)
 }
diff --git a/go/storage/mkvs/db/api/api.go b/go/storage/mkvs/db/api/api.go
index 290ef31fcb1..d7a5fd6f8ff 100644
--- a/go/storage/mkvs/db/api/api.go
+++ b/go/storage/mkvs/db/api/api.go
@@ -184,6 +184,12 @@ type NodeDB interface {
 	// Only the earliest version can be pruned, passing any other version will result in an error.
 	Prune(version uint64) error
 
+	// Compact triggers compaction of the NodeDB's underlying storage engine.
+	//
+	// Warning: Depending on the NodeDB implementation, this may only be safe to call
+	// when no writes are in progress.
+	Compact() error
+
 	// Size returns the size of the database in bytes.
 	Size() (int64, error)
 
@@ -294,6 +300,10 @@ func (d *nopNodeDB) Prune(uint64) error {
 	return nil
 }
 
+func (d *nopNodeDB) Compact() error {
+	return nil
+}
+
 func (d *nopNodeDB) Size() (int64, error) {
 	return 0, nil
 }
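The new interface method composes naturally with the existing `Size` accessor. Below is a hypothetical helper (the package name, helper name, and reporting are illustrative, not part of this change) sketching how a caller could get a rough idea of how much space a compaction reclaimed:

```go
package dbutil

import (
	"fmt"

	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api"
)

// CompactAndReport compacts the given NodeDB and reports the difference in its
// reported size. The figure is only indicative: the underlying engine may not
// reflect all reclaimed space immediately.
func CompactAndReport(ndb api.NodeDB) error {
	before, err := ndb.Size()
	if err != nil {
		return fmt.Errorf("failed to query size: %w", err)
	}

	if err := ndb.Compact(); err != nil {
		return fmt.Errorf("compaction failed: %w", err)
	}

	after, err := ndb.Size()
	if err != nil {
		return fmt.Errorf("failed to query size: %w", err)
	}

	fmt.Printf("size before: %d bytes, after: %d bytes\n", before, after)
	return nil
}
```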
diff --git a/go/storage/mkvs/db/badger/badger.go b/go/storage/mkvs/db/badger/badger.go
index 6741b31220f..2ea64f95dde 100644
--- a/go/storage/mkvs/db/badger/badger.go
+++ b/go/storage/mkvs/db/badger/badger.go
@@ -101,6 +101,17 @@ func New(cfg *api.Config) (api.NodeDB, error) {
 	db.gc = cmnBadger.NewGCWorker(db.logger, db.db)
 	db.gc.Start()
 
+	// The BadgerDB discard timestamp is not persistent and is currently only set
+	// during the prune operation.
+	//
+	// Imagine a scenario where, during a previous run, data was successfully pruned
+	// but not yet compacted. The NodeDB is then restarted, this time with pruning disabled.
+	// Unless the discard timestamp is manually set to the earliest version, the data stored
+	// for the already pruned versions may never be compacted, resulting in wasted disk space.
+	if discardTs := versionToTs(db.GetEarliestVersion()) - 1; discardTs > tsMetadata {
+		db.db.SetDiscardTs(discardTs)
+	}
+
 	return db, nil
 }
 
@@ -915,6 +926,18 @@ func (d *badgerNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) (
 	}, nil
 }
 
+func (d *badgerNodeDB) Compact() error {
+	d.logger.Info("compacting")
+
+	if err := d.db.Flatten(1); err != nil {
+		return fmt.Errorf("failed to flatten db: %w", err)
+	}
+
+	d.logger.Info("compaction completed")
+
+	return nil
+}
+
 func (d *badgerNodeDB) Size() (int64, error) {
 	lsm, vlog := d.db.Size()
 	return lsm + vlog, nil
diff --git a/go/storage/mkvs/db/pathbadger/pathbadger.go b/go/storage/mkvs/db/pathbadger/pathbadger.go
index 5ee8961571a..4a33292bd03 100644
--- a/go/storage/mkvs/db/pathbadger/pathbadger.go
+++ b/go/storage/mkvs/db/pathbadger/pathbadger.go
@@ -58,6 +58,17 @@ func New(cfg *api.Config) (api.NodeDB, error) {
 	db.gc = cmnBadger.NewGCWorker(db.logger, db.db)
 	db.gc.Start()
 
+	// The BadgerDB discard timestamp is not persistent and is currently only set
+	// during the prune operation.
+	//
+	// Imagine a scenario where, during a previous run, data was successfully pruned
+	// but not yet compacted. The NodeDB is then restarted, this time with pruning disabled.
+	// Unless the discard timestamp is manually set to the earliest version, the data stored
+	// for the already pruned versions may never be compacted, resulting in wasted disk space.
+	if discardTs := versionToTs(db.GetEarliestVersion()) - 1; discardTs > tsMetadata {
+		db.db.SetDiscardTs(discardTs)
+	}
+
 	return db, nil
 }
 
@@ -726,6 +737,18 @@ func (d *badgerNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) (
 	}, nil
 }
 
+func (d *badgerNodeDB) Compact() error {
+	d.logger.Info("compacting")
+
+	if err := d.db.Flatten(1); err != nil {
+		return fmt.Errorf("failed to flatten db: %w", err)
+	}
+
+	d.logger.Info("compaction completed")
+
+	return nil
+}
+
 // Implements api.NodeDB.
 func (d *badgerNodeDB) Size() (int64, error) {
 	lsm, vlog := d.db.Size()
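The discard-timestamp logic added above ties the two halves of the change together: pruning (or, after this change, reopening the database) marks old versions as discardable, and compaction is what actually drops them. Below is a minimal standalone sketch of that interaction against a managed-mode BadgerDB (the path and version number are illustrative; the real code derives the timestamp from the earliest retained version via `versionToTs`):

```go
package main

import (
	"log"

	badger "github.com/dgraph-io/badger/v4"
)

func main() {
	// The node's MKVS databases run BadgerDB in managed mode, where the
	// application controls version timestamps. Path is illustrative.
	db, err := badger.OpenManaged(badger.DefaultOptions("/node/data/consensus/state/mkvs_storage.badger.db"))
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	// Mark versions up to this timestamp as safe to discard; without this,
	// compaction has to keep all older versions around.
	db.SetDiscardTs(42)

	// Force a compaction so the discardable versions are actually dropped.
	if err := db.Flatten(1); err != nil {
		log.Fatalf("compaction failed: %v", err)
	}
}
```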