From 18f1ff0b9d7c8c398ce6eee71469e51278a3e35b Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 18 Nov 2025 16:27:49 -0500 Subject: [PATCH] Add corruption recovery to history.db history.db currently lacks the automatic corruption recovery that cache.db has, causing BuildKit to fail on startup if history.db is corrupted. This is inconsistent since both databases are disposable (losing history is inconvenient but not fatal). This commit: - Extracts the safe database opening logic to util/db/boltutil/SafeOpen - Updates cache.db to use the shared SafeOpen function - Applies the same recovery mechanism to history.db The recovery mechanism backs up corrupted databases and creates fresh ones, allowing BuildKit to start successfully even after abrupt shutdowns or snapshot-related corruption (common with NoSync + network block devices like Ceph RBD). Fixes startup failures when history.db is corrupted, matching the resilience already present for cache.db since commit ccc06b7ff. Signed-off-by: Aditya Maru Signed-off-by: Claude --- cmd/buildkitd/main.go | 2 +- solver/bboltcachestorage/storage.go | 53 +-------------------------- util/db/boltutil/safe_open.go | 56 +++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 53 deletions(-) create mode 100644 util/db/boltutil/safe_open.go diff --git a/cmd/buildkitd/main.go b/cmd/buildkitd/main.go index b7d1ac8f5982..73684ce7d862 100644 --- a/cmd/buildkitd/main.go +++ b/cmd/buildkitd/main.go @@ -811,7 +811,7 @@ func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*co return nil, err } - historyDB, err := boltutil.Open(filepath.Join(cfg.Root, "history.db"), 0600, nil) + historyDB, err := boltutil.SafeOpen(filepath.Join(cfg.Root, "history.db"), 0600, nil) if err != nil { return nil, err } diff --git a/solver/bboltcachestorage/storage.go b/solver/bboltcachestorage/storage.go index c374bb6cd53a..55a9270ad0b9 100644 --- a/solver/bboltcachestorage/storage.go +++ b/solver/bboltcachestorage/storage.go @@ -4,11 +4,8 @@ import ( "bytes" "encoding/json" "fmt" - "os" - "github.com/moby/buildkit/identity" "github.com/moby/buildkit/solver" - "github.com/moby/buildkit/util/bklog" "github.com/moby/buildkit/util/db" "github.com/moby/buildkit/util/db/boltutil" digest "github.com/opencontainers/go-digest" @@ -28,7 +25,7 @@ type Store struct { } func NewStore(dbPath string) (*Store, error) { - db, err := safeOpenDB(dbPath, &bolt.Options{ + db, err := boltutil.SafeOpen(dbPath, 0600, &bolt.Options{ NoSync: true, }) if err != nil { @@ -465,51 +462,3 @@ func isEmptyBucket(b *bolt.Bucket) bool { k, _ := b.Cursor().First() return k == nil } - -// safeOpenDB opens a bolt database and recovers from panic that -// can be caused by a corrupted database file. -func safeOpenDB(dbPath string, opts *bolt.Options) (db db.DB, err error) { - defer func() { - if r := recover(); r != nil { - err = errors.Errorf("%v", r) - } - - // If we get an error when opening the database, but we have - // access to the file and the file looks like it has content, - // then fallback to resetting the database since the database - // may be corrupt. - if err != nil && fileHasContent(dbPath) { - db, err = fallbackOpenDB(dbPath, opts, err) - } - }() - return openDB(dbPath, opts) -} - -// fallbackOpenDB performs database recovery and opens the new database -// file when the database fails to open. Called after the first database -// open fails. -func fallbackOpenDB(dbPath string, opts *bolt.Options, openErr error) (db.DB, error) { - backupPath := dbPath + "." + identity.NewID() + ".bak" - bklog.L.Errorf("failed to open database file %s, resetting to empty. Old database is backed up to %s. "+ - "This error signifies that buildkitd likely crashed or was sigkilled abrubtly, leaving the database corrupted. "+ - "If you see logs from a previous panic then please report in the issue tracker at https://github.com/moby/buildkit . %+v", dbPath, backupPath, openErr) - if err := os.Rename(dbPath, backupPath); err != nil { - return nil, errors.Wrapf(err, "failed to rename database file %s to %s", dbPath, backupPath) - } - - // Attempt to open the database again. This should be a new database. - // If this fails, it is a permanent error. - return openDB(dbPath, opts) -} - -// openDB opens a bolt database in user-only read/write mode. -func openDB(dbPath string, opts *bolt.Options) (db.DB, error) { - return boltutil.Open(dbPath, 0600, opts) -} - -// fileHasContent checks if we have access to the file with appropriate -// permissions and the file has a non-zero size. -func fileHasContent(dbPath string) bool { - st, err := os.Stat(dbPath) - return err == nil && st.Size() > 0 -} diff --git a/util/db/boltutil/safe_open.go b/util/db/boltutil/safe_open.go new file mode 100644 index 000000000000..621542024b8a --- /dev/null +++ b/util/db/boltutil/safe_open.go @@ -0,0 +1,56 @@ +package boltutil + +import ( + "os" + + "github.com/moby/buildkit/identity" + "github.com/moby/buildkit/util/bklog" + "github.com/moby/buildkit/util/db" + "github.com/pkg/errors" + bolt "go.etcd.io/bbolt" +) + +// SafeOpen opens a bolt database with automatic recovery from corruption. +// If the database file is corrupted, it backs up the corrupted file and creates +// a new empty database. This is useful for disposable databases like cache or +// history where data loss is acceptable but startup failure is not. +func SafeOpen(dbPath string, mode os.FileMode, opts *bolt.Options) (db db.DB, err error) { + defer func() { + if r := recover(); r != nil { + err = errors.Errorf("%v", r) + } + + // If we get an error when opening the database, but we have + // access to the file and the file looks like it has content, + // then fallback to resetting the database since the database + // may be corrupt. + if err != nil && fileHasContent(dbPath) { + db, err = fallbackOpen(dbPath, mode, opts, err) + } + }() + return Open(dbPath, mode, opts) +} + +// fallbackOpen performs database recovery and opens a new database +// file when the database fails to open. Called after the first database +// open fails. +func fallbackOpen(dbPath string, mode os.FileMode, opts *bolt.Options, openErr error) (db.DB, error) { + backupPath := dbPath + "." + identity.NewID() + ".bak" + bklog.L.Errorf("failed to open database file %s, resetting to empty. Old database is backed up to %s. "+ + "This error signifies that buildkitd likely crashed or was sigkilled abruptly, leaving the database corrupted. "+ + "If you see logs from a previous panic then please report in the issue tracker at https://github.com/moby/buildkit . %+v", dbPath, backupPath, openErr) + if err := os.Rename(dbPath, backupPath); err != nil { + return nil, errors.Wrapf(err, "failed to rename database file %s to %s", dbPath, backupPath) + } + + // Attempt to open the database again. This should be a new database. + // If this fails, it is a permanent error. + return Open(dbPath, mode, opts) +} + +// fileHasContent checks if we have access to the file with appropriate +// permissions and the file has a non-zero size. +func fileHasContent(dbPath string) bool { + st, err := os.Stat(dbPath) + return err == nil && st.Size() > 0 +}