From fc3b548625793579cdb05bd38587744bd3f21b00 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 6 Apr 2024 19:21:21 +0200 Subject: [PATCH] prune: move logic into repository package --- cmd/restic/cmd_prune.go | 576 +--------------------- cmd/restic/cmd_prune_integration_test.go | 3 +- cmd/restic/delete.go | 38 -- internal/repository/prune.go | 581 +++++++++++++++++++++++ 4 files changed, 599 insertions(+), 599 deletions(-) delete mode 100644 cmd/restic/delete.go create mode 100644 internal/repository/prune.go diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index eef66cc8e..578414f52 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -4,14 +4,11 @@ import ( "context" "math" "runtime" - "sort" "strconv" "strings" "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" - "github.com/restic/restic/internal/index" - "github.com/restic/restic/internal/pack" "github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/ui" @@ -21,10 +18,6 @@ import ( "github.com/spf13/cobra" ) -var errorIndexIncomplete = errors.Fatal("index is not complete") -var errorPacksMissing = errors.Fatal("packs from index missing in repo") -var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index") - var cmdPrune = &cobra.Command{ Use: "prune [flags]", Short: "Remove unneeded data from the repository", @@ -194,14 +187,26 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, gopts GlobalOption return err } - plan, stats, err := PlanPrune(ctx, opts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error) { + popts := repository.PruneOptions{ + DryRun: opts.DryRun, + UnsafeRecovery: opts.unsafeRecovery, + + MaxUnusedBytes: opts.maxUnusedBytes, + MaxRepackBytes: opts.MaxRepackBytes, + + RepackCachableOnly: opts.RepackCachableOnly, + RepackSmall: opts.RepackSmall, + RepackUncompressed: opts.RepackUncompressed, + } + + plan, stats, err := repository.PlanPrune(ctx, popts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error) { return getUsedBlobs(ctx, repo, ignoreSnapshots, printer) }, printer) if err != nil { return err } - if opts.DryRun { + if popts.DryRun { printer.P("\nWould have made the following changes:") } @@ -213,459 +218,11 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, gopts GlobalOption // Trigger GC to reset garbage collection threshold runtime.GC() - return DoPrune(ctx, opts, repo, plan, printer) -} - -type PruneStats struct { - Blobs struct { - Used uint - Duplicate uint - Unused uint - Remove uint - Repack uint - Repackrm uint - } - Size struct { - Used uint64 - Duplicate uint64 - Unused uint64 - Remove uint64 - Repack uint64 - Repackrm uint64 - Unref uint64 - Uncompressed uint64 - } - Packs struct { - Used uint - Unused uint - PartlyUsed uint - Unref uint - Keep uint - Repack uint - Remove uint - } -} - -type PrunePlan struct { - removePacksFirst restic.IDSet // packs to remove first (unreferenced packs) - repackPacks restic.IDSet // packs to repack - keepBlobs restic.CountedBlobSet // blobs to keep during repacking - removePacks restic.IDSet // packs to remove - ignorePacks restic.IDSet // packs to ignore when rebuilding the index -} - -type packInfo struct { - usedBlobs uint - unusedBlobs uint - usedSize uint64 - unusedSize uint64 - tpe restic.BlobType - uncompressed bool -} - -type packInfoWithID struct { - ID restic.ID - packInfo - mustCompress bool -} - -// PlanPrune selects which files to rewrite and which to delete and which blobs to keep. -// Also some summary statistics are returned. -func PlanPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, getUsedBlobs func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error), printer progress.Printer) (PrunePlan, PruneStats, error) { - var stats PruneStats - - usedBlobs, err := getUsedBlobs(ctx, repo) - if err != nil { - return PrunePlan{}, stats, err - } - - printer.P("searching used packs...\n") - keepBlobs, indexPack, err := packInfoFromIndex(ctx, repo.Index(), usedBlobs, &stats, printer) - if err != nil { - return PrunePlan{}, stats, err - } - - printer.P("collecting packs for deletion and repacking\n") - plan, err := decidePackAction(ctx, opts, repo, indexPack, &stats, printer) - if err != nil { - return PrunePlan{}, stats, err - } - - if len(plan.repackPacks) != 0 { - blobCount := keepBlobs.Len() - // when repacking, we do not want to keep blobs which are - // already contained in kept packs, so delete them from keepBlobs - repo.Index().Each(ctx, func(blob restic.PackedBlob) { - if plan.removePacks.Has(blob.PackID) || plan.repackPacks.Has(blob.PackID) { - return - } - keepBlobs.Delete(blob.BlobHandle) - }) - - if keepBlobs.Len() < blobCount/2 { - // replace with copy to shrink map to necessary size if there's a chance to benefit - keepBlobs = keepBlobs.Copy() - } - } else { - // keepBlobs is only needed if packs are repacked - keepBlobs = nil - } - plan.keepBlobs = keepBlobs - - return plan, stats, nil -} - -func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.CountedBlobSet, stats *PruneStats, printer progress.Printer) (restic.CountedBlobSet, map[restic.ID]packInfo, error) { - // iterate over all blobs in index to find out which blobs are duplicates - // The counter in usedBlobs describes how many instances of the blob exist in the repository index - // Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist - idx.Each(ctx, func(blob restic.PackedBlob) { - bh := blob.BlobHandle - count, ok := usedBlobs[bh] - if ok { - if count < math.MaxUint8 { - // don't overflow, but saturate count at 255 - // this can lead to a non-optimal pack selection, but won't cause - // problems otherwise - count++ - } - - usedBlobs[bh] = count - } - }) - - // Check if all used blobs have been found in index - missingBlobs := restic.NewBlobSet() - for bh, count := range usedBlobs { - if count == 0 { - // blob does not exist in any pack files - missingBlobs.Insert(bh) - } - } - - if len(missingBlobs) != 0 { - printer.E("%v not found in the index\n\n"+ - "Integrity check failed: Data seems to be missing.\n"+ - "Will not start prune to prevent (additional) data loss!\n"+ - "Please report this error (along with the output of the 'prune' run) at\n"+ - "https://github.com/restic/restic/issues/new/choose\n", missingBlobs) - return nil, nil, errorIndexIncomplete - } - - indexPack := make(map[restic.ID]packInfo) - - // save computed pack header size - for pid, hdrSize := range pack.Size(ctx, idx, true) { - // initialize tpe with NumBlobTypes to indicate it's not set - indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)} - } - - hasDuplicates := false - // iterate over all blobs in index to generate packInfo - idx.Each(ctx, func(blob restic.PackedBlob) { - ip := indexPack[blob.PackID] - - // Set blob type if not yet set - if ip.tpe == restic.NumBlobTypes { - ip.tpe = blob.Type - } - - // mark mixed packs with "Invalid blob type" - if ip.tpe != blob.Type { - ip.tpe = restic.InvalidBlob - } - - bh := blob.BlobHandle - size := uint64(blob.Length) - dupCount := usedBlobs[bh] - switch { - case dupCount >= 2: - hasDuplicates = true - // mark as unused for now, we will later on select one copy - ip.unusedSize += size - ip.unusedBlobs++ - - // count as duplicate, will later on change one copy to be counted as used - stats.Size.Duplicate += size - stats.Blobs.Duplicate++ - case dupCount == 1: // used blob, not duplicate - ip.usedSize += size - ip.usedBlobs++ - - stats.Size.Used += size - stats.Blobs.Used++ - default: // unused blob - ip.unusedSize += size - ip.unusedBlobs++ - - stats.Size.Unused += size - stats.Blobs.Unused++ - } - if !blob.IsCompressed() { - ip.uncompressed = true - } - // update indexPack - indexPack[blob.PackID] = ip - }) - - // if duplicate blobs exist, those will be set to either "used" or "unused": - // - mark only one occurrence of duplicate blobs as used - // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" - // - if there are no used blobs in a pack, possibly mark duplicates as "unused" - if hasDuplicates { - // iterate again over all blobs in index (this is pretty cheap, all in-mem) - idx.Each(ctx, func(blob restic.PackedBlob) { - bh := blob.BlobHandle - count, ok := usedBlobs[bh] - // skip non-duplicate, aka. normal blobs - // count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining - if !ok || count == 1 { - return - } - - ip := indexPack[blob.PackID] - size := uint64(blob.Length) - switch { - case ip.usedBlobs > 0, count == 0: - // other used blobs in pack or "last" occurrence -> transition to used - ip.usedSize += size - ip.usedBlobs++ - ip.unusedSize -= size - ip.unusedBlobs-- - // same for the global statistics - stats.Size.Used += size - stats.Blobs.Used++ - stats.Size.Duplicate -= size - stats.Blobs.Duplicate-- - // let other occurrences remain marked as unused - usedBlobs[bh] = 1 - default: - // remain unused and decrease counter - count-- - if count == 1 { - // setting count to 1 would lead to forgetting that this blob had duplicates - // thus use the special value zero. This will select the last instance of the blob for keeping. - count = 0 - } - usedBlobs[bh] = count - } - // update indexPack - indexPack[blob.PackID] = ip - }) - } - - // Sanity check. If no duplicates exist, all blobs have value 1. After handling - // duplicates, this also applies to duplicates. - for _, count := range usedBlobs { - if count != 1 { - panic("internal error during blob selection") - } - } - - return usedBlobs, indexPack, nil -} - -func decidePackAction(ctx context.Context, opts PruneOptions, repo restic.Repository, indexPack map[restic.ID]packInfo, stats *PruneStats, printer progress.Printer) (PrunePlan, error) { - removePacksFirst := restic.NewIDSet() - removePacks := restic.NewIDSet() - repackPacks := restic.NewIDSet() - - var repackCandidates []packInfoWithID - var repackSmallCandidates []packInfoWithID - repoVersion := repo.Config().Version - // only repack very small files by default - targetPackSize := repo.PackSize() / 25 - if opts.RepackSmall { - // consider files with at least 80% of the target size as large enough - targetPackSize = repo.PackSize() / 5 * 4 - } - - // loop over all packs and decide what to do - bar := printer.NewCounter("packs processed") - bar.SetMax(uint64(len(indexPack))) - err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error { - p, ok := indexPack[id] - if !ok { - // Pack was not referenced in index and is not used => immediately remove! - printer.V("will remove pack %v as it is unused and not indexed\n", id.Str()) - removePacksFirst.Insert(id) - stats.Size.Unref += uint64(packSize) - return nil - } - - if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 { - // Pack size does not fit and pack is needed => error - // If the pack is not needed, this is no error, the pack can - // and will be simply removed, see below. - printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'.\n", - id.Str(), p.unusedSize+p.usedSize, packSize) - return errorSizeNotMatching - } - - // statistics - switch { - case p.usedBlobs == 0: - stats.Packs.Unused++ - case p.unusedBlobs == 0: - stats.Packs.Used++ - default: - stats.Packs.PartlyUsed++ - } - - if p.uncompressed { - stats.Size.Uncompressed += p.unusedSize + p.usedSize - } - mustCompress := false - if repoVersion >= 2 { - // repo v2: always repack tree blobs if uncompressed - // compress data blobs if requested - mustCompress = (p.tpe == restic.TreeBlob || opts.RepackUncompressed) && p.uncompressed - } - - // decide what to do - switch { - case p.usedBlobs == 0: - // All blobs in pack are no longer used => remove pack! - removePacks.Insert(id) - stats.Blobs.Remove += p.unusedBlobs - stats.Size.Remove += p.unusedSize - - case opts.RepackCachableOnly && p.tpe == restic.DataBlob: - // if this is a data pack and --repack-cacheable-only is set => keep pack! - stats.Packs.Keep++ - - case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress: - if packSize >= int64(targetPackSize) { - // All blobs in pack are used and not mixed => keep pack! - stats.Packs.Keep++ - } else { - repackSmallCandidates = append(repackSmallCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress}) - } - - default: - // all other packs are candidates for repacking - repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress}) - } - - delete(indexPack, id) - bar.Add(1) - return nil - }) - bar.Done() - if err != nil { - return PrunePlan{}, err - } - - // At this point indexPacks contains only missing packs! - - // missing packs that are not needed can be ignored - ignorePacks := restic.NewIDSet() - for id, p := range indexPack { - if p.usedBlobs == 0 { - ignorePacks.Insert(id) - stats.Blobs.Remove += p.unusedBlobs - stats.Size.Remove += p.unusedSize - delete(indexPack, id) - } - } - - if len(indexPack) != 0 { - printer.E("The index references %d needed pack files which are missing from the repository:\n", len(indexPack)) - for id := range indexPack { - printer.E(" %v\n", id) - } - return PrunePlan{}, errorPacksMissing - } - if len(ignorePacks) != 0 { - printer.E("Missing but unneeded pack files are referenced in the index, will be repaired\n") - for id := range ignorePacks { - printer.E("will forget missing pack file %v\n", id) - } - } - - if len(repackSmallCandidates) < 10 { - // too few small files to be worth the trouble, this also prevents endlessly repacking - // if there is just a single pack file below the target size - stats.Packs.Keep += uint(len(repackSmallCandidates)) - } else { - repackCandidates = append(repackCandidates, repackSmallCandidates...) - } - - // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. - // This is equivalent to sorting by unused / total space. - // Instead of unused[i] / used[i] > unused[j] / used[j] we use - // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 - // Moreover packs containing trees and too small packs are sorted to the beginning - sort.Slice(repackCandidates, func(i, j int) bool { - pi := repackCandidates[i].packInfo - pj := repackCandidates[j].packInfo - switch { - case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob: - return true - case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob: - return false - case pi.unusedSize+pi.usedSize < uint64(targetPackSize) && pj.unusedSize+pj.usedSize >= uint64(targetPackSize): - return true - case pj.unusedSize+pj.usedSize < uint64(targetPackSize) && pi.unusedSize+pi.usedSize >= uint64(targetPackSize): - return false - } - return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize - }) - - repack := func(id restic.ID, p packInfo) { - repackPacks.Insert(id) - stats.Blobs.Repack += p.unusedBlobs + p.usedBlobs - stats.Size.Repack += p.unusedSize + p.usedSize - stats.Blobs.Repackrm += p.unusedBlobs - stats.Size.Repackrm += p.unusedSize - if p.uncompressed { - stats.Size.Uncompressed -= p.unusedSize + p.usedSize - } - } - - // calculate limit for number of unused bytes in the repo after repacking - maxUnusedSizeAfter := opts.maxUnusedBytes(stats.Size.Used) - - for _, p := range repackCandidates { - reachedUnusedSizeAfter := (stats.Size.Unused-stats.Size.Remove-stats.Size.Repackrm < maxUnusedSizeAfter) - reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes - packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize) - - switch { - case reachedRepackSize: - stats.Packs.Keep++ - - case p.tpe != restic.DataBlob, p.mustCompress: - // repacking non-data packs / uncompressed-trees is only limited by repackSize - repack(p.ID, p.packInfo) - - case reachedUnusedSizeAfter && packIsLargeEnough: - // for all other packs stop repacking if tolerated unused size is reached. - stats.Packs.Keep++ - - default: - repack(p.ID, p.packInfo) - } - } - - stats.Packs.Unref = uint(len(removePacksFirst)) - stats.Packs.Repack = uint(len(repackPacks)) - stats.Packs.Remove = uint(len(removePacks)) - - if repo.Config().Version < 2 { - // compression not supported for repository format version 1 - stats.Size.Uncompressed = 0 - } - - return PrunePlan{removePacksFirst: removePacksFirst, - removePacks: removePacks, - repackPacks: repackPacks, - ignorePacks: ignorePacks, - }, nil + return repository.DoPrune(ctx, popts, repo, plan, printer) } // printPruneStats prints out the statistics -func printPruneStats(printer progress.Printer, stats PruneStats) error { +func printPruneStats(printer progress.Printer, stats repository.PruneStats) error { printer.V("\nused: %10d blobs / %s\n", stats.Blobs.Used, ui.FormatBytes(stats.Size.Used)) if stats.Blobs.Duplicate > 0 { printer.V("duplicates: %10d blobs / %s\n", stats.Blobs.Duplicate, ui.FormatBytes(stats.Size.Duplicate)) @@ -706,107 +263,6 @@ func printPruneStats(printer progress.Printer, stats PruneStats) error { return nil } -// DoPrune does the actual pruning: -// - remove unreferenced packs first -// - repack given pack files while keeping the given blobs -// - rebuild the index while ignoring all files that will be deleted -// - delete the files -// plan.removePacks and plan.ignorePacks are modified in this function. -func DoPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, plan PrunePlan, printer progress.Printer) (err error) { - if opts.DryRun { - printer.V("Repeated prune dry-runs can report slightly different amounts of data to keep or repack. This is expected behavior.\n\n") - if len(plan.removePacksFirst) > 0 { - printer.V("Would have removed the following unreferenced packs:\n%v\n\n", plan.removePacksFirst) - } - printer.V("Would have repacked and removed the following packs:\n%v\n\n", plan.repackPacks) - printer.V("Would have removed the following no longer used packs:\n%v\n\n", plan.removePacks) - // Always quit here if DryRun was set! - return nil - } - - // unreferenced packs can be safely deleted first - if len(plan.removePacksFirst) != 0 { - printer.P("deleting unreferenced packs\n") - DeleteFiles(ctx, repo, plan.removePacksFirst, restic.PackFile, printer) - } - - if len(plan.repackPacks) != 0 { - printer.P("repacking packs\n") - bar := printer.NewCounter("packs repacked") - bar.SetMax(uint64(len(plan.repackPacks))) - _, err := repository.Repack(ctx, repo, repo, plan.repackPacks, plan.keepBlobs, bar) - bar.Done() - if err != nil { - return errors.Fatal(err.Error()) - } - - // Also remove repacked packs - plan.removePacks.Merge(plan.repackPacks) - - if len(plan.keepBlobs) != 0 { - printer.E("%v was not repacked\n\n"+ - "Integrity check failed.\n"+ - "Please report this error (along with the output of the 'prune' run) at\n"+ - "https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs) - return errors.Fatal("internal error: blobs were not repacked") - } - - // allow GC of the blob set - plan.keepBlobs = nil - } - - if len(plan.ignorePacks) == 0 { - plan.ignorePacks = plan.removePacks - } else { - plan.ignorePacks.Merge(plan.removePacks) - } - - if opts.unsafeRecovery { - printer.P("deleting index files\n") - indexFiles := repo.Index().(*index.MasterIndex).IDs() - err = DeleteFilesChecked(ctx, repo, indexFiles, restic.IndexFile, printer) - if err != nil { - return errors.Fatalf("%s", err) - } - } else if len(plan.ignorePacks) != 0 { - err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, false, printer) - if err != nil { - return errors.Fatalf("%s", err) - } - } - - if len(plan.removePacks) != 0 { - printer.P("removing %d old packs\n", len(plan.removePacks)) - DeleteFiles(ctx, repo, plan.removePacks, restic.PackFile, printer) - } - - if opts.unsafeRecovery { - err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, true, printer) - if err != nil { - return errors.Fatalf("%s", err) - } - } - - printer.P("done\n") - return nil -} - -func rebuildIndexFiles(ctx context.Context, repo restic.Repository, removePacks restic.IDSet, extraObsolete restic.IDs, skipDeletion bool, printer progress.Printer) error { - printer.P("rebuilding index\n") - - bar := printer.NewCounter("packs processed") - return repo.Index().Save(ctx, repo, removePacks, extraObsolete, restic.MasterIndexSaveOpts{ - SaveProgress: bar, - DeleteProgress: func() *progress.Counter { - return printer.NewCounter("old indexes deleted") - }, - DeleteReport: func(id restic.ID, _ error) { - printer.VV("removed index %v\n", id.String()) - }, - SkipDeletion: skipDeletion, - }) -} - func getUsedBlobs(ctx context.Context, repo restic.Repository, ignoreSnapshots restic.IDSet, printer progress.Printer) (usedBlobs restic.CountedBlobSet, err error) { var snapshotTrees restic.IDs printer.P("loading all snapshots...\n") diff --git a/cmd/restic/cmd_prune_integration_test.go b/cmd/restic/cmd_prune_integration_test.go index 4c21940c4..f5d2e1f6b 100644 --- a/cmd/restic/cmd_prune_integration_test.go +++ b/cmd/restic/cmd_prune_integration_test.go @@ -7,6 +7,7 @@ import ( "testing" "github.com/restic/restic/internal/backend" + "github.com/restic/restic/internal/repository" rtest "github.com/restic/restic/internal/test" "github.com/restic/restic/internal/ui/termstatus" ) @@ -145,7 +146,7 @@ func TestPruneWithDamagedRepository(t *testing.T) { // prune should fail rtest.Assert(t, withTermStatus(env.gopts, func(ctx context.Context, term *termstatus.Terminal) error { return runPrune(context.TODO(), pruneDefaultOptions, env.gopts, term) - }) == errorPacksMissing, + }) == repository.ErrPacksMissing, "prune should have reported index not complete error") } diff --git a/cmd/restic/delete.go b/cmd/restic/delete.go deleted file mode 100644 index 34f71d91a..000000000 --- a/cmd/restic/delete.go +++ /dev/null @@ -1,38 +0,0 @@ -package main - -import ( - "context" - - "github.com/restic/restic/internal/restic" - "github.com/restic/restic/internal/ui/progress" -) - -// DeleteFiles deletes the given fileList of fileType in parallel -// it will print a warning if there is an error, but continue deleting the remaining files -func DeleteFiles(ctx context.Context, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) { - _ = deleteFiles(ctx, true, repo, fileList, fileType, printer) -} - -// DeleteFilesChecked deletes the given fileList of fileType in parallel -// if an error occurs, it will cancel and return this error -func DeleteFilesChecked(ctx context.Context, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) error { - return deleteFiles(ctx, false, repo, fileList, fileType, printer) -} - -// deleteFiles deletes the given fileList of fileType in parallel -// if ignoreError=true, it will print a warning if there was an error, else it will abort. -func deleteFiles(ctx context.Context, ignoreError bool, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) error { - bar := printer.NewCounter("files deleted") - defer bar.Done() - - return restic.ParallelRemove(ctx, repo, fileList, fileType, func(id restic.ID, err error) error { - if err != nil { - printer.E("unable to remove %v/%v from the repository\n", fileType, id) - if !ignoreError { - return err - } - } - printer.VV("removed %v/%v\n", fileType, id) - return nil - }, bar) -} diff --git a/internal/repository/prune.go b/internal/repository/prune.go new file mode 100644 index 000000000..653705bf4 --- /dev/null +++ b/internal/repository/prune.go @@ -0,0 +1,581 @@ +package repository + +import ( + "context" + "math" + "sort" + + "github.com/restic/restic/internal/errors" + "github.com/restic/restic/internal/index" + "github.com/restic/restic/internal/pack" + "github.com/restic/restic/internal/restic" + "github.com/restic/restic/internal/ui/progress" +) + +var ErrIndexIncomplete = errors.Fatal("index is not complete") +var ErrPacksMissing = errors.Fatal("packs from index missing in repo") +var ErrSizeNotMatching = errors.Fatal("pack size does not match calculated size from index") + +// PruneOptions collects all options for the cleanup command. +type PruneOptions struct { + DryRun bool + UnsafeRecovery bool + + MaxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused + MaxRepackBytes uint64 + + RepackCachableOnly bool + RepackSmall bool + RepackUncompressed bool +} + +type PruneStats struct { + Blobs struct { + Used uint + Duplicate uint + Unused uint + Remove uint + Repack uint + Repackrm uint + } + Size struct { + Used uint64 + Duplicate uint64 + Unused uint64 + Remove uint64 + Repack uint64 + Repackrm uint64 + Unref uint64 + Uncompressed uint64 + } + Packs struct { + Used uint + Unused uint + PartlyUsed uint + Unref uint + Keep uint + Repack uint + Remove uint + } +} + +type PrunePlan struct { + removePacksFirst restic.IDSet // packs to remove first (unreferenced packs) + repackPacks restic.IDSet // packs to repack + keepBlobs restic.CountedBlobSet // blobs to keep during repacking + removePacks restic.IDSet // packs to remove + ignorePacks restic.IDSet // packs to ignore when rebuilding the index +} + +type packInfo struct { + usedBlobs uint + unusedBlobs uint + usedSize uint64 + unusedSize uint64 + tpe restic.BlobType + uncompressed bool +} + +type packInfoWithID struct { + ID restic.ID + packInfo + mustCompress bool +} + +// PlanPrune selects which files to rewrite and which to delete and which blobs to keep. +// Also some summary statistics are returned. +func PlanPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, getUsedBlobs func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error), printer progress.Printer) (PrunePlan, PruneStats, error) { + var stats PruneStats + + usedBlobs, err := getUsedBlobs(ctx, repo) + if err != nil { + return PrunePlan{}, stats, err + } + + printer.P("searching used packs...\n") + keepBlobs, indexPack, err := packInfoFromIndex(ctx, repo.Index(), usedBlobs, &stats, printer) + if err != nil { + return PrunePlan{}, stats, err + } + + printer.P("collecting packs for deletion and repacking\n") + plan, err := decidePackAction(ctx, opts, repo, indexPack, &stats, printer) + if err != nil { + return PrunePlan{}, stats, err + } + + if len(plan.repackPacks) != 0 { + blobCount := keepBlobs.Len() + // when repacking, we do not want to keep blobs which are + // already contained in kept packs, so delete them from keepBlobs + repo.Index().Each(ctx, func(blob restic.PackedBlob) { + if plan.removePacks.Has(blob.PackID) || plan.repackPacks.Has(blob.PackID) { + return + } + keepBlobs.Delete(blob.BlobHandle) + }) + + if keepBlobs.Len() < blobCount/2 { + // replace with copy to shrink map to necessary size if there's a chance to benefit + keepBlobs = keepBlobs.Copy() + } + } else { + // keepBlobs is only needed if packs are repacked + keepBlobs = nil + } + plan.keepBlobs = keepBlobs + + return plan, stats, nil +} + +func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.CountedBlobSet, stats *PruneStats, printer progress.Printer) (restic.CountedBlobSet, map[restic.ID]packInfo, error) { + // iterate over all blobs in index to find out which blobs are duplicates + // The counter in usedBlobs describes how many instances of the blob exist in the repository index + // Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist + idx.Each(ctx, func(blob restic.PackedBlob) { + bh := blob.BlobHandle + count, ok := usedBlobs[bh] + if ok { + if count < math.MaxUint8 { + // don't overflow, but saturate count at 255 + // this can lead to a non-optimal pack selection, but won't cause + // problems otherwise + count++ + } + + usedBlobs[bh] = count + } + }) + + // Check if all used blobs have been found in index + missingBlobs := restic.NewBlobSet() + for bh, count := range usedBlobs { + if count == 0 { + // blob does not exist in any pack files + missingBlobs.Insert(bh) + } + } + + if len(missingBlobs) != 0 { + printer.E("%v not found in the index\n\n"+ + "Integrity check failed: Data seems to be missing.\n"+ + "Will not start prune to prevent (additional) data loss!\n"+ + "Please report this error (along with the output of the 'prune' run) at\n"+ + "https://github.com/restic/restic/issues/new/choose\n", missingBlobs) + return nil, nil, ErrIndexIncomplete + } + + indexPack := make(map[restic.ID]packInfo) + + // save computed pack header size + for pid, hdrSize := range pack.Size(ctx, idx, true) { + // initialize tpe with NumBlobTypes to indicate it's not set + indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)} + } + + hasDuplicates := false + // iterate over all blobs in index to generate packInfo + idx.Each(ctx, func(blob restic.PackedBlob) { + ip := indexPack[blob.PackID] + + // Set blob type if not yet set + if ip.tpe == restic.NumBlobTypes { + ip.tpe = blob.Type + } + + // mark mixed packs with "Invalid blob type" + if ip.tpe != blob.Type { + ip.tpe = restic.InvalidBlob + } + + bh := blob.BlobHandle + size := uint64(blob.Length) + dupCount := usedBlobs[bh] + switch { + case dupCount >= 2: + hasDuplicates = true + // mark as unused for now, we will later on select one copy + ip.unusedSize += size + ip.unusedBlobs++ + + // count as duplicate, will later on change one copy to be counted as used + stats.Size.Duplicate += size + stats.Blobs.Duplicate++ + case dupCount == 1: // used blob, not duplicate + ip.usedSize += size + ip.usedBlobs++ + + stats.Size.Used += size + stats.Blobs.Used++ + default: // unused blob + ip.unusedSize += size + ip.unusedBlobs++ + + stats.Size.Unused += size + stats.Blobs.Unused++ + } + if !blob.IsCompressed() { + ip.uncompressed = true + } + // update indexPack + indexPack[blob.PackID] = ip + }) + + // if duplicate blobs exist, those will be set to either "used" or "unused": + // - mark only one occurrence of duplicate blobs as used + // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" + // - if there are no used blobs in a pack, possibly mark duplicates as "unused" + if hasDuplicates { + // iterate again over all blobs in index (this is pretty cheap, all in-mem) + idx.Each(ctx, func(blob restic.PackedBlob) { + bh := blob.BlobHandle + count, ok := usedBlobs[bh] + // skip non-duplicate, aka. normal blobs + // count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining + if !ok || count == 1 { + return + } + + ip := indexPack[blob.PackID] + size := uint64(blob.Length) + switch { + case ip.usedBlobs > 0, count == 0: + // other used blobs in pack or "last" occurrence -> transition to used + ip.usedSize += size + ip.usedBlobs++ + ip.unusedSize -= size + ip.unusedBlobs-- + // same for the global statistics + stats.Size.Used += size + stats.Blobs.Used++ + stats.Size.Duplicate -= size + stats.Blobs.Duplicate-- + // let other occurrences remain marked as unused + usedBlobs[bh] = 1 + default: + // remain unused and decrease counter + count-- + if count == 1 { + // setting count to 1 would lead to forgetting that this blob had duplicates + // thus use the special value zero. This will select the last instance of the blob for keeping. + count = 0 + } + usedBlobs[bh] = count + } + // update indexPack + indexPack[blob.PackID] = ip + }) + } + + // Sanity check. If no duplicates exist, all blobs have value 1. After handling + // duplicates, this also applies to duplicates. + for _, count := range usedBlobs { + if count != 1 { + panic("internal error during blob selection") + } + } + + return usedBlobs, indexPack, nil +} + +func decidePackAction(ctx context.Context, opts PruneOptions, repo restic.Repository, indexPack map[restic.ID]packInfo, stats *PruneStats, printer progress.Printer) (PrunePlan, error) { + removePacksFirst := restic.NewIDSet() + removePacks := restic.NewIDSet() + repackPacks := restic.NewIDSet() + + var repackCandidates []packInfoWithID + var repackSmallCandidates []packInfoWithID + repoVersion := repo.Config().Version + // only repack very small files by default + targetPackSize := repo.PackSize() / 25 + if opts.RepackSmall { + // consider files with at least 80% of the target size as large enough + targetPackSize = repo.PackSize() / 5 * 4 + } + + // loop over all packs and decide what to do + bar := printer.NewCounter("packs processed") + bar.SetMax(uint64(len(indexPack))) + err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error { + p, ok := indexPack[id] + if !ok { + // Pack was not referenced in index and is not used => immediately remove! + printer.V("will remove pack %v as it is unused and not indexed\n", id.Str()) + removePacksFirst.Insert(id) + stats.Size.Unref += uint64(packSize) + return nil + } + + if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 { + // Pack size does not fit and pack is needed => error + // If the pack is not needed, this is no error, the pack can + // and will be simply removed, see below. + printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'.\n", + id.Str(), p.unusedSize+p.usedSize, packSize) + return ErrSizeNotMatching + } + + // statistics + switch { + case p.usedBlobs == 0: + stats.Packs.Unused++ + case p.unusedBlobs == 0: + stats.Packs.Used++ + default: + stats.Packs.PartlyUsed++ + } + + if p.uncompressed { + stats.Size.Uncompressed += p.unusedSize + p.usedSize + } + mustCompress := false + if repoVersion >= 2 { + // repo v2: always repack tree blobs if uncompressed + // compress data blobs if requested + mustCompress = (p.tpe == restic.TreeBlob || opts.RepackUncompressed) && p.uncompressed + } + + // decide what to do + switch { + case p.usedBlobs == 0: + // All blobs in pack are no longer used => remove pack! + removePacks.Insert(id) + stats.Blobs.Remove += p.unusedBlobs + stats.Size.Remove += p.unusedSize + + case opts.RepackCachableOnly && p.tpe == restic.DataBlob: + // if this is a data pack and --repack-cacheable-only is set => keep pack! + stats.Packs.Keep++ + + case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress: + if packSize >= int64(targetPackSize) { + // All blobs in pack are used and not mixed => keep pack! + stats.Packs.Keep++ + } else { + repackSmallCandidates = append(repackSmallCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress}) + } + + default: + // all other packs are candidates for repacking + repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress}) + } + + delete(indexPack, id) + bar.Add(1) + return nil + }) + bar.Done() + if err != nil { + return PrunePlan{}, err + } + + // At this point indexPacks contains only missing packs! + + // missing packs that are not needed can be ignored + ignorePacks := restic.NewIDSet() + for id, p := range indexPack { + if p.usedBlobs == 0 { + ignorePacks.Insert(id) + stats.Blobs.Remove += p.unusedBlobs + stats.Size.Remove += p.unusedSize + delete(indexPack, id) + } + } + + if len(indexPack) != 0 { + printer.E("The index references %d needed pack files which are missing from the repository:\n", len(indexPack)) + for id := range indexPack { + printer.E(" %v\n", id) + } + return PrunePlan{}, ErrPacksMissing + } + if len(ignorePacks) != 0 { + printer.E("Missing but unneeded pack files are referenced in the index, will be repaired\n") + for id := range ignorePacks { + printer.E("will forget missing pack file %v\n", id) + } + } + + if len(repackSmallCandidates) < 10 { + // too few small files to be worth the trouble, this also prevents endlessly repacking + // if there is just a single pack file below the target size + stats.Packs.Keep += uint(len(repackSmallCandidates)) + } else { + repackCandidates = append(repackCandidates, repackSmallCandidates...) + } + + // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. + // This is equivalent to sorting by unused / total space. + // Instead of unused[i] / used[i] > unused[j] / used[j] we use + // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 + // Moreover packs containing trees and too small packs are sorted to the beginning + sort.Slice(repackCandidates, func(i, j int) bool { + pi := repackCandidates[i].packInfo + pj := repackCandidates[j].packInfo + switch { + case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob: + return true + case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob: + return false + case pi.unusedSize+pi.usedSize < uint64(targetPackSize) && pj.unusedSize+pj.usedSize >= uint64(targetPackSize): + return true + case pj.unusedSize+pj.usedSize < uint64(targetPackSize) && pi.unusedSize+pi.usedSize >= uint64(targetPackSize): + return false + } + return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize + }) + + repack := func(id restic.ID, p packInfo) { + repackPacks.Insert(id) + stats.Blobs.Repack += p.unusedBlobs + p.usedBlobs + stats.Size.Repack += p.unusedSize + p.usedSize + stats.Blobs.Repackrm += p.unusedBlobs + stats.Size.Repackrm += p.unusedSize + if p.uncompressed { + stats.Size.Uncompressed -= p.unusedSize + p.usedSize + } + } + + // calculate limit for number of unused bytes in the repo after repacking + maxUnusedSizeAfter := opts.MaxUnusedBytes(stats.Size.Used) + + for _, p := range repackCandidates { + reachedUnusedSizeAfter := (stats.Size.Unused-stats.Size.Remove-stats.Size.Repackrm < maxUnusedSizeAfter) + reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes + packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize) + + switch { + case reachedRepackSize: + stats.Packs.Keep++ + + case p.tpe != restic.DataBlob, p.mustCompress: + // repacking non-data packs / uncompressed-trees is only limited by repackSize + repack(p.ID, p.packInfo) + + case reachedUnusedSizeAfter && packIsLargeEnough: + // for all other packs stop repacking if tolerated unused size is reached. + stats.Packs.Keep++ + + default: + repack(p.ID, p.packInfo) + } + } + + stats.Packs.Unref = uint(len(removePacksFirst)) + stats.Packs.Repack = uint(len(repackPacks)) + stats.Packs.Remove = uint(len(removePacks)) + + if repo.Config().Version < 2 { + // compression not supported for repository format version 1 + stats.Size.Uncompressed = 0 + } + + return PrunePlan{removePacksFirst: removePacksFirst, + removePacks: removePacks, + repackPacks: repackPacks, + ignorePacks: ignorePacks, + }, nil +} + +// DoPrune does the actual pruning: +// - remove unreferenced packs first +// - repack given pack files while keeping the given blobs +// - rebuild the index while ignoring all files that will be deleted +// - delete the files +// plan.removePacks and plan.ignorePacks are modified in this function. +func DoPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, plan PrunePlan, printer progress.Printer) (err error) { + if opts.DryRun { + printer.V("Repeated prune dry-runs can report slightly different amounts of data to keep or repack. This is expected behavior.\n\n") + if len(plan.removePacksFirst) > 0 { + printer.V("Would have removed the following unreferenced packs:\n%v\n\n", plan.removePacksFirst) + } + printer.V("Would have repacked and removed the following packs:\n%v\n\n", plan.repackPacks) + printer.V("Would have removed the following no longer used packs:\n%v\n\n", plan.removePacks) + // Always quit here if DryRun was set! + return nil + } + + // unreferenced packs can be safely deleted first + if len(plan.removePacksFirst) != 0 { + printer.P("deleting unreferenced packs\n") + _ = deleteFiles(ctx, true, repo, plan.removePacksFirst, restic.PackFile, printer) + } + + if len(plan.repackPacks) != 0 { + printer.P("repacking packs\n") + bar := printer.NewCounter("packs repacked") + bar.SetMax(uint64(len(plan.repackPacks))) + _, err := Repack(ctx, repo, repo, plan.repackPacks, plan.keepBlobs, bar) + bar.Done() + if err != nil { + return errors.Fatal(err.Error()) + } + + // Also remove repacked packs + plan.removePacks.Merge(plan.repackPacks) + + if len(plan.keepBlobs) != 0 { + printer.E("%v was not repacked\n\n"+ + "Integrity check failed.\n"+ + "Please report this error (along with the output of the 'prune' run) at\n"+ + "https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs) + return errors.Fatal("internal error: blobs were not repacked") + } + + // allow GC of the blob set + plan.keepBlobs = nil + } + + if len(plan.ignorePacks) == 0 { + plan.ignorePacks = plan.removePacks + } else { + plan.ignorePacks.Merge(plan.removePacks) + } + + if opts.UnsafeRecovery { + printer.P("deleting index files\n") + indexFiles := repo.Index().(*index.MasterIndex).IDs() + err = deleteFiles(ctx, false, repo, indexFiles, restic.IndexFile, printer) + if err != nil { + return errors.Fatalf("%s", err) + } + } else if len(plan.ignorePacks) != 0 { + err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, false, printer) + if err != nil { + return errors.Fatalf("%s", err) + } + } + + if len(plan.removePacks) != 0 { + printer.P("removing %d old packs\n", len(plan.removePacks)) + _ = deleteFiles(ctx, true, repo, plan.removePacks, restic.PackFile, printer) + } + + if opts.UnsafeRecovery { + err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, true, printer) + if err != nil { + return errors.Fatalf("%s", err) + } + } + + printer.P("done\n") + return nil +} + +// deleteFiles deletes the given fileList of fileType in parallel +// if ignoreError=true, it will print a warning if there was an error, else it will abort. +func deleteFiles(ctx context.Context, ignoreError bool, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) error { + bar := printer.NewCounter("files deleted") + defer bar.Done() + + return restic.ParallelRemove(ctx, repo, fileList, fileType, func(id restic.ID, err error) error { + if err != nil { + printer.E("unable to remove %v/%v from the repository\n", fileType, id) + if !ignoreError { + return err + } + } + printer.VV("removed %v/%v\n", fileType, id) + return nil + }, bar) +}