diff --git a/changelog/unreleased/pull-3508 b/changelog/unreleased/pull-3508 new file mode 100644 index 000000000..c6e0f575e --- /dev/null +++ b/changelog/unreleased/pull-3508 @@ -0,0 +1,10 @@ +Enhancement: Cache blobs read by the dump command + +When dumping a file using the `restic dump` command, restic did not cache blobs +in any way, so even consecutive runs of the same blob did get loaded from the +repository again and again, slowing down the dump. + +Now, the caching mechanism already used by the `fuse` command is also used by +the `dump` command. This makes dumping much faster, especially for sparse files. + +https://github.com/restic/restic/pull/3508 diff --git a/cmd/restic/cmd_dump.go b/cmd/restic/cmd_dump.go index 667dc3875..4c8ed5b1d 100644 --- a/cmd/restic/cmd_dump.go +++ b/cmd/restic/cmd_dump.go @@ -95,7 +95,8 @@ func printFromTree(ctx context.Context, tree *restic.Tree, repo restic.Repositor if node.Name == pathComponents[0] { switch { case l == 1 && dump.IsFile(node): - return dump.GetNodeData(ctx, os.Stdout, repo, node) + cache := dump.NewCache() + return dump.WriteNodeData(ctx, os.Stdout, repo, node, cache) case l > 1 && dump.IsDir(node): subtree, err := repo.LoadTree(ctx, *node.Subtree) if err != nil { diff --git a/internal/fuse/blobcache.go b/internal/bloblru/cache.go similarity index 52% rename from internal/fuse/blobcache.go rename to internal/bloblru/cache.go index 728457b11..dc977e650 100644 --- a/internal/fuse/blobcache.go +++ b/internal/bloblru/cache.go @@ -1,4 +1,4 @@ -package fuse +package bloblru import ( "sync" @@ -10,12 +10,12 @@ import ( ) // Crude estimate of the overhead per blob: a SHA-256, a linked list node -// and some pointers. See comment in blobCache.add. -const cacheOverhead = len(restic.ID{}) + 64 +// and some pointers. See comment in Cache.add. +const overhead = len(restic.ID{}) + 64 -// A blobCache is a fixed-size cache of blob contents. +// A Cache is a fixed-size LRU cache of blob contents. // It is safe for concurrent access. -type blobCache struct { +type Cache struct { mu sync.Mutex c *simplelru.LRU @@ -23,16 +23,16 @@ type blobCache struct { } // Construct a blob cache that stores at most size bytes worth of blobs. -func newBlobCache(size int) *blobCache { - c := &blobCache{ +func New(size int) *Cache { + c := &Cache{ free: size, size: size, } // NewLRU wants us to specify some max. number of entries, else it errors. - // The actual maximum will be smaller than size/cacheOverhead, because we + // The actual maximum will be smaller than size/overhead, because we // evict entries (RemoveOldest in add) to maintain our size bound. - maxEntries := size / cacheOverhead + maxEntries := size / overhead lru, err := simplelru.NewLRU(maxEntries, c.evict) if err != nil { panic(err) // Can only be maxEntries <= 0. @@ -42,10 +42,12 @@ func newBlobCache(size int) *blobCache { return c } -func (c *blobCache) add(id restic.ID, blob []byte) { - debug.Log("blobCache: add %v", id) +// Add adds key id with value blob to c. +// It may return an evicted buffer for reuse. +func (c *Cache) Add(id restic.ID, blob []byte) (old []byte) { + debug.Log("bloblru.Cache: add %v", id) - size := len(blob) + cacheOverhead + size := len(blob) + overhead if size > c.size { return } @@ -59,29 +61,36 @@ func (c *blobCache) add(id restic.ID, blob []byte) { return } - // This loop takes at most min(maxEntries, maxchunksize/cacheOverhead) + // This loop takes at most min(maxEntries, maxchunksize/overhead) // iterations. for size > c.free { - c.c.RemoveOldest() + _, val, _ := c.c.RemoveOldest() + b := val.([]byte) + if len(b) > len(old) { + // We can only return one buffer, so pick the largest. + old = b + } } c.c.Add(key, blob) c.free -= size + + return old } -func (c *blobCache) get(id restic.ID) ([]byte, bool) { +func (c *Cache) Get(id restic.ID) ([]byte, bool) { c.mu.Lock() value, ok := c.c.Get(id) c.mu.Unlock() - debug.Log("blobCache: get %v, hit %v", id, ok) + debug.Log("bloblru.Cache: get %v, hit %v", id, ok) blob, ok := value.([]byte) return blob, ok } -func (c *blobCache) evict(key, value interface{}) { +func (c *Cache) evict(key, value interface{}) { blob := value.([]byte) - debug.Log("blobCache: evict %v, %d bytes", key, len(blob)) - c.free += len(blob) + cacheOverhead + debug.Log("bloblru.Cache: evict %v, %d bytes", key, len(blob)) + c.free += len(blob) + overhead } diff --git a/internal/bloblru/cache_test.go b/internal/bloblru/cache_test.go new file mode 100644 index 000000000..c257a95e2 --- /dev/null +++ b/internal/bloblru/cache_test.go @@ -0,0 +1,50 @@ +package bloblru + +import ( + "testing" + + "github.com/restic/restic/internal/restic" + rtest "github.com/restic/restic/internal/test" +) + +func TestCache(t *testing.T) { + var id1, id2, id3 restic.ID + id1[0] = 1 + id2[0] = 2 + id3[0] = 3 + + const ( + kiB = 1 << 10 + cacheSize = 64*kiB + 3*overhead + ) + + c := New(cacheSize) + + addAndCheck := func(id restic.ID, exp []byte) { + c.Add(id, exp) + blob, ok := c.Get(id) + rtest.Assert(t, ok, "blob %v added but not found in cache", id) + rtest.Equals(t, &exp[0], &blob[0]) + rtest.Equals(t, exp, blob) + } + + addAndCheck(id1, make([]byte, 32*kiB)) + addAndCheck(id2, make([]byte, 30*kiB)) + addAndCheck(id3, make([]byte, 10*kiB)) + + _, ok := c.Get(id2) + rtest.Assert(t, ok, "blob %v not present", id2) + _, ok = c.Get(id1) + rtest.Assert(t, !ok, "blob %v present, but should have been evicted", id1) + + c.Add(id1, make([]byte, 1+c.size)) + _, ok = c.Get(id1) + rtest.Assert(t, !ok, "blob %v too large but still added to cache") + + c.c.Remove(id1) + c.c.Remove(id3) + c.c.Remove(id2) + + rtest.Equals(t, cacheSize, c.size) + rtest.Equals(t, cacheSize, c.free) +} diff --git a/internal/dump/common.go b/internal/dump/common.go index eb02b1c2e..7ef0c93e4 100644 --- a/internal/dump/common.go +++ b/internal/dump/common.go @@ -5,6 +5,7 @@ import ( "io" "path" + "github.com/restic/restic/internal/bloblru" "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/walker" @@ -20,7 +21,11 @@ type dumper interface { // It will loop over all nodes in the tree and dump them recursively. type WriteDump func(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dst io.Writer) error -func writeDump(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dmp dumper, dst io.Writer) error { +func NewCache() *bloblru.Cache { + return bloblru.New(64 << 20) +} + +func writeDump(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dmp dumper) error { for _, rootNode := range tree.Nodes { rootNode.Path = rootPath err := dumpTree(ctx, repo, rootNode, rootPath, dmp) @@ -71,20 +76,24 @@ func dumpTree(ctx context.Context, repo restic.Repository, rootNode *restic.Node return err } -// GetNodeData will write the contents of the node to the given output. -func GetNodeData(ctx context.Context, output io.Writer, repo restic.Repository, node *restic.Node) error { +// WriteNodeData writes the contents of the node to the given Writer. +func WriteNodeData(ctx context.Context, w io.Writer, repo restic.Repository, node *restic.Node, cache *bloblru.Cache) error { var ( buf []byte err error ) for _, id := range node.Content { - buf, err = repo.LoadBlob(ctx, restic.DataBlob, id, buf) - if err != nil { - return err + blob, ok := cache.Get(id) + if !ok { + blob, err = repo.LoadBlob(ctx, restic.DataBlob, id, buf) + if err != nil { + return err + } + + buf = cache.Add(id, blob) // Reuse evicted buffer. } - _, err = output.Write(buf) - if err != nil { + if _, err := w.Write(blob); err != nil { return errors.Wrap(err, "Write") } } diff --git a/internal/dump/tar.go b/internal/dump/tar.go index f9716d152..57225cf66 100644 --- a/internal/dump/tar.go +++ b/internal/dump/tar.go @@ -8,25 +8,29 @@ import ( "path/filepath" "strings" + "github.com/restic/restic/internal/bloblru" "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/restic" ) type tarDumper struct { - w *tar.Writer + cache *bloblru.Cache + w *tar.Writer } // Statically ensure that tarDumper implements dumper. -var _ dumper = tarDumper{} +var _ dumper = &tarDumper{} // WriteTar will write the contents of the given tree, encoded as a tar to the given destination. func WriteTar(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dst io.Writer) error { - dmp := tarDumper{w: tar.NewWriter(dst)} - - return writeDump(ctx, repo, tree, rootPath, dmp, dst) + dmp := &tarDumper{ + cache: NewCache(), + w: tar.NewWriter(dst), + } + return writeDump(ctx, repo, tree, rootPath, dmp) } -func (dmp tarDumper) Close() error { +func (dmp *tarDumper) Close() error { return dmp.w.Close() } @@ -39,7 +43,7 @@ const ( cISVTX = 0o1000 // Save text (sticky bit) ) -func (dmp tarDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error { +func (dmp *tarDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error { relPath, err := filepath.Rel("/", node.Path) if err != nil { return err @@ -90,7 +94,7 @@ func (dmp tarDumper) dumpNode(ctx context.Context, node *restic.Node, repo resti return errors.Wrap(err, "TarHeader") } - return GetNodeData(ctx, dmp.w, repo, node) + return WriteNodeData(ctx, dmp.w, repo, node, dmp.cache) } func parseXattrs(xattrs []restic.ExtendedAttribute) map[string]string { diff --git a/internal/dump/zip.go b/internal/dump/zip.go index f1e779d8e..96e2c95b9 100644 --- a/internal/dump/zip.go +++ b/internal/dump/zip.go @@ -6,29 +6,33 @@ import ( "io" "path/filepath" + "github.com/restic/restic/internal/bloblru" "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/restic" ) type zipDumper struct { - w *zip.Writer + cache *bloblru.Cache + w *zip.Writer } // Statically ensure that zipDumper implements dumper. -var _ dumper = zipDumper{} +var _ dumper = &zipDumper{} // WriteZip will write the contents of the given tree, encoded as a zip to the given destination. func WriteZip(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dst io.Writer) error { - dmp := zipDumper{w: zip.NewWriter(dst)} - - return writeDump(ctx, repo, tree, rootPath, dmp, dst) + dmp := &zipDumper{ + cache: NewCache(), + w: zip.NewWriter(dst), + } + return writeDump(ctx, repo, tree, rootPath, dmp) } -func (dmp zipDumper) Close() error { +func (dmp *zipDumper) Close() error { return dmp.w.Close() } -func (dmp zipDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error { +func (dmp *zipDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error { relPath, err := filepath.Rel("/", node.Path) if err != nil { return err @@ -58,5 +62,5 @@ func (dmp zipDumper) dumpNode(ctx context.Context, node *restic.Node, repo resti return nil } - return GetNodeData(ctx, w, repo, node) + return WriteNodeData(ctx, w, repo, node, dmp.cache) } diff --git a/internal/fuse/file.go b/internal/fuse/file.go index def7ae85e..2de2660e3 100644 --- a/internal/fuse/file.go +++ b/internal/fuse/file.go @@ -96,7 +96,7 @@ func (f *file) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenR func (f *openFile) getBlobAt(ctx context.Context, i int) (blob []byte, err error) { - blob, ok := f.root.blobCache.get(f.node.Content[i]) + blob, ok := f.root.blobCache.Get(f.node.Content[i]) if ok { return blob, nil } @@ -107,7 +107,7 @@ func (f *openFile) getBlobAt(ctx context.Context, i int) (blob []byte, err error return nil, err } - f.root.blobCache.add(f.node.Content[i], blob) + f.root.blobCache.Add(f.node.Content[i], blob) return blob, nil } diff --git a/internal/fuse/fuse_test.go b/internal/fuse/fuse_test.go index 24d8cf03d..690df770a 100644 --- a/internal/fuse/fuse_test.go +++ b/internal/fuse/fuse_test.go @@ -1,3 +1,4 @@ +//go:build darwin || freebsd || linux // +build darwin freebsd linux package fuse @@ -10,6 +11,7 @@ import ( "testing" "time" + "github.com/restic/restic/internal/bloblru" "github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/restic" @@ -19,48 +21,6 @@ import ( rtest "github.com/restic/restic/internal/test" ) -func TestCache(t *testing.T) { - var id1, id2, id3 restic.ID - id1[0] = 1 - id2[0] = 2 - id3[0] = 3 - - const ( - kiB = 1 << 10 - cacheSize = 64*kiB + 3*cacheOverhead - ) - - c := newBlobCache(cacheSize) - - addAndCheck := func(id restic.ID, exp []byte) { - c.add(id, exp) - blob, ok := c.get(id) - rtest.Assert(t, ok, "blob %v added but not found in cache", id) - rtest.Equals(t, &exp[0], &blob[0]) - rtest.Equals(t, exp, blob) - } - - addAndCheck(id1, make([]byte, 32*kiB)) - addAndCheck(id2, make([]byte, 30*kiB)) - addAndCheck(id3, make([]byte, 10*kiB)) - - _, ok := c.get(id2) - rtest.Assert(t, ok, "blob %v not present", id2) - _, ok = c.get(id1) - rtest.Assert(t, !ok, "blob %v present, but should have been evicted", id1) - - c.add(id1, make([]byte, 1+c.size)) - _, ok = c.get(id1) - rtest.Assert(t, !ok, "blob %v too large but still added to cache") - - c.c.Remove(id1) - c.c.Remove(id3) - c.c.Remove(id2) - - rtest.Equals(t, cacheSize, c.size) - rtest.Equals(t, cacheSize, c.free) -} - func testRead(t testing.TB, f fs.Handle, offset, length int, data []byte) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -156,7 +116,7 @@ func TestFuseFile(t *testing.T) { Size: filesize, Content: content, } - root := &Root{repo: repo, blobCache: newBlobCache(blobCacheSize)} + root := &Root{repo: repo, blobCache: bloblru.New(blobCacheSize)} inode := fs.GenerateDynamicInode(1, "foo") f, err := newFile(context.TODO(), root, inode, node) @@ -191,7 +151,7 @@ func TestFuseDir(t *testing.T) { repo, cleanup := repository.TestRepository(t) defer cleanup() - root := &Root{repo: repo, blobCache: newBlobCache(blobCacheSize)} + root := &Root{repo: repo, blobCache: bloblru.New(blobCacheSize)} node := &restic.Node{ Mode: 0755, diff --git a/internal/fuse/root.go b/internal/fuse/root.go index ecbd10546..bed760f02 100644 --- a/internal/fuse/root.go +++ b/internal/fuse/root.go @@ -1,3 +1,4 @@ +//go:build darwin || freebsd || linux // +build darwin freebsd linux package fuse @@ -6,6 +7,7 @@ import ( "os" "time" + "github.com/restic/restic/internal/bloblru" "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/restic" @@ -27,7 +29,7 @@ type Root struct { cfg Config inode uint64 snapshots restic.Snapshots - blobCache *blobCache + blobCache *bloblru.Cache snCount int lastCheck time.Time @@ -54,7 +56,7 @@ func NewRoot(repo restic.Repository, cfg Config) *Root { repo: repo, inode: rootInode, cfg: cfg, - blobCache: newBlobCache(blobCacheSize), + blobCache: bloblru.New(blobCacheSize), } if !cfg.OwnerIsRoot {