From d42c169458c5bab1f659db72bf1e29c53f4d548f Mon Sep 17 00:00:00 2001 From: greatroar <@> Date: Sun, 14 Jun 2020 20:48:52 +0200 Subject: [PATCH 1/3] Fix quadratic file reading in restic mount --- internal/fuse/file.go | 54 +++++++++++++++++--------------------- internal/fuse/fuse_test.go | 2 -- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/internal/fuse/file.go b/internal/fuse/file.go index 7d7f489c8..791265e13 100644 --- a/internal/fuse/file.go +++ b/internal/fuse/file.go @@ -3,6 +3,8 @@ package fuse import ( + "sort" + "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/restic" @@ -18,21 +20,26 @@ const blockSize = 512 // Statically ensure that *file implements the given interface var _ = fs.HandleReader(&file{}) -var _ = fs.HandleReleaser(&file{}) type file struct { root *Root node *restic.Node inode uint64 - sizes []int - blobs [][]byte + // cumsize[i] holds the cumulative size of blobs[:i]. + cumsize []uint64 + + // Cached blob and its index in the blobs of node. + cached struct { + blob []byte + index int + } } func newFile(ctx context.Context, root *Root, inode uint64, node *restic.Node) (fusefile *file, err error) { debug.Log("create new file for %v with %d blobs", node.Name, len(node.Content)) var bytes uint64 - sizes := make([]int, len(node.Content)) + cumsize := make([]uint64, 1+len(node.Content)) for i, id := range node.Content { size, ok := root.blobSizeCache.Lookup(id) if !ok { @@ -43,8 +50,8 @@ func newFile(ctx context.Context, root *Root, inode uint64, node *restic.Node) ( } } - sizes[i] = int(size) bytes += uint64(size) + cumsize[i+1] = bytes } if bytes != node.Size { @@ -56,8 +63,8 @@ func newFile(ctx context.Context, root *Root, inode uint64, node *restic.Node) ( inode: inode, root: root, node: node, - sizes: sizes, - blobs: make([][]byte, len(node.Content)), + + cumsize: cumsize, }, nil } @@ -84,13 +91,8 @@ func (f *file) Attr(ctx context.Context, a *fuse.Attr) error { func (f *file) getBlobAt(ctx context.Context, i int) (blob []byte, err error) { debug.Log("getBlobAt(%v, %v)", f.node.Name, i) - if f.blobs[i] != nil { - return f.blobs[i], nil - } - - // release earlier blobs - for j := 0; j < i; j++ { - f.blobs[j] = nil + if i == f.cached.index && f.cached.blob != nil { + return f.cached.blob, nil } blob, err = f.root.repo.LoadBlob(ctx, restic.DataBlob, f.node.Content[i], nil) @@ -98,16 +100,16 @@ func (f *file) getBlobAt(ctx context.Context, i int) (blob []byte, err error) { debug.Log("LoadBlob(%v, %v) failed: %v", f.node.Name, f.node.Content[i], err) return nil, err } - f.blobs[i] = blob + f.cached.blob, f.cached.index = blob, i return blob, nil } func (f *file) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error { debug.Log("Read(%v, %v, %v), file size %v", f.node.Name, req.Size, req.Offset, f.node.Size) - offset := req.Offset + offset := uint64(req.Offset) - if uint64(offset) > f.node.Size { + if offset > f.node.Size { debug.Log("Read(%v): offset is greater than file size: %v > %v", f.node.Name, req.Offset, f.node.Size) @@ -123,16 +125,15 @@ func (f *file) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadR } // Skip blobs before the offset - startContent := 0 - for offset > int64(f.sizes[startContent]) { - offset -= int64(f.sizes[startContent]) - startContent++ - } + startContent := -1 + sort.Search(len(f.cumsize), func(i int) bool { + return f.cumsize[i] > offset + }) + offset -= f.cumsize[startContent] dst := resp.Data[0:req.Size] readBytes := 0 remainingBytes := req.Size - for i := startContent; remainingBytes > 0 && i < len(f.sizes); i++ { + for i := startContent; remainingBytes > 0 && i < len(f.cumsize)-1; i++ { blob, err := f.getBlobAt(ctx, i) if err != nil { return err @@ -154,13 +155,6 @@ func (f *file) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadR return nil } -func (f *file) Release(ctx context.Context, req *fuse.ReleaseRequest) error { - for i := range f.blobs { - f.blobs[i] = nil - } - return nil -} - func (f *file) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error { debug.Log("Listxattr(%v, %v)", f.node.Name, req.Size) for _, attr := range f.node.ExtendedAttributes { diff --git a/internal/fuse/fuse_test.go b/internal/fuse/fuse_test.go index bb54224ee..147cee317 100644 --- a/internal/fuse/fuse_test.go +++ b/internal/fuse/fuse_test.go @@ -146,8 +146,6 @@ func TestFuseFile(t *testing.T) { t.Errorf("test %d failed, wrong data returned (offset %v, length %v)", i, offset, length) } } - - rtest.OK(t, f.Release(ctx, nil)) } // Test top-level directories for their UID and GID. From 58719e1f47ab6c28a6d3708fcac9bd7767cffc4f Mon Sep 17 00:00:00 2001 From: greatroar <@> Date: Wed, 17 Jun 2020 12:17:55 +0200 Subject: [PATCH 2/3] Replace mount's per-file cache by a global LRU cache --- cmd/restic/cmd_mount.go | 5 +-- go.mod | 2 +- internal/fuse/blobcache.go | 87 ++++++++++++++++++++++++++++++++++++++ internal/fuse/file.go | 15 +++---- internal/fuse/fuse_test.go | 52 ++++++++++++++++++++--- internal/fuse/root.go | 9 +++- 6 files changed, 147 insertions(+), 23 deletions(-) create mode 100644 internal/fuse/blobcache.go diff --git a/cmd/restic/cmd_mount.go b/cmd/restic/cmd_mount.go index 64c7cb987..b17c3d478 100644 --- a/cmd/restic/cmd_mount.go +++ b/cmd/restic/cmd_mount.go @@ -139,10 +139,7 @@ func mount(opts MountOptions, gopts GlobalOptions, mountpoint string) error { Paths: opts.Paths, SnapshotTemplate: opts.SnapshotTemplate, } - root, err := fuse.NewRoot(gopts.ctx, repo, cfg) - if err != nil { - return err - } + root := fuse.NewRoot(gopts.ctx, repo, cfg) Printf("Now serving the repository at %s\n", mountpoint) Printf("When finished, quit with Ctrl-c or umount the mountpoint.\n") diff --git a/go.mod b/go.mod index 8bd5901fa..20bf25234 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/golang/protobuf v1.3.1 // indirect github.com/google/go-cmp v0.2.0 github.com/gopherjs/gopherjs v0.0.0-20190411002643-bd77b112433e // indirect - github.com/hashicorp/golang-lru v0.5.1 // indirect + github.com/hashicorp/golang-lru v0.5.1 github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/juju/ratelimit v1.0.1 github.com/kr/fs v0.1.0 // indirect diff --git a/internal/fuse/blobcache.go b/internal/fuse/blobcache.go new file mode 100644 index 000000000..728457b11 --- /dev/null +++ b/internal/fuse/blobcache.go @@ -0,0 +1,87 @@ +package fuse + +import ( + "sync" + + "github.com/restic/restic/internal/debug" + "github.com/restic/restic/internal/restic" + + "github.com/hashicorp/golang-lru/simplelru" +) + +// Crude estimate of the overhead per blob: a SHA-256, a linked list node +// and some pointers. See comment in blobCache.add. +const cacheOverhead = len(restic.ID{}) + 64 + +// A blobCache is a fixed-size cache of blob contents. +// It is safe for concurrent access. +type blobCache struct { + mu sync.Mutex + c *simplelru.LRU + + free, size int // Current and max capacity, in bytes. +} + +// Construct a blob cache that stores at most size bytes worth of blobs. +func newBlobCache(size int) *blobCache { + c := &blobCache{ + free: size, + size: size, + } + + // NewLRU wants us to specify some max. number of entries, else it errors. + // The actual maximum will be smaller than size/cacheOverhead, because we + // evict entries (RemoveOldest in add) to maintain our size bound. + maxEntries := size / cacheOverhead + lru, err := simplelru.NewLRU(maxEntries, c.evict) + if err != nil { + panic(err) // Can only be maxEntries <= 0. + } + c.c = lru + + return c +} + +func (c *blobCache) add(id restic.ID, blob []byte) { + debug.Log("blobCache: add %v", id) + + size := len(blob) + cacheOverhead + if size > c.size { + return + } + + c.mu.Lock() + defer c.mu.Unlock() + + var key interface{} = id + + if c.c.Contains(key) { // Doesn't update the recency list. + return + } + + // This loop takes at most min(maxEntries, maxchunksize/cacheOverhead) + // iterations. + for size > c.free { + c.c.RemoveOldest() + } + + c.c.Add(key, blob) + c.free -= size +} + +func (c *blobCache) get(id restic.ID) ([]byte, bool) { + c.mu.Lock() + value, ok := c.c.Get(id) + c.mu.Unlock() + + debug.Log("blobCache: get %v, hit %v", id, ok) + + blob, ok := value.([]byte) + return blob, ok +} + +func (c *blobCache) evict(key, value interface{}) { + blob := value.([]byte) + debug.Log("blobCache: evict %v, %d bytes", key, len(blob)) + c.free += len(blob) + cacheOverhead +} diff --git a/internal/fuse/file.go b/internal/fuse/file.go index 791265e13..b38258fd1 100644 --- a/internal/fuse/file.go +++ b/internal/fuse/file.go @@ -28,12 +28,6 @@ type file struct { // cumsize[i] holds the cumulative size of blobs[:i]. cumsize []uint64 - - // Cached blob and its index in the blobs of node. - cached struct { - blob []byte - index int - } } func newFile(ctx context.Context, root *Root, inode uint64, node *restic.Node) (fusefile *file, err error) { @@ -91,8 +85,10 @@ func (f *file) Attr(ctx context.Context, a *fuse.Attr) error { func (f *file) getBlobAt(ctx context.Context, i int) (blob []byte, err error) { debug.Log("getBlobAt(%v, %v)", f.node.Name, i) - if i == f.cached.index && f.cached.blob != nil { - return f.cached.blob, nil + + blob, ok := f.root.blobCache.get(f.node.Content[i]) + if ok { + return blob, nil } blob, err = f.root.repo.LoadBlob(ctx, restic.DataBlob, f.node.Content[i], nil) @@ -100,7 +96,8 @@ func (f *file) getBlobAt(ctx context.Context, i int) (blob []byte, err error) { debug.Log("LoadBlob(%v, %v) failed: %v", f.node.Name, f.node.Content[i], err) return nil, err } - f.cached.blob, f.cached.index = blob, i + + f.root.blobCache.add(f.node.Content[i], blob) return blob, nil } diff --git a/internal/fuse/fuse_test.go b/internal/fuse/fuse_test.go index 147cee317..c855f403c 100644 --- a/internal/fuse/fuse_test.go +++ b/internal/fuse/fuse_test.go @@ -20,6 +20,48 @@ import ( rtest "github.com/restic/restic/internal/test" ) +func TestCache(t *testing.T) { + var id1, id2, id3 restic.ID + id1[0] = 1 + id2[0] = 2 + id3[0] = 3 + + const ( + kiB = 1 << 10 + cacheSize = 64*kiB + 3*cacheOverhead + ) + + c := newBlobCache(cacheSize) + + addAndCheck := func(id restic.ID, exp []byte) { + c.add(id, exp) + blob, ok := c.get(id) + rtest.Assert(t, ok, "blob %v added but not found in cache", id) + rtest.Equals(t, &exp[0], &blob[0]) + rtest.Equals(t, exp, blob) + } + + addAndCheck(id1, make([]byte, 32*kiB)) + addAndCheck(id2, make([]byte, 30*kiB)) + addAndCheck(id3, make([]byte, 10*kiB)) + + _, ok := c.get(id2) + rtest.Assert(t, ok, "blob %v not present", id2) + _, ok = c.get(id1) + rtest.Assert(t, !ok, "blob %v present, but should have been evicted", id1) + + c.add(id1, make([]byte, 1+c.size)) + _, ok = c.get(id1) + rtest.Assert(t, !ok, "blob %v too large but still added to cache") + + c.c.Remove(id1) + c.c.Remove(id3) + c.c.Remove(id2) + + rtest.Equals(t, cacheSize, c.size) + rtest.Equals(t, cacheSize, c.free) +} + func testRead(t testing.TB, f *file, offset, length int, data []byte) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -114,10 +156,7 @@ func TestFuseFile(t *testing.T) { Size: filesize, Content: content, } - root := &Root{ - blobSizeCache: NewBlobSizeCache(context.TODO(), repo.Index()), - repo: repo, - } + root := NewRoot(context.TODO(), repo, Config{}) t.Logf("blob cache has %d entries", len(root.blobSizeCache.m)) @@ -163,11 +202,10 @@ func testTopUidGid(t *testing.T, cfg Config, repo restic.Repository, uid, gid ui t.Helper() ctx := context.Background() - root, err := NewRoot(ctx, repo, cfg) - rtest.OK(t, err) + root := NewRoot(ctx, repo, cfg) var attr fuse.Attr - err = root.Attr(ctx, &attr) + err := root.Attr(ctx, &attr) rtest.OK(t, err) rtest.Equals(t, uid, attr.Uid) rtest.Equals(t, gid, attr.Gid) diff --git a/internal/fuse/root.go b/internal/fuse/root.go index a7061b224..c43f8b448 100644 --- a/internal/fuse/root.go +++ b/internal/fuse/root.go @@ -29,6 +29,7 @@ type Root struct { cfg Config inode uint64 snapshots restic.Snapshots + blobCache *blobCache blobSizeCache *BlobSizeCache snCount int @@ -45,14 +46,18 @@ var _ = fs.NodeStringLookuper(&Root{}) const rootInode = 1 +// Size of the blob cache. TODO: make this configurable. +const blobCacheSize = 64 << 20 + // NewRoot initializes a new root node from a repository. -func NewRoot(ctx context.Context, repo restic.Repository, cfg Config) (*Root, error) { +func NewRoot(ctx context.Context, repo restic.Repository, cfg Config) *Root { debug.Log("NewRoot(), config %v", cfg) root := &Root{ repo: repo, inode: rootInode, cfg: cfg, + blobCache: newBlobCache(blobCacheSize), blobSizeCache: NewBlobSizeCache(ctx, repo.Index()), } @@ -70,7 +75,7 @@ func NewRoot(ctx context.Context, repo restic.Repository, cfg Config) (*Root, er root.MetaDir = NewMetaDir(root, rootInode, entries) - return root, nil + return root } // Root is just there to satisfy fs.Root, it returns itself. From 4cf1c8e8da193bbaba889426eb8c510505daa6e2 Mon Sep 17 00:00:00 2001 From: greatroar <@> Date: Sun, 12 Jul 2020 10:17:04 +0200 Subject: [PATCH 3/3] Changelog entry for subquadratic reading in mount --- changelog/unreleased/pull-2790 | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changelog/unreleased/pull-2790 diff --git a/changelog/unreleased/pull-2790 b/changelog/unreleased/pull-2790 new file mode 100644 index 000000000..b1cbd2c19 --- /dev/null +++ b/changelog/unreleased/pull-2790 @@ -0,0 +1,6 @@ +Enhancement: Optimized file access in restic mount + +Reading large (> 100GiB) files from restic mountpoints is now faster, +and the speedup is greater for larger files. + +https://github.com/restic/restic/pull/2790