From 2ba21fe72bf93f4caee3de1b2ea58bf0a50c7a64 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 9 Mar 2024 17:38:41 +0100 Subject: [PATCH 1/6] archiver: only store deviceID for hardlinks The deviceID can change e.g. when backing up from filesystem snapshot. It is only used for hardlink detection. Thus there it is not necessary to store it for everything else. --- internal/archiver/archiver.go | 6 ++++++ internal/archiver/archiver_test.go | 1 + internal/restic/node.go | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 77ddba7c4..63a1691b3 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -188,6 +188,12 @@ func (arch *Archiver) nodeFromFileInfo(snPath, filename string, fi os.FileInfo) if !arch.WithAtime { node.AccessTime = node.ModTime } + if node.Links == 1 || node.Type == "dir" { + // the DeviceID is only necessary for hardlinked files + // when using subvolumes or snapshots their deviceIDs tend to change which causes + // restic to upload new tree blobs + node.DeviceID = 0 + } // overwrite name to match that within the snapshot node.Name = path.Base(snPath) if err != nil { diff --git a/internal/archiver/archiver_test.go b/internal/archiver/archiver_test.go index 46ef44251..3d50e555f 100644 --- a/internal/archiver/archiver_test.go +++ b/internal/archiver/archiver_test.go @@ -2153,6 +2153,7 @@ func TestMetadataChanged(t *testing.T) { sn, node2 := snapshot(t, repo, fs, nil, "testfile") // set some values so we can then compare the nodes + want.DeviceID = 0 want.Content = node2.Content want.Path = "" if len(want.ExtendedAttributes) == 0 { diff --git a/internal/restic/node.go b/internal/restic/node.go index cbe9ef363..e7688aada 100644 --- a/internal/restic/node.go +++ b/internal/restic/node.go @@ -82,7 +82,7 @@ type Node struct { User string `json:"user,omitempty"` Group string `json:"group,omitempty"` Inode uint64 `json:"inode,omitempty"` - DeviceID uint64 `json:"device_id,omitempty"` // device id of the file, stat.st_dev + DeviceID uint64 `json:"device_id,omitempty"` // device id of the file, stat.st_dev, only stored for hardlinks Size uint64 `json:"size,omitempty"` Links uint64 `json:"links,omitempty"` LinkTarget string `json:"linktarget,omitempty"` From a26d6ffa720af4dde2504c560ce838470bdf21ab Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 9 Mar 2024 17:44:48 +0100 Subject: [PATCH 2/6] archiver: move deviceID handling behind feature flag --- internal/archiver/archiver.go | 13 ++++++++----- internal/archiver/archiver_test.go | 3 +++ internal/feature/registry.go | 2 ++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 63a1691b3..19d16c4d3 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -12,6 +12,7 @@ import ( "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" + "github.com/restic/restic/internal/feature" "github.com/restic/restic/internal/fs" "github.com/restic/restic/internal/restic" "golang.org/x/sync/errgroup" @@ -188,11 +189,13 @@ func (arch *Archiver) nodeFromFileInfo(snPath, filename string, fi os.FileInfo) if !arch.WithAtime { node.AccessTime = node.ModTime } - if node.Links == 1 || node.Type == "dir" { - // the DeviceID is only necessary for hardlinked files - // when using subvolumes or snapshots their deviceIDs tend to change which causes - // restic to upload new tree blobs - node.DeviceID = 0 + if feature.Flag.Enabled(feature.DeviceIDForHardlinks) { + if node.Links == 1 || node.Type == "dir" { + // the DeviceID is only necessary for hardlinked files + // when using subvolumes or snapshots their deviceIDs tend to change which causes + // restic to upload new tree blobs + node.DeviceID = 0 + } } // overwrite name to match that within the snapshot node.Name = path.Base(snPath) diff --git a/internal/archiver/archiver_test.go b/internal/archiver/archiver_test.go index 3d50e555f..411994911 100644 --- a/internal/archiver/archiver_test.go +++ b/internal/archiver/archiver_test.go @@ -19,6 +19,7 @@ import ( "github.com/restic/restic/internal/backend/mem" "github.com/restic/restic/internal/checker" "github.com/restic/restic/internal/errors" + "github.com/restic/restic/internal/feature" "github.com/restic/restic/internal/fs" "github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/restic" @@ -2125,6 +2126,8 @@ const ( ) func TestMetadataChanged(t *testing.T) { + defer feature.TestSetFlag(t, feature.Flag, feature.DeviceIDForHardlinks, true)() + files := TestDir{ "testfile": TestFile{ Content: "foo bar test file", diff --git a/internal/feature/registry.go b/internal/feature/registry.go index 620c9ec35..6a9874786 100644 --- a/internal/feature/registry.go +++ b/internal/feature/registry.go @@ -7,11 +7,13 @@ var Flag = New() const ( ExampleFeature FlagName = "example-feature" DeprecateLegacyIndex FlagName = "deprecate-legacy-index" + DeviceIDForHardlinks FlagName = "device-id-for-hardlinks" ) func init() { Flag.SetFlags(map[FlagName]FlagDesc{ ExampleFeature: {Type: Alpha, Description: "just for testing"}, DeprecateLegacyIndex: {Type: Beta, Description: "disable support for index format used by restic 0.1.0. Use `restic repair index` to update the index if necessary."}, + DeviceIDForHardlinks: {Type: Alpha, Description: "store deviceID only for hardlinks to reduce metadata changes for example when using btrfs subvolumes. Will be removed in a future restic version after repository format 3 is available"}, }) } From a9b3d86c4f4d90a7b48ee02da6bf328b95ef4332 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Thu, 28 Mar 2024 18:44:45 +0100 Subject: [PATCH 3/6] features: remove example feature --- internal/feature/registry.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/feature/registry.go b/internal/feature/registry.go index 6a9874786..4693b8909 100644 --- a/internal/feature/registry.go +++ b/internal/feature/registry.go @@ -5,14 +5,12 @@ var Flag = New() // flag names are written in kebab-case const ( - ExampleFeature FlagName = "example-feature" DeprecateLegacyIndex FlagName = "deprecate-legacy-index" DeviceIDForHardlinks FlagName = "device-id-for-hardlinks" ) func init() { Flag.SetFlags(map[FlagName]FlagDesc{ - ExampleFeature: {Type: Alpha, Description: "just for testing"}, DeprecateLegacyIndex: {Type: Beta, Description: "disable support for index format used by restic 0.1.0. Use `restic repair index` to update the index if necessary."}, DeviceIDForHardlinks: {Type: Alpha, Description: "store deviceID only for hardlinks to reduce metadata changes for example when using btrfs subvolumes. Will be removed in a future restic version after repository format 3 is available"}, }) From d705741571530d4a9b1b3ee61dddeaf211aea393 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Thu, 28 Mar 2024 19:11:56 +0100 Subject: [PATCH 4/6] backup: test that deviceID is only stored for hardlinks --- internal/archiver/archiver_unix_test.go | 48 +++++++++++++++++++++++++ internal/archiver/testing.go | 26 +++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/internal/archiver/archiver_unix_test.go b/internal/archiver/archiver_unix_test.go index 7523f0749..2552b23e1 100644 --- a/internal/archiver/archiver_unix_test.go +++ b/internal/archiver/archiver_unix_test.go @@ -6,6 +6,12 @@ package archiver import ( "os" "syscall" + "testing" + + "github.com/restic/restic/internal/feature" + "github.com/restic/restic/internal/fs" + "github.com/restic/restic/internal/restic" + restictest "github.com/restic/restic/internal/test" ) type wrappedFileInfo struct { @@ -39,3 +45,45 @@ func wrapFileInfo(fi os.FileInfo) os.FileInfo { return res } + +func statAndSnapshot(t *testing.T, repo restic.Repository, name string) (*restic.Node, *restic.Node) { + fi := lstat(t, name) + want, err := restic.NodeFromFileInfo(name, fi) + restictest.OK(t, err) + + _, node := snapshot(t, repo, fs.Local{}, nil, name) + return want, node +} + +func TestHardlinkMetadata(t *testing.T) { + defer feature.TestSetFlag(t, feature.Flag, feature.DeviceIDForHardlinks, true)() + + files := TestDir{ + "testfile": TestFile{ + Content: "foo bar test file", + }, + "linktarget": TestFile{ + Content: "test file", + }, + "testlink": TestHardlink{ + Target: "./linktarget", + }, + "testdir": TestDir{}, + } + + tempdir, repo := prepareTempdirRepoSrc(t, files) + + back := restictest.Chdir(t, tempdir) + defer back() + + want, node := statAndSnapshot(t, repo, "testlink") + restictest.Assert(t, node.DeviceID == want.DeviceID, "device id mismatch expected %v got %v", want.DeviceID, node.DeviceID) + restictest.Assert(t, node.Links == want.Links, "link count mismatch expected %v got %v", want.Links, node.Links) + restictest.Assert(t, node.Inode == want.Inode, "inode mismatch expected %v got %v", want.Inode, node.Inode) + + _, node = statAndSnapshot(t, repo, "testfile") + restictest.Assert(t, node.DeviceID == 0, "device id mismatch for testfile expected %v got %v", 0, node.DeviceID) + + _, node = statAndSnapshot(t, repo, "testdir") + restictest.Assert(t, node.DeviceID == 0, "device id mismatch for testdir expected %v got %v", 0, node.DeviceID) +} diff --git a/internal/archiver/testing.go b/internal/archiver/testing.go index 111c1e68c..0bbd03a72 100644 --- a/internal/archiver/testing.go +++ b/internal/archiver/testing.go @@ -6,6 +6,7 @@ import ( "path" "path/filepath" "runtime" + "sort" "strings" "testing" "time" @@ -63,11 +64,29 @@ func (s TestSymlink) String() string { return "" } +// TestHardlink describes a hardlink created for a test. +type TestHardlink struct { + Target string +} + +func (s TestHardlink) String() string { + return "" +} + // TestCreateFiles creates a directory structure described by dir at target, // which must already exist. On Windows, symlinks aren't created. func TestCreateFiles(t testing.TB, target string, dir TestDir) { t.Helper() - for name, item := range dir { + + // ensure a stable order such that it can be guaranteed that a hardlink target already exists + var names []string + for name := range dir { + names = append(names, name) + } + sort.Strings(names) + + for _, name := range names { + item := dir[name] targetPath := filepath.Join(target, name) switch it := item.(type) { @@ -81,6 +100,11 @@ func TestCreateFiles(t testing.TB, target string, dir TestDir) { if err != nil { t.Fatal(err) } + case TestHardlink: + err := fs.Link(filepath.Join(target, filepath.FromSlash(it.Target)), targetPath) + if err != nil { + t.Fatal(err) + } case TestDir: err := fs.Mkdir(targetPath, 0755) if err != nil { From 21cf38fe96570e073b163a96ff6dedae073ad041 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Thu, 28 Mar 2024 19:25:52 +0100 Subject: [PATCH 5/6] add changelog for deviceID only for hardlinks --- changelog/unreleased/pull-4006 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 changelog/unreleased/pull-4006 diff --git a/changelog/unreleased/pull-4006 b/changelog/unreleased/pull-4006 new file mode 100644 index 000000000..01f4ddb6e --- /dev/null +++ b/changelog/unreleased/pull-4006 @@ -0,0 +1,16 @@ +Enhancement: (alpha) Store deviceID only for hardlinks + +Set `RESTIC_FEATURES=device-id-for-hardlinks` to enable this alpha feature. +The feature flag will be removed after repository format version 3 becomes +available or be replaced with a different solution. + +When creating backups from a filesystem snapshot, for example created using +btrfs subvolumes, the deviceID of the filesystem changes compared to previous +snapshots. This prevented restic from deduplicating the directory metadata of +a snapshot. + +When this alpha feature is enabled, then the deviceID is only stored for +hardlinks. This significantly reduces the metadata duplication for most +backups. + +https://github.com/restic/restic/pull/4006 From cf81f8ced63bee4f1d56c95b1e6557af251e2dcb Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Thu, 28 Mar 2024 21:28:01 +0100 Subject: [PATCH 6/6] stats: only check for hardlinks for files with more than one link --- changelog/unreleased/pull-4503 | 1 + cmd/restic/cmd_stats.go | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/changelog/unreleased/pull-4503 b/changelog/unreleased/pull-4503 index 3ce5c48e8..b52552d69 100644 --- a/changelog/unreleased/pull-4503 +++ b/changelog/unreleased/pull-4503 @@ -4,4 +4,5 @@ If files on different devices had the same inode id, then the `stats` command did not correctly calculate the snapshot size. This has been fixed. https://github.com/restic/restic/pull/4503 +https://github.com/restic/restic/pull/4006 https://forum.restic.net/t/possible-bug-in-stats/6461/8 diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index d3078a419..b84620bab 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -270,11 +270,14 @@ func statsWalkTree(repo restic.Loader, opts StatsOptions, stats *statsContainer, // will still be restored stats.TotalFileCount++ - // if inodes are present, only count each inode once - // (hard links do not increase restore size) - if !hardLinkIndex.Has(node.Inode, node.DeviceID) || node.Inode == 0 { - hardLinkIndex.Add(node.Inode, node.DeviceID, struct{}{}) + if node.Links == 1 || node.Type == "dir" { stats.TotalSize += node.Size + } else { + // if hardlinks are present only count each deviceID+inode once + if !hardLinkIndex.Has(node.Inode, node.DeviceID) || node.Inode == 0 { + hardLinkIndex.Add(node.Inode, node.DeviceID, struct{}{}) + stats.TotalSize += node.Size + } } }