From 2e606ca70b0e77b24b2453701f10cca93885f1b4 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 24 Sep 2022 11:57:16 +0200 Subject: [PATCH] backup: rework read concurrency --- changelog/unreleased/pull-2750 | 3 ++- cmd/restic/cmd_backup.go | 21 +++++++-------------- doc/040_backup.rst | 1 + doc/047_tuning_backup_parameters.rst | 10 +++++----- doc/manual_rest.rst | 2 +- internal/archiver/archiver.go | 12 ++++++------ internal/archiver/archiver_test.go | 2 +- 7 files changed, 23 insertions(+), 28 deletions(-) diff --git a/changelog/unreleased/pull-2750 b/changelog/unreleased/pull-2750 index 381c980ba..4dcec2204 100644 --- a/changelog/unreleased/pull-2750 +++ b/changelog/unreleased/pull-2750 @@ -1,5 +1,6 @@ Enhancement: Make backup file read concurrency configurable -In order to tune restic for special situations we need to be able to configure `backup --file-read-concurrency`. +The `backup` command now supports a `--read-concurrency` flag allowing +tuning restic for very fast storage like NVMe disks. 
https://github.com/restic/restic/pull/2750 diff --git a/cmd/restic/cmd_backup.go b/cmd/restic/cmd_backup.go index 49c71c21d..e36471762 100644 --- a/cmd/restic/cmd_backup.go +++ b/cmd/restic/cmd_backup.go @@ -101,7 +101,7 @@ type BackupOptions struct { IgnoreCtime bool UseFsSnapshot bool DryRun bool - FileReadConcurrency uint + ReadConcurrency uint } var backupOptions BackupOptions @@ -110,12 +110,6 @@ var backupOptions BackupOptions var ErrInvalidSourceData = errors.New("at least one source file could not be read") func init() { - //set FileReadConcurrency to 2 if not set in env - fileReadConcurrency, err := strconv.ParseUint(os.Getenv("RESTIC_FILE_READ_CONCURRENCY"), 10, 32) - if err != nil || fileReadConcurrency < 1 { - fileReadConcurrency = 2 - } - cmdRoot.AddCommand(cmdBackup) f := cmdBackup.Flags() @@ -132,10 +126,10 @@ func init() { f.BoolVar(&backupOptions.Stdin, "stdin", false, "read backup from stdin") f.StringVar(&backupOptions.StdinFilename, "stdin-filename", "stdin", "`filename` to use when reading from stdin") f.Var(&backupOptions.Tags, "tag", "add `tags` for the new snapshot in the format `tag[,tag,...]` (can be specified multiple times)") - f.UintVar(&backupOptions.FileReadConcurrency, "file-read-concurrency", uint(fileReadConcurrency), "set concurrency on file reads. (default: $RESTIC_FILE_READ_CONCURRENCY or 2)") + f.UintVar(&backupOptions.ReadConcurrency, "read-concurrency", 0, "read `n` files concurrently. (default: $RESTIC_READ_CONCURRENCY or 2)") f.StringVarP(&backupOptions.Host, "host", "H", "", "set the `hostname` for the snapshot manually. 
To prevent an expensive rescan use the \"parent\" flag") f.StringVar(&backupOptions.Host, "hostname", "", "set the `hostname` for the snapshot manually") - err = f.MarkDeprecated("hostname", "use --host") + err := f.MarkDeprecated("hostname", "use --host") if err != nil { // MarkDeprecated only returns an error when the flag could not be found panic(err) @@ -152,6 +146,9 @@ func init() { f.BoolVar(&backupOptions.UseFsSnapshot, "use-fs-snapshot", false, "use filesystem snapshot where possible (currently only Windows VSS)") } + // parse read concurrency from env, on error the default value will be used + readConcurrency, _ := strconv.ParseUint(os.Getenv("RESTIC_READ_CONCURRENCY"), 10, 32) + backupOptions.ReadConcurrency = uint(readConcurrency) } // filterExisting returns a slice of all existing items, or an error if no @@ -292,10 +289,6 @@ func (opts BackupOptions) Check(gopts GlobalOptions, args []string) error { } } - if backupOptions.FileReadConcurrency == 0 { - return errors.Fatal("--file-read-concurrency must be a positive, nonzero integer") - } - return nil } @@ -697,7 +690,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina } wg.Go(func() error { return sc.Scan(cancelCtx, targets) }) - arch := archiver.New(repo, targetFS, archiver.Options{FileReadConcurrency: backupOptions.FileReadConcurrency}) + arch := archiver.New(repo, targetFS, archiver.Options{ReadConcurrency: backupOptions.ReadConcurrency}) arch.SelectByName = selectByNameFilter arch.Select = selectFilter arch.WithAtime = opts.WithAtime diff --git a/doc/040_backup.rst b/doc/040_backup.rst index 891c6820d..cdd8302c9 100644 --- a/doc/040_backup.rst +++ b/doc/040_backup.rst @@ -555,6 +555,7 @@ environment variables. 
The following lists these environment variables: RESTIC_COMPRESSION Compression mode (only available for repository format version 2) RESTIC_PROGRESS_FPS Frames per second by which the progress bar is updated RESTIC_PACK_SIZE Target size for pack files + RESTIC_READ_CONCURRENCY Concurrency for file reads TMPDIR Location for temporary files diff --git a/doc/047_tuning_backup_parameters.rst b/doc/047_tuning_backup_parameters.rst index 989eea738..0de846e88 100644 --- a/doc/047_tuning_backup_parameters.rst +++ b/doc/047_tuning_backup_parameters.rst @@ -54,11 +54,11 @@ variable ``RESTIC_COMPRESSION``. File Read Concurrency ===================== -In some instances, such as backing up traditional spinning disks, reducing the file read -concurrency is desired. This will help reduce the amount of time spent seeking around -the disk and can increase the overall performance of the backup operation. You can specify -the concurrency of file reads with the ``RESTIC_FILE_READ_CONCURRENCY`` environment variable -or the ``--file-read-concurrency`` flag for the ``backup`` subcommand. +When backing up fast storage like NVMe disks, it can be beneficial to increase the read +concurrency. This can increase the overall performance of the backup operation by reading +more files in parallel. You can specify the concurrency of file reads with the +``RESTIC_READ_CONCURRENCY`` environment variable or the ``--read-concurrency`` flag for +the ``backup`` command. 
Pack Size diff --git a/doc/manual_rest.rst b/doc/manual_rest.rst index 6c44acd89..f2d090209 100644 --- a/doc/manual_rest.rst +++ b/doc/manual_rest.rst @@ -103,7 +103,6 @@ command: --files-from file read the files to backup from file (can be combined with file args; can be specified multiple times) --files-from-raw file read the files to backup from file (can be combined with file args; can be specified multiple times) --files-from-verbatim file read the files to backup from file (can be combined with file args; can be specified multiple times) - --file-read-concurrency uint set concurrency on file reads. (default: $RESTIC_FILE_READ_CONCURRENCY or 2) -f, --force force re-reading the target files/directories (overrides the "parent" flag) -h, --help help for backup -H, --host hostname set the hostname for the snapshot manually. To prevent an expensive rescan use the "parent" flag @@ -113,6 +112,7 @@ command: --ignore-inode ignore inode number changes when checking for modified files -x, --one-file-system exclude other file systems, don't cross filesystem boundaries and subvolumes --parent snapshot use this parent snapshot (default: last snapshot in the repository that has the same target files/directories, and is not newer than the snapshot time) + --read-concurrency n read n files concurrently. (default: $RESTIC_READ_CONCURRENCY or 2) --stdin read backup from stdin --stdin-filename filename filename to use when reading from stdin (default "stdin") --tag tags add tags for the new snapshot in the format `tag[,tag,...]` (can be specified multiple times) (default []) diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 4fcc8e30c..776aedb53 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -95,10 +95,10 @@ const ( // Options is used to configure the archiver. type Options struct { - // FileReadConcurrency sets how many files are read in concurrently. 
If + // ReadConcurrency sets how many files are read in concurrently. If // it's set to zero, at most two files are read in concurrently (which // turned out to be a good default for most situations). - FileReadConcurrency uint + ReadConcurrency uint // SaveBlobConcurrency sets how many blobs are hashed and saved // concurrently. If it's set to zero, the default is the number of CPUs @@ -113,11 +113,11 @@ type Options struct { // ApplyDefaults returns a copy of o with the default options set for all unset // fields. func (o Options) ApplyDefaults() Options { - if o.FileReadConcurrency == 0 { + if o.ReadConcurrency == 0 { // two is a sweet spot for almost all situations. We've done some // experiments documented here: // https://github.com/borgbackup/borg/issues/3500 - o.FileReadConcurrency = 2 + o.ReadConcurrency = 2 } if o.SaveBlobConcurrency == 0 { @@ -132,7 +132,7 @@ func (o Options) ApplyDefaults() Options { // Also allow waiting for FileReadConcurrency files, this is the maximum of FutureFiles // which currently can be in progress. The main backup loop blocks when trying to queue // more files to read. 
- o.SaveTreeConcurrency = uint(runtime.GOMAXPROCS(0)) + o.FileReadConcurrency + o.SaveTreeConcurrency = uint(runtime.GOMAXPROCS(0)) + o.ReadConcurrency } return o @@ -782,7 +782,7 @@ func (arch *Archiver) runWorkers(ctx context.Context, wg *errgroup.Group) { arch.fileSaver = NewFileSaver(ctx, wg, arch.blobSaver.Save, arch.Repo.Config().ChunkerPolynomial, - arch.Options.FileReadConcurrency, arch.Options.SaveBlobConcurrency) + arch.Options.ReadConcurrency, arch.Options.SaveBlobConcurrency) arch.fileSaver.CompleteBlob = arch.CompleteBlob arch.fileSaver.NodeFromFileInfo = arch.nodeFromFileInfo diff --git a/internal/archiver/archiver_test.go b/internal/archiver/archiver_test.go index a6485234f..b5650d1b6 100644 --- a/internal/archiver/archiver_test.go +++ b/internal/archiver/archiver_test.go @@ -2040,7 +2040,7 @@ func TestArchiverAbortEarlyOnError(t *testing.T) { // at most two files may be queued arch := New(testRepo, testFS, Options{ - FileReadConcurrency: 2, + ReadConcurrency: 2, }) _, _, err := arch.Snapshot(ctx, []string{"."}, SnapshotOptions{Time: time.Now()})