From 1fd640a7549489c64d2baa4a9b5cc0bb1a1ce0ee Mon Sep 17 00:00:00 2001 From: Niraj Tolia Date: Mon, 30 Jan 2023 17:53:02 -0800 Subject: [PATCH] Add docs page for fault-tolerance (#2313) ## Description Add docs on Corso's fault tolerance and reliability ## Does this PR need a docs update or release note? - [x] :white_check_mark: Yes, it's included ## Type of change - [x] :world_map: Documentation --- CHANGELOG.md | 4 +++ website/docs/setup/fault-tolerance.md | 48 +++++++++++++++++++++++++++ website/sidebars.js | 4 +-- website/styles/Vocab/Base/accept.txt | 3 +- 4 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 website/docs/setup/fault-tolerance.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 16b893cc6..f4dd1f52c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] (alpha) +### Added + +- Document Corso's fault-tolerance and restartability features + ## [v0.2.0] (alpha) - 2023-1-29 ### Fixed diff --git a/website/docs/setup/fault-tolerance.md b/website/docs/setup/fault-tolerance.md new file mode 100644 index 000000000..1771f600a --- /dev/null +++ b/website/docs/setup/fault-tolerance.md @@ -0,0 +1,48 @@ +# Fault tolerance + +Given the millions of objects found in a typical Microsoft 365 tenant, +Corso is optimized for high-performance processing, hardened to +tolerate transient failures and, most importantly, able to restart backups. + +Corso’s fault-tolerance architecture is motivated by the Microsoft Graph +API’s variable performance and throttling. Corso follows Microsoft’s +recommended best practices (for example, [correctly decorating API +traffic](https://learn.microsoft.com/en-us/sharepoint/dev/general-development/how-to-avoid-getting-throttled-or-blocked-in-sharepoint-online#how-to-decorate-your-http-traffic)) +and, in addition, implements a number of optimizations to improve +backup and restore reliability. 
+ +## Recovery from transient failures + +Corso, at the HTTP layer, will retry requests (after an HTTP timeout, +for example) and will respect the Graph API’s directives such as the +`retry-after` header to back off when needed. This allows backups to +succeed in the face of transient or temporary failures. + +## Restarting from permanent API failures + +The Graph API can, for internal reasons, exhibit extended periods of +failures for particular Graph objects. In this scenario, bounded retries +will be ineffective. Unless invoked with the +fail-fast option, Corso will skip over these failing objects. For +backups, it will move forward with backing up other objects belonging +to the user and, for restores, it will continue with trying to restore +any remaining objects. If a multi-user backup is in progress (via `*` +or by specifying multiple users with the `--user` argument), Corso will +also continue processing backups for the remaining users. In both +cases, Corso will exit with a non-zero exit code to reflect incomplete +backups or restores. + +On subsequent backup attempts, Corso will try to +minimize the work involved. If the previous backup was successful and +Corso’s stored state tokens haven’t expired, it will use [delta +queries](https://learn.microsoft.com/en-us/graph/delta-query-overview), +wherever supported, to perform incremental backups. + +If the previous backup for a user had resulted in a failure, Corso +uses a variety of fallback mechanisms to reduce the amount of data +downloaded and reduce the number of objects enumerated. For example, with +OneDrive, Corso won't redo downloads of data from Microsoft 365 or +uploads of data to the Corso repository if it had successfully backed +up that OneDrive file as a part of a previously incomplete and failed +backup. Even if the Graph API might not allow Corso to skip +downloading data, Corso can still skip uploading it to the repository again. 
diff --git a/website/sidebars.js b/website/sidebars.js index fd92ea50e..2403412b7 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -19,8 +19,8 @@ const sidebars = { 'quickstart', { type: 'category', - label: 'Corso setup', - items: ['setup/concepts', 'setup/download', 'setup/m365-access', 'setup/configuration', 'setup/repos'], + label: 'Usage', + items: ['setup/concepts', 'setup/download', 'setup/m365-access', 'setup/configuration', 'setup/repos', 'setup/fault-tolerance'], }, { type: 'category', diff --git a/website/styles/Vocab/Base/accept.txt b/website/styles/Vocab/Base/accept.txt index eae43d149..69565b8a5 100644 --- a/website/styles/Vocab/Base/accept.txt +++ b/website/styles/Vocab/Base/accept.txt @@ -36,4 +36,5 @@ Atlassian SLAs runbooks stdout -stderr \ No newline at end of file +stderr +backoff \ No newline at end of file