corso/.github/workflows/longevity_test.yml
Abin Simon fd887f4d04
Add timeouts to CI steps (#4563)
This should help us catch any CI steps that stall. The timeouts should set should be more than enough for most cases and is more than 4x the average time it takes.

https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepstimeout-minutes

Quick tasks: 10m
Medium tasks (eg: linting website, code): 30m
Integration tests: 120m

<!-- PR description-->

---

#### Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x]  No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [x] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* #<issue>

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [x] 💪 Manual
- [ ]  Unit test
- [ ] 💚 E2E
2023-10-27 16:19:40 +00:00

398 lines
14 KiB
YAML

name: Longevity Testing
on:
schedule:
# Run every day at 04:00 GMT (roughly 8pm PST)
- cron: "0 4 * * *"
workflow_dispatch:
inputs:
user:
description: 'User to run longevity test on'
permissions:
# required to retrieve AWS credentials
id-token: write
contents: write
# cancel currently running jobs if a new version of the branch is pushed
concurrency:
group: longevity_testing-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
SetM365App:
uses: alcionai/corso/.github/workflows/accSelector.yaml@main
Longevity-Tests:
needs: [ SetM365App ]
environment: Testing
runs-on: ubuntu-latest
env:
# Need these in the local env so that corso can read them
AZURE_CLIENT_ID: ${{ secrets[needs.SetM365App.outputs.client_id_env] }}
AZURE_CLIENT_SECRET: ${{ secrets[needs.SetM365App.outputs.client_secret_env] }}
AZURE_TENANT_ID: ${{ secrets.TENANT_ID }}
CORSO_PASSPHRASE: ${{ secrets.INTEGRATION_TEST_CORSO_PASSPHRASE }}
# re-used values
CORSO_LOG_DIR: ${{ github.workspace }}/src/testlog
CORSO_LOG_FILE: ${{ github.workspace }}/src/testlog/run-longevity.log
RESTORE_DEST_PFX: Corso_Test_Longevity_
TEST_USER: ${{ github.event.inputs.user != '' && github.event.inputs.user || vars.CORSO_M365_TEST_USER_ID }}
PREFIX: 'longevity'
# Options for retention.
RETENTION_MODE: GOVERNANCE
# Time to retain blobs for in hours.
RETENTION_DURATION: 216
defaults:
run:
working-directory: src
############################################################################
# setup
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # needed to get latest tag
- name: Setup Golang with cache
uses: magnetikonline/action-golang-cache@v4
with:
go-version-file: src/go.mod
- run: |
go build -o longevity-test ./cmd/longevity_test
go build -o s3checker ./cmd/s3checker
- name: Get version string
id: version
run: |
echo version=$(git describe --tags --abbrev=0) | tee -a $GITHUB_OUTPUT
# Checkout the .github directory at the original branch's ref so we have a
# stable view of the actions.
- name: Code Checkout
working-directory: ${{ github.workspace }}
run: |
git checkout ${{ steps.version.outputs.version }}
git checkout ${{ github.ref }} -- .github
- run: go build -o corso
timeout-minutes: 10
- run: mkdir ${CORSO_LOG_DIR}
# Use shorter-lived credentials obtained from assume-role since these
# runs haven't been taking long.
- name: Configure AWS credentials from Test account
uses: aws-actions/configure-aws-credentials@v4
timeout-minutes: 10
with:
role-to-assume: ${{ secrets.AWS_IAM_ROLE }}
role-session-name: integration-testing
aws-region: us-east-1
##########################################################################
# Repository commands
- name: Version Test
timeout-minutes: 10
run: |
./corso --version | grep -c 'Corso version:'
- name: Repo init test
id: repo-init
timeout-minutes: 10
run: |
set -euo pipefail
echo -e "\nRepo init test\n" >> ${{ env.CORSO_LOG_FILE }}
./corso repo init s3 \
--no-stats \
--hide-progress \
--retention-mode $(echo "${{ env.RETENTION_MODE }}" | tr '[:upper:]' '[:lower:]') \
--retention-duration "${{ env.RETENTION_DURATION }}h" \
--extend-retention \
--prefix ${{ env.PREFIX }} \
--bucket ${{ secrets.CI_RETENTION_TESTS_S3_BUCKET }} \
--succeed-if-exists \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/gotest-repo-init.log
if grep -q 'Failed to' ${{ env.CORSO_LOG_DIR }}/gotest-repo-init.log
then
echo "Repo could not be initialized"
exit 1
fi
- name: Repo connect test
timeout-minutes: 10
run: |
set -euo pipefail
echo -e "\nRepo connect test\n" >> ${{ env.CORSO_LOG_FILE }}
./corso repo connect s3 \
--no-stats \
--hide-progress \
--prefix ${{ env.PREFIX }} \
--bucket ${{ secrets.CI_RETENTION_TESTS_S3_BUCKET }} \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/gotest-repo-connect.log
if ! grep -q 'Connected to S3 bucket' ${{ env.CORSO_LOG_DIR }}/gotest-repo-connect.log
then
echo "Repo could not be connected"
exit 1
fi
##########################################################################
# Exchange
- name: Backup exchange test
id: exchange-test
timeout-minutes: 30
run: |
echo -e "\nBackup Exchange test\n" >> ${CORSO_LOG_FILE}
./corso backup create exchange \
--no-stats \
--mailbox "${TEST_USER}" \
--hide-progress \
--json \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/backup_exchange.txt
resultjson=$(sed -e '1,/Completed Backups/d' ${{ env.CORSO_LOG_DIR }}/backup_exchange.txt )
if [[ $( echo $resultjson | jq -r '.[0] | .stats.errorCount') -ne 0 ]]; then
echo "backup was not successful"
exit 1
fi
data=$( echo $resultjson | jq -r '.[0] | .id' )
echo result=$data >> $GITHUB_OUTPUT
##########################################################################
# Onedrive
- name: Backup onedrive test
id: onedrive-test
timeout-minutes: 30
run: |
set -euo pipefail
echo -e "\nBackup OneDrive test\n" >> ${CORSO_LOG_FILE}
./corso backup create onedrive \
--no-stats \
--hide-progress \
--user "${TEST_USER}" \
--json \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/backup_onedrive.txt
resultjson=$(sed -e '1,/Completed Backups/d' ${{ env.CORSO_LOG_DIR }}/backup_onedrive.txt )
if [[ $( echo $resultjson | jq -r '.[0] | .stats.errorCount') -ne 0 ]]; then
echo "backup was not successful"
exit 1
fi
data=$( echo $resultjson | jq -r '.[0] | .id' )
echo result=$data >> $GITHUB_OUTPUT
##########################################################################
# Sharepoint test
- name: Backup sharepoint test
id: sharepoint-test
timeout-minutes: 30
run: |
set -euo pipefail
echo -e "\nBackup SharePoint test\n" >> ${CORSO_LOG_FILE}
./corso backup create sharepoint \
--no-stats \
--hide-progress \
--site "${{ vars.CORSO_M365_TEST_SITE_URL }}" \
--json \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/backup_sharepoint.txt
resultjson=$(sed -e '1,/Completed Backups/d' ${{ env.CORSO_LOG_DIR }}/backup_sharepoint.txt )
if [[ $( echo $resultjson | jq -r '.[0] | .stats.errorCount') -ne 0 ]]; then
echo "backup was not successful"
exit 1
fi
data=$( echo $resultjson | jq -r '.[0] | .id' )
echo result=$data >> $GITHUB_OUTPUT
##########################################################################
# Backup Exchange Deletion test
- name: Backup Delete exchange test
id: delete-exchange-test
timeout-minutes: 30
env:
SERVICE: "exchange"
DELETION_DAYS: 10
run: |
set -euo pipefail
echo -e "\nDelete Backup exchange \n" >> ${CORSO_LOG_FILE}
./longevity-test
##########################################################################
# Backup Onedrive Deletion test
- name: Backup Delete onedrive test
id: delete-onedrive-test
timeout-minutes: 30
env:
SERVICE: "onedrive"
DELETION_DAYS: 10
run: |
set -euo pipefail
echo -e "\nDelete Backup onedrive \n" >> ${CORSO_LOG_FILE}
./longevity-test
##########################################################################
# Backup Sharepoint Deletion test
- name: Backup Delete Sharepoint test
id: delete-sharepoint-test
timeout-minutes: 30
env:
SERVICE: "sharepoint"
DELETION_DAYS: 5
run: |
set -euo pipefail
echo -e "\nDelete Backup sharepoint \n" >> ${CORSO_LOG_FILE}
./longevity-test
##########################################################################
# Export OneDrive Test
- name: OneDrive Export test
timeout-minutes: 30
run: |
set -euo pipefail
echo -e "\Export OneDrive test\n" >> ${CORSO_LOG_FILE}
echo -e "\Export OneDrive test - first entry\n" >> ${CORSO_LOG_FILE}
./corso backup list onedrive 2>/dev/null | tail -n+2 | head -n1 | awk '{print $1}' |
while read -r line; do
./corso export onedrive \
"/tmp/corso-export--$line" \
--no-stats \
--backup "$line" \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/export_onedrive_first.txt
done
echo -e "\Export OneDrive test - last entry\n" >> ${CORSO_LOG_FILE}
./corso backup list onedrive 2>/dev/null | tail -n1 | awk '{print $1}' |
while read -r line; do
./corso export onedrive \
"/tmp/corso-export--$line" \
--no-stats \
--backup "$line" \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/export_onedrive_last.txt
done
##########################################################################
# Export SharePoint Test
- name: SharePoint Export test
timeout-minutes: 30
run: |
set -euo pipefail
echo -e "\Export SharePoint test\n" >> ${CORSO_LOG_FILE}
echo -e "\Export SharePoint test - first entry\n" >> ${CORSO_LOG_FILE}
./corso backup list sharepoint 2>/dev/null | tail -n+2 | head -n1 | awk '{print $1}' |
while read -r line; do
./corso export sharepoint \
"/tmp/corso-export--$line" \
--no-stats \
--backup "$line" \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/export_sharepoint_first.txt
done
echo -e "\Export SharePoint test - last entry\n" >> ${CORSO_LOG_FILE}
./corso backup list sharepoint 2>/dev/null | tail -n1 | awk '{print $1}' |
while read -r line; do
./corso export sharepoint \
"/tmp/corso-export--$line" \
--no-stats \
--backup "$line" \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/export_sharepoint_last.txt
done
##########################################################################
# Maintenance test
- name: Maintenance test Daily
id: maintenance-test-daily
timeout-minutes: 30
run: |
set -euo pipefail
echo -e "\n Maintenance test Daily\n" >> ${CORSO_LOG_FILE}
# Run with the force flag so it doesn't fail if the github runner
# hostname isn't what's expected. This is only safe because we can
# guarantee only one runner will be executing maintenance at a time.
./corso repo maintenance --mode metadata \
--no-stats \
--hide-progress \
--force \
--json \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/maintenance_metadata.txt
- name: Maintenance test Weekly
id: maintenance-test-weekly
timeout-minutes: 30
run: |
if [[ $(date +%A) == "Saturday" ]]; then
set -euo pipefail
echo -e "\n Maintenance test Weekly\n" >> ${CORSO_LOG_FILE}
./corso repo maintenance --mode complete \
--no-stats \
--hide-progress \
--force \
--json \
2>&1 | tee ${{ env.CORSO_LOG_DIR }}/maintenance_complete.txt
# TODO(ashmrtn): We can also check that non-current versions of
# blobs don't have their retention extended if we want.
#
# Assuming no failures during full maintenance, current versions of
# objects with the below versions should have retention times that
# are roughly (now + RETENTION_DURATION). We can explicitly check
# for this, but leave a little breathing room since maintenance may
# take some time to run.
#
# If we pick a live-retention-duration that is too small then we'll
# start seeing failures. The check for live objects is a lower bound
# check.
#
# Blob prefixes are as follows:
# - kopia.blobcfg - repo-wide config
# - kopia.repository - repo-wide config
# - p - data pack blobs (i.e. file data)
# - q - metadata pack blobs (i.e. manifests, directory listings, etc)
# - x - index blobs
./s3checker \
--bucket ${{ secrets.CI_RETENTION_TESTS_S3_BUCKET }} \
--prefix ${{ env.PREFIX }} \
--retention-mode ${{ env.RETENTION_MODE }} \
--live-retention-duration "$((${{ env.RETENTION_DURATION }}-1))h" \
--object-prefix "kopia.blobcfg" \
--object-prefix "kopia.repository" \
--object-prefix "p" \
--object-prefix "q" \
--object-prefix "x"
fi
##########################################################################
# Logging & Notifications
# Upload the original go test output as an artifact for later review.
- name: Upload test log
if: always()
uses: actions/upload-artifact@v3
with:
name: longevity-test-log
path: src/testlog/*
if-no-files-found: error
retention-days: 14
- name: Notify failure in slack
if: failure()
uses: ./.github/actions/slack-message
with:
msg: "[FAILED] Longevity Test"
slack_url: ${{ secrets.SLACK_WEBHOOK_URL }}