From 12e0eabb2cd094baa6e694de6c61d1a10a725fd0 Mon Sep 17 00:00:00 2001 From: Michael Smith Date: Wed, 16 Apr 2025 22:46:12 +0000 Subject: [PATCH] chore: add registry health check back --- .github/scripts/check-registry-site-health.sh | 204 ++++++++++++++++++ .../workflows/check-registry-site-health.yaml | 23 ++ 2 files changed, 227 insertions(+) create mode 100644 .github/scripts/check-registry-site-health.sh create mode 100644 .github/workflows/check-registry-site-health.yaml diff --git a/.github/scripts/check-registry-site-health.sh b/.github/scripts/check-registry-site-health.sh new file mode 100644 index 00000000..d1bdea97 --- /dev/null +++ b/.github/scripts/check-registry-site-health.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +set -o pipefail +set -u + +VERBOSE="${VERBOSE:-0}" +if [[ "${VERBOSE}" -ne "0" ]]; then + set -x +fi + +# List of required environment variables +required_vars=( + "INSTATUS_API_KEY" + "INSTATUS_PAGE_ID" + "INSTATUS_COMPONENT_ID" + "VERCEL_API_KEY" +) + +# Check if each required variable is set +for var in "${required_vars[@]}"; do + if [[ -z "${!var:-}" ]]; then + echo "Error: Environment variable '$var' is not set." + exit 1 + fi +done + +REGISTRY_BASE_URL="${REGISTRY_BASE_URL:-https://registry.coder.com}" + +status=0 +declare -a modules=() +declare -a failures=() + +# Collect all module directories containing a main.tf file +for path in $(find . -maxdepth 2 -not -path '*/.*' -type f -name main.tf | cut -d '/' -f 2 | sort -u); do + modules+=("${path}") +done + +echo "Checking modules: ${modules[*]}" + +# Function to update the component status on Instatus +update_component_status() { + local component_status=$1 + # see https://instatus.com/help/api/components + (curl -X PUT "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/components/$INSTATUS_COMPONENT_ID" \ + -H "Authorization: Bearer $INSTATUS_API_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"status\": \"$component_status\"}") +} + +# Function to create an incident +create_incident() { + local incident_name="Degraded Service" + local message="The following modules are experiencing issues:\n" + for i in "${!failures[@]}"; do + message+="$((i + 1)). ${failures[$i]}\n" + done + + component_status="PARTIALOUTAGE" + if ((${#failures[@]} == ${#modules[@]})); then + component_status="MAJOROUTAGE" + fi + # see https://instatus.com/help/api/incidents + incident_id=$(curl -s -X POST "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \ + -H "Authorization: Bearer $INSTATUS_API_KEY" \ + -H "Content-Type: application/json" \ + -d "{ + \"name\": \"$incident_name\", + \"message\": \"$message\", + \"components\": [\"$INSTATUS_COMPONENT_ID\"], + \"status\": \"INVESTIGATING\", + \"notify\": true, + \"statuses\": [ + { + \"id\": \"$INSTATUS_COMPONENT_ID\", + \"status\": \"PARTIALOUTAGE\" + } + ] + }" | jq -r '.id') + + echo "Created incident with ID: $incident_id" +} + +# Function to check for existing unresolved incidents +check_existing_incident() { + # Fetch the latest incidents with status not equal to "RESOLVED" + local unresolved_incidents=$(curl -s -X GET "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \ + -H "Authorization: Bearer $INSTATUS_API_KEY" \ + -H "Content-Type: application/json" | jq -r '.incidents[] | select(.status != "RESOLVED") | .id') + + if [[ -n "$unresolved_incidents" ]]; then + echo "Unresolved incidents found: $unresolved_incidents" + return 0 # Indicate that there are unresolved incidents + else + echo "No unresolved incidents found." + return 1 # Indicate that no unresolved incidents exist + fi +} + +force_redeploy_registry() { + # These are not secret values; safe to just expose directly in script + local VERCEL_TEAM_SLUG="codercom" + local VERCEL_TEAM_ID="team_tGkWfhEGGelkkqUUm9nXq17r" + local VERCEL_APP="registry" + + local latest_res + latest_res=$( + curl "https://api.vercel.com/v6/deployments?app=$VERCEL_APP&limit=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID&target=production&state=BUILDING,INITIALIZING,QUEUED,READY" \ + --fail \ + --silent \ + --header "Authorization: Bearer $VERCEL_API_KEY" \ + --header "Content-Type: application/json" + ) + + # If we have zero deployments, something is VERY wrong. Make the whole + # script exit with a non-zero status code + local latest_id + latest_id=$(echo "${latest_res}" | jq -r '.deployments[0].uid') + if [[ "${latest_id}" = "null" ]]; then + echo "Unable to pull any previous deployments for redeployment" + echo "Please redeploy the latest deployment manually in Vercel." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi + + local latest_date_ts_seconds + latest_date_ts_seconds=$(echo "${latest_res}" | jq -r '.deployments[0].createdAt/1000|floor') + local current_date_ts_seconds + current_date_ts_seconds="$(date +%s)" + local max_redeploy_interval_seconds=7200 # 2 hours + if ((current_date_ts_seconds - latest_date_ts_seconds < max_redeploy_interval_seconds)); then + echo "The registry was deployed less than 2 hours ago." + echo "Not automatically re-deploying the regitstry." + echo "A human reading this message should decide if a redeployment is necessary." + echo "Please check the Vercel dashboard for more information." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi + + local latest_deployment_state + latest_deployment_state="$(echo "${latest_res}" | jq -r '.deployments[0].state')" + if [[ "${latest_deployment_state}" != "READY" ]]; then + echo "Last deployment was not in READY state. Skipping redeployment." + echo "A human reading this message should decide if a redeployment is necessary." + echo "Please check the Vercel dashboard for more information." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi + + echo "=============================================================" + echo "!!! Redeploying registry with deployment ID: ${latest_id} !!!" + echo "=============================================================" + + if ! curl -X POST "https://api.vercel.com/v13/deployments?forceNew=1&skipAutoDetectionConfirmation=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID" \ + --fail \ + --header "Authorization: Bearer $VERCEL_API_KEY" \ + --header "Content-Type: application/json" \ + --data-raw "{ \"deploymentId\": \"${latest_id}\", \"name\": \"${VERCEL_APP}\", \"target\": \"production\" }"; then + echo "DEPLOYMENT FAILED! Please check the Vercel dashboard for more information." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi +} + +# Check each module's accessibility +for module in "${modules[@]}"; do + # Trim leading/trailing whitespace from module name + module=$(echo "${module}" | xargs) + url="${REGISTRY_BASE_URL}/modules/${module}" + printf "=== Checking module %s at %s\n" "${module}" "${url}" + status_code=$(curl --output /dev/null --head --silent --fail --location "${url}" --retry 3 --write-out "%{http_code}") + if ((status_code != 200)); then + printf "==> FAIL(%s)\n" "${status_code}" + status=1 + failures+=("${module}") + else + printf "==> OK(%s)\n" "${status_code}" + fi +done + +# Determine overall status and update Instatus component +if ((status == 0)); then + echo "All modules are operational." + # set to + update_component_status "OPERATIONAL" +else + echo "The following modules have issues: ${failures[*]}" + # check if all modules are down + if ((${#failures[@]} == ${#modules[@]})); then + update_component_status "MAJOROUTAGE" + else + update_component_status "PARTIALOUTAGE" + fi + + # Check if there is an existing incident before creating a new one + if ! check_existing_incident; then + create_incident + fi + + # If a module is down, force a reployment to try getting things back online + # ASAP + # EDIT: registry.coder.com is no longer hosted on vercel + #force_redeploy_registry +fi + +exit "${status}" diff --git a/.github/workflows/check-registry-site-health.yaml b/.github/workflows/check-registry-site-health.yaml new file mode 100644 index 00000000..df29ca2b --- /dev/null +++ b/.github/workflows/check-registry-site-health.yaml @@ -0,0 +1,23 @@ +# Check modules health on registry.coder.com +name: check-registry-site-health +on: + schedule: + - cron: "0,15,30,45 * * * *" # Runs every 15 minutes + workflow_dispatch: # Allows manual triggering of the workflow if needed + +jobs: + run-script: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run check.sh + run: | + ./.github/scripts/check.sh + env: + INSTATUS_API_KEY: ${{ secrets.INSTATUS_API_KEY }} + INSTATUS_PAGE_ID: ${{ secrets.INSTATUS_PAGE_ID }} + INSTATUS_COMPONENT_ID: ${{ secrets.INSTATUS_COMPONENT_ID }} + VERCEL_API_KEY: ${{ secrets.VERCEL_API_KEY }}