chore: add registry health check back
This commit is contained in:
parent
c3822eafe5
commit
12e0eabb2c
204
.github/scripts/check-registry-site-health.sh
vendored
Normal file
204
.github/scripts/check-registry-site-health.sh
vendored
Normal file
@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env bash
|
||||
set -o pipefail
|
||||
set -u
|
||||
|
||||
VERBOSE="${VERBOSE:-0}"
|
||||
if [[ "${VERBOSE}" -ne "0" ]]; then
|
||||
set -x
|
||||
fi
|
||||
|
||||
# List of required environment variables
|
||||
required_vars=(
|
||||
"INSTATUS_API_KEY"
|
||||
"INSTATUS_PAGE_ID"
|
||||
"INSTATUS_COMPONENT_ID"
|
||||
"VERCEL_API_KEY"
|
||||
)
|
||||
|
||||
# Check if each required variable is set
|
||||
for var in "${required_vars[@]}"; do
|
||||
if [[ -z "${!var:-}" ]]; then
|
||||
echo "Error: Environment variable '$var' is not set."
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
REGISTRY_BASE_URL="${REGISTRY_BASE_URL:-https://registry.coder.com}"
|
||||
|
||||
status=0
|
||||
declare -a modules=()
|
||||
declare -a failures=()
|
||||
|
||||
# Collect all module directories containing a main.tf file
|
||||
for path in $(find . -maxdepth 2 -not -path '*/.*' -type f -name main.tf | cut -d '/' -f 2 | sort -u); do
|
||||
modules+=("${path}")
|
||||
done
|
||||
|
||||
echo "Checking modules: ${modules[*]}"
|
||||
|
||||
# Function to update the component status on Instatus
|
||||
update_component_status() {
|
||||
local component_status=$1
|
||||
# see https://instatus.com/help/api/components
|
||||
(curl -X PUT "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/components/$INSTATUS_COMPONENT_ID" \
|
||||
-H "Authorization: Bearer $INSTATUS_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"status\": \"$component_status\"}")
|
||||
}
|
||||
|
||||
# Function to create an incident
|
||||
create_incident() {
|
||||
local incident_name="Degraded Service"
|
||||
local message="The following modules are experiencing issues:\n"
|
||||
for i in "${!failures[@]}"; do
|
||||
message+="$((i + 1)). ${failures[$i]}\n"
|
||||
done
|
||||
|
||||
component_status="PARTIALOUTAGE"
|
||||
if ((${#failures[@]} == ${#modules[@]})); then
|
||||
component_status="MAJOROUTAGE"
|
||||
fi
|
||||
# see https://instatus.com/help/api/incidents
|
||||
incident_id=$(curl -s -X POST "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
|
||||
-H "Authorization: Bearer $INSTATUS_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"name\": \"$incident_name\",
|
||||
\"message\": \"$message\",
|
||||
\"components\": [\"$INSTATUS_COMPONENT_ID\"],
|
||||
\"status\": \"INVESTIGATING\",
|
||||
\"notify\": true,
|
||||
\"statuses\": [
|
||||
{
|
||||
\"id\": \"$INSTATUS_COMPONENT_ID\",
|
||||
\"status\": \"PARTIALOUTAGE\"
|
||||
}
|
||||
]
|
||||
}" | jq -r '.id')
|
||||
|
||||
echo "Created incident with ID: $incident_id"
|
||||
}
|
||||
|
||||
# Function to check for existing unresolved incidents
|
||||
check_existing_incident() {
|
||||
# Fetch the latest incidents with status not equal to "RESOLVED"
|
||||
local unresolved_incidents=$(curl -s -X GET "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
|
||||
-H "Authorization: Bearer $INSTATUS_API_KEY" \
|
||||
-H "Content-Type: application/json" | jq -r '.incidents[] | select(.status != "RESOLVED") | .id')
|
||||
|
||||
if [[ -n "$unresolved_incidents" ]]; then
|
||||
echo "Unresolved incidents found: $unresolved_incidents"
|
||||
return 0 # Indicate that there are unresolved incidents
|
||||
else
|
||||
echo "No unresolved incidents found."
|
||||
return 1 # Indicate that no unresolved incidents exist
|
||||
fi
|
||||
}
|
||||
|
||||
force_redeploy_registry() {
|
||||
# These are not secret values; safe to just expose directly in script
|
||||
local VERCEL_TEAM_SLUG="codercom"
|
||||
local VERCEL_TEAM_ID="team_tGkWfhEGGelkkqUUm9nXq17r"
|
||||
local VERCEL_APP="registry"
|
||||
|
||||
local latest_res
|
||||
latest_res=$(
|
||||
curl "https://api.vercel.com/v6/deployments?app=$VERCEL_APP&limit=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID&target=production&state=BUILDING,INITIALIZING,QUEUED,READY" \
|
||||
--fail \
|
||||
--silent \
|
||||
--header "Authorization: Bearer $VERCEL_API_KEY" \
|
||||
--header "Content-Type: application/json"
|
||||
)
|
||||
|
||||
# If we have zero deployments, something is VERY wrong. Make the whole
|
||||
# script exit with a non-zero status code
|
||||
local latest_id
|
||||
latest_id=$(echo "${latest_res}" | jq -r '.deployments[0].uid')
|
||||
if [[ "${latest_id}" = "null" ]]; then
|
||||
echo "Unable to pull any previous deployments for redeployment"
|
||||
echo "Please redeploy the latest deployment manually in Vercel."
|
||||
echo "https://vercel.com/codercom/registry/deployments"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local latest_date_ts_seconds
|
||||
latest_date_ts_seconds=$(echo "${latest_res}" | jq -r '.deployments[0].createdAt/1000|floor')
|
||||
local current_date_ts_seconds
|
||||
current_date_ts_seconds="$(date +%s)"
|
||||
local max_redeploy_interval_seconds=7200 # 2 hours
|
||||
if ((current_date_ts_seconds - latest_date_ts_seconds < max_redeploy_interval_seconds)); then
|
||||
echo "The registry was deployed less than 2 hours ago."
|
||||
echo "Not automatically re-deploying the regitstry."
|
||||
echo "A human reading this message should decide if a redeployment is necessary."
|
||||
echo "Please check the Vercel dashboard for more information."
|
||||
echo "https://vercel.com/codercom/registry/deployments"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local latest_deployment_state
|
||||
latest_deployment_state="$(echo "${latest_res}" | jq -r '.deployments[0].state')"
|
||||
if [[ "${latest_deployment_state}" != "READY" ]]; then
|
||||
echo "Last deployment was not in READY state. Skipping redeployment."
|
||||
echo "A human reading this message should decide if a redeployment is necessary."
|
||||
echo "Please check the Vercel dashboard for more information."
|
||||
echo "https://vercel.com/codercom/registry/deployments"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "============================================================="
|
||||
echo "!!! Redeploying registry with deployment ID: ${latest_id} !!!"
|
||||
echo "============================================================="
|
||||
|
||||
if ! curl -X POST "https://api.vercel.com/v13/deployments?forceNew=1&skipAutoDetectionConfirmation=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID" \
|
||||
--fail \
|
||||
--header "Authorization: Bearer $VERCEL_API_KEY" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "{ \"deploymentId\": \"${latest_id}\", \"name\": \"${VERCEL_APP}\", \"target\": \"production\" }"; then
|
||||
echo "DEPLOYMENT FAILED! Please check the Vercel dashboard for more information."
|
||||
echo "https://vercel.com/codercom/registry/deployments"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check each module's accessibility
|
||||
for module in "${modules[@]}"; do
|
||||
# Trim leading/trailing whitespace from module name
|
||||
module=$(echo "${module}" | xargs)
|
||||
url="${REGISTRY_BASE_URL}/modules/${module}"
|
||||
printf "=== Checking module %s at %s\n" "${module}" "${url}"
|
||||
status_code=$(curl --output /dev/null --head --silent --fail --location "${url}" --retry 3 --write-out "%{http_code}")
|
||||
if ((status_code != 200)); then
|
||||
printf "==> FAIL(%s)\n" "${status_code}"
|
||||
status=1
|
||||
failures+=("${module}")
|
||||
else
|
||||
printf "==> OK(%s)\n" "${status_code}"
|
||||
fi
|
||||
done
|
||||
|
||||
# Determine overall status and update Instatus component
|
||||
if ((status == 0)); then
|
||||
echo "All modules are operational."
|
||||
# set to
|
||||
update_component_status "OPERATIONAL"
|
||||
else
|
||||
echo "The following modules have issues: ${failures[*]}"
|
||||
# check if all modules are down
|
||||
if ((${#failures[@]} == ${#modules[@]})); then
|
||||
update_component_status "MAJOROUTAGE"
|
||||
else
|
||||
update_component_status "PARTIALOUTAGE"
|
||||
fi
|
||||
|
||||
# Check if there is an existing incident before creating a new one
|
||||
if ! check_existing_incident; then
|
||||
create_incident
|
||||
fi
|
||||
|
||||
# If a module is down, force a reployment to try getting things back online
|
||||
# ASAP
|
||||
# EDIT: registry.coder.com is no longer hosted on vercel
|
||||
#force_redeploy_registry
|
||||
fi
|
||||
|
||||
exit "${status}"
|
||||
23
.github/workflows/check-registry-site-health.yaml
vendored
Normal file
23
.github/workflows/check-registry-site-health.yaml
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
# Check modules health on registry.coder.com
|
||||
name: check-registry-site-health
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0,15,30,45 * * * *" # Runs every 15 minutes
|
||||
workflow_dispatch: # Allows manual triggering of the workflow if needed
|
||||
|
||||
jobs:
|
||||
run-script:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Run check.sh
|
||||
run: |
|
||||
./.github/scripts/check.sh
|
||||
env:
|
||||
INSTATUS_API_KEY: ${{ secrets.INSTATUS_API_KEY }}
|
||||
INSTATUS_PAGE_ID: ${{ secrets.INSTATUS_PAGE_ID }}
|
||||
INSTATUS_COMPONENT_ID: ${{ secrets.INSTATUS_COMPONENT_ID }}
|
||||
VERCEL_API_KEY: ${{ secrets.VERCEL_API_KEY }}
|
||||
Loading…
x
Reference in New Issue
Block a user