registry/.github/scripts/check_registry_site_health.sh

#!/usr/bin/env bash
set -o pipefail
set -u

# Opt-in command tracing: VERBOSE defaults to 0 when unset or empty, and any
# value that evaluates to a non-zero number switches on `set -x`.
# NOTE(review): tracing prints every expanded command, which would include the
# Authorization headers passed to curl in this script - confirm the CI log
# masks those secrets before enabling it.
VERBOSE="${VERBOSE:-0}"
if ((VERBOSE != 0)); then
    set -x
fi

# Environment variables the script refuses to run without.
# NOTE(review): VERCEL_API_KEY is only read by force_redeploy_registry, whose
# single call site in this script is commented out - confirm whether it still
# needs to be mandatory.
required_vars=(
    "INSTATUS_API_KEY"
    "INSTATUS_PAGE_ID"
    "INSTATUS_COMPONENT_ID"
    "VERCEL_API_KEY"
)

# Report every missing (unset or empty) variable in a single pass so the
# operator can fix them all at once, then abort. Diagnostics go to stderr.
missing_vars=0
for var in "${required_vars[@]}"; do
    # ${!var:-} expands the variable whose name is stored in $var; the `:-`
    # keeps `set -u` from aborting when that variable is unset.
    if [[ -z "${!var:-}" ]]; then
        echo "Error: Environment variable '$var' is not set." >&2
        missing_vars=1
    fi
done
if ((missing_vars != 0)); then
    exit 1
fi

# Base URL of the site whose module pages are probed; overridable via the
# environment.
REGISTRY_BASE_URL="${REGISTRY_BASE_URL:-https://registry.coder.com}"

# status   - overall result, later used as the script's exit code
# modules  - names of all module directories discovered below
# failures - names of the modules whose page check failed
status=0
declare -a modules=()
declare -a failures=()

# Collect all module directories containing a main.tf file.
# -mindepth 2 keeps a main.tf sitting directly in the working directory from
# being reported as a module named "main.tf"; hidden directories are skipped.
# Names are read line by line (rather than word-split from a command
# substitution) so directory names containing spaces or glob characters
# survive intact.
while IFS= read -r path; do
    modules+=("${path}")
done < <(find . -mindepth 2 -maxdepth 2 -not -path '*/.*' -type f -name main.tf | cut -d '/' -f 2 | sort -u)

echo "Checking modules: ${modules[*]}"

#######################################
# Set the status of the Instatus component.
# Globals:   INSTATUS_PAGE_ID, INSTATUS_COMPONENT_ID, INSTATUS_API_KEY (read)
# Arguments: $1 - status value to send; this script passes OPERATIONAL,
#                 PARTIALOUTAGE or MAJOROUTAGE
# Outputs:   whatever curl prints (its output is not silenced)
# Returns:   curl's exit status
#######################################
update_component_status() {
    local new_status=$1
    local endpoint="https://api.instatus.com/v1/$INSTATUS_PAGE_ID/components/$INSTATUS_COMPONENT_ID"
    local -a request=(
        -X PUT "$endpoint"
        -H "Authorization: Bearer $INSTATUS_API_KEY"
        -H "Content-Type: application/json"
        -d "{\"status\": \"$new_status\"}"
    )
    # see https://instatus.com/help/api/components
    curl "${request[@]}"
}

#######################################
# Open an Instatus incident listing the failing modules.
# Globals:   failures, modules (read)
#            INSTATUS_PAGE_ID, INSTATUS_COMPONENT_ID, INSTATUS_API_KEY (read)
# Arguments: none
# Outputs:   "Created incident with ID: <id>" on stdout; an error on stderr
#            when no incident ID comes back
# Returns:   0 when an incident ID was returned, 1 otherwise
#######################################
create_incident() {
    local incident_name="Degraded Service"
    # The "\n" sequences stay as a literal backslash + n inside the double
    # quotes, which is exactly the JSON escape for a newline in the payload.
    local message="The following modules are experiencing issues:\n"
    local i
    for i in "${!failures[@]}"; do
        message+="$((i + 1)). ${failures[$i]}\n"
    done

    # Every module failing is a major outage; anything less is partial. This
    # value is what gets reported for the component in the incident payload.
    local component_status="PARTIALOUTAGE"
    if ((${#failures[@]} == ${#modules[@]})); then
        component_status="MAJOROUTAGE"
    fi

    # see https://instatus.com/help/api/incidents
    local incident_id=""
    incident_id=$(curl -s -X POST "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
        -H "Authorization: Bearer $INSTATUS_API_KEY" \
        -H "Content-Type: application/json" \
        -d "{
            \"name\": \"$incident_name\",
            \"message\": \"$message\",
            \"components\": [\"$INSTATUS_COMPONENT_ID\"],
            \"status\": \"INVESTIGATING\",
            \"notify\": true,
            \"statuses\": [
                {
                    \"id\": \"$INSTATUS_COMPONENT_ID\",
                    \"status\": \"$component_status\"
                }
            ]
        }" | jq -r '.id') || incident_id=""

    # jq prints "null" when the response carries no id (e.g. an API error).
    if [[ -z "${incident_id}" || "${incident_id}" == "null" ]]; then
        echo "Error: failed to create incident (no incident ID returned)." >&2
        return 1
    fi

    echo "Created incident with ID: $incident_id"
}

#######################################
# Check whether Instatus already has an incident that is not RESOLVED.
# Globals:   INSTATUS_PAGE_ID, INSTATUS_API_KEY (read)
# Arguments: none
# Outputs:   a summary line on stdout; a warning on stderr when the lookup
#            or the parsing of its response fails
# Returns:   0 when at least one unresolved incident exists; 1 when none
#            exist OR when the lookup failed (so the caller still opens an
#            incident rather than staying silent)
#######################################
check_existing_incident() {
    # Fetch and parse in separate steps, with declaration split from
    # assignment, so a failure in either step is visible instead of being
    # masked by `local`.
    local response
    if ! response=$(curl -s -X GET "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
        -H "Authorization: Bearer $INSTATUS_API_KEY" \
        -H "Content-Type: application/json"); then
        echo "Warning: could not fetch incidents from Instatus; assuming none are open." >&2
        return 1
    fi

    # Keep only the IDs of incidents whose status is not "RESOLVED"
    local unresolved_incidents
    if ! unresolved_incidents=$(jq -r '.incidents[] | select(.status != "RESOLVED") | .id' <<<"$response"); then
        echo "Warning: could not parse the Instatus incidents response; assuming none are open." >&2
        return 1
    fi

    if [[ -n "$unresolved_incidents" ]]; then
        echo "Unresolved incidents found: $unresolved_incidents"
        return 0 # Indicate that there are unresolved incidents
    fi

    echo "No unresolved incidents found."
    return 1 # Indicate that no unresolved incidents exist
}

#######################################
# Ask Vercel to redeploy the most recent production deployment of the
# registry. The only call site in this script is currently commented out.
# Globals:   VERCEL_API_KEY (read)
# Arguments: none
# Outputs:   progress and diagnostics on stdout
# Returns:   0 when the redeploy request succeeded. In every other case
#            (no deployment found, latest deployment younger than 2 hours,
#            latest deployment not READY, request failed) the function calls
#            `exit 1`, terminating the whole script.
#######################################
force_redeploy_registry() {
    # These are not secret values; safe to just expose directly in script
    local -r VERCEL_TEAM_SLUG="codercom"
    local -r VERCEL_TEAM_ID="team_tGkWfhEGGelkkqUUm9nXq17r"
    local -r VERCEL_APP="registry"

    # A failed request is normalised to an empty response so that it is
    # handled by the "no deployments" branch below.
    local latest_res=""
    latest_res=$(
        curl "https://api.vercel.com/v6/deployments?app=$VERCEL_APP&limit=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID&target=production&state=BUILDING,INITIALIZING,QUEUED,READY" \
            --fail \
            --silent \
            --header "Authorization: Bearer $VERCEL_API_KEY" \
            --header "Content-Type: application/json"
    ) || latest_res=""

    # If we have zero deployments, something is VERY wrong. Make the whole
    # script exit with a non-zero status code. jq prints "null" for a missing
    # field and nothing at all for an empty response; both mean "no usable
    # deployment".
    local latest_id=""
    latest_id=$(jq -r '.deployments[0].uid' <<<"${latest_res}") || latest_id=""
    if [[ -z "${latest_id}" || "${latest_id}" = "null" ]]; then
        echo "Unable to pull any previous deployments for redeployment"
        echo "Please redeploy the latest deployment manually in Vercel."
        echo "https://vercel.com/codercom/registry/deployments"
        exit 1
    fi

    # createdAt is divided by 1000 to compare it with `date +%s` (seconds).
    local latest_date_ts_seconds
    latest_date_ts_seconds=$(jq -r '.deployments[0].createdAt/1000|floor' <<<"${latest_res}")
    local current_date_ts_seconds
    current_date_ts_seconds="$(date +%s)"
    local -r max_redeploy_interval_seconds=7200 # 2 hours
    if ((current_date_ts_seconds - latest_date_ts_seconds < max_redeploy_interval_seconds)); then
        echo "The registry was deployed less than 2 hours ago."
        echo "Not automatically re-deploying the registry."
        echo "A human reading this message should decide if a redeployment is necessary."
        echo "Please check the Vercel dashboard for more information."
        echo "https://vercel.com/codercom/registry/deployments"
        exit 1
    fi

    # Only a deployment that finished successfully is redeployed.
    local latest_deployment_state
    latest_deployment_state="$(jq -r '.deployments[0].state' <<<"${latest_res}")"
    if [[ "${latest_deployment_state}" != "READY" ]]; then
        echo "Last deployment was not in READY state. Skipping redeployment."
        echo "A human reading this message should decide if a redeployment is necessary."
        echo "Please check the Vercel dashboard for more information."
        echo "https://vercel.com/codercom/registry/deployments"
        exit 1
    fi

    echo "============================================================="
    echo "!!! Redeploying registry with deployment ID: ${latest_id} !!!"
    echo "============================================================="

    if ! curl -X POST "https://api.vercel.com/v13/deployments?forceNew=1&skipAutoDetectionConfirmation=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID" \
        --fail \
        --header "Authorization: Bearer $VERCEL_API_KEY" \
        --header "Content-Type: application/json" \
        --data-raw "{ \"deploymentId\": \"${latest_id}\", \"name\": \"${VERCEL_APP}\", \"target\": \"production\" }"; then
        echo "DEPLOYMENT FAILED! Please check the Vercel dashboard for more information."
        echo "https://vercel.com/codercom/registry/deployments"
        exit 1
    fi
}

# Check each module's accessibility.
# With nothing to probe the run would vacuously report success and mark the
# component OPERATIONAL, so treat an empty module list as an error instead.
if ((${#modules[@]} == 0)); then
    echo "Error: no module directories were found; nothing to check." >&2
    exit 1
fi

for module in "${modules[@]}"; do
    # Trim leading/trailing whitespace from module name (pure parameter
    # expansion: no subprocess, and quotes/backslashes are left untouched)
    module="${module#"${module%%[![:space:]]*}"}"
    module="${module%"${module##*[![:space:]]}"}"
    url="${REGISTRY_BASE_URL}/modules/${module}"
    printf "=== Checking module %s at %s\n" "${module}" "${url}"
    # HEAD request following redirects; only the final HTTP code is captured.
    status_code=$(curl --output /dev/null --head --silent --fail --location "${url}" --retry 3 --write-out "%{http_code}")
    # Compare as a string: an empty or non-numeric value (e.g. curl could not
    # run) must count as a failure, whereas an arithmetic test would error out
    # and fall through to the OK branch.
    if [[ "${status_code}" != "200" ]]; then
        printf "==> FAIL(%s)\n" "${status_code}"
        status=1
        failures+=("${module}")
    else
        printf "==> OK(%s)\n" "${status_code}"
    fi
done

# Publish the overall result to Instatus, then exit with the aggregated status.
if ((status != 0)); then
    echo "The following modules have issues: ${failures[*]}"

    # Every module failing counts as a major outage; anything less is partial.
    outage_level="PARTIALOUTAGE"
    if ((${#failures[@]} == ${#modules[@]})); then
        outage_level="MAJOROUTAGE"
    fi
    update_component_status "${outage_level}"

    # Open a new incident only when no unresolved one already exists.
    check_existing_incident || create_incident

    # If a module is down, force a redeployment to try getting things back
    # online ASAP
    # EDIT: registry.coder.com is no longer hosted on vercel
    #force_redeploy_registry
else
    echo "All modules are operational."
    # Mark the component healthy again.
    update_component_status "OPERATIONAL"
fi

exit "${status}"