diff --git a/registry/coder-labs/modules/claude-self-hosted-runner/README.md b/registry/coder-labs/modules/claude-self-hosted-runner/README.md new file mode 100644 index 00000000..36afd483 --- /dev/null +++ b/registry/coder-labs/modules/claude-self-hosted-runner/README.md @@ -0,0 +1,73 @@ +--- +display_name: Claude Code self-hosted runner +description: Run Anthropic's Claude Code self-hosted runner as a long-lived process inside a Coder workspace, with per-workspace scoped self-eviction so the prebuild reconciler keeps the pool warm. +icon: ../../../../.icons/claude.svg +verified: false +tags: [ai, claude, claude-code, anthropic, runner] +--- + +# Claude Code self-hosted runner + +Drops Anthropic's [Claude Code self-hosted runner](https://docs.anthropic.com/en/docs/claude-code/self-hosted-runners) into any Coder template that has a `coder_agent` and a workspace image with the runner binary installed (`/usr/local/bin/claude self-hosted-runner` by default). + +The module owns the runner script (writes a per-session wrapper that forces `--permission-mode bypassPermissions`, then spawns a detached supervisor that runs the runner in the foreground and POSTs a delete build to self-evict on drain), the agent environment variables it needs, an optional bot-git askpass setup, and a host Docker socket gid fixup. Agent metadata items (lock status, active sessions, runner ID, last poll) are emitted via the `agent_metadata` output for the parent to splat into a `dynamic "metadata"` block. + +The parent template still owns the `coder_agent` itself, the per-workspace scope-restricted self-evict token (minted via the `Mastercard/restapi` provider against an admin bootstrap token), the prebuild preset, and the infra block (`docker_container`, `kubernetes_pod`, etc.). 
+ +> [!IMPORTANT] +> This module is part of the [Claude Code self-hosted runners on Coder](https://coder.com/docs/ai-coder/claude-code-self-hosted-runners) recipe, which currently targets Anthropic's EAP build of the runner. Both the runner binary and the wire contract are still evolving; expect API drift until Anthropic ships GA. + +## Usage + +```tf +module "claude_self_hosted_runner" { + source = "registry.coder.com/coder-labs/claude-self-hosted-runner/coder" + version = "1.0.0" + + agent_id = coder_agent.main.id + workspace_id = data.coder_workspace.me.id + pool_secret = var.pool_secret + self_evict_token = jsondecode(restapi_object.self_evict_token.api_response).key + git_bot_token = var.git_bot_token + capacity = tonumber(data.coder_parameter.capacity.value) +} + +resource "coder_agent" "main" { + # ... arch, os, dir, startup_script_behavior, etc. + + # Static metadata blocks coexist with the dynamic block below; + # Terraform concatenates them on the same coder_agent. + metadata { + display_name = "CPU" + key = "cpu" + script = "top -bn1 | awk '/Cpu/ {print $2 \"%\"}'" + interval = 10 + timeout = 5 + } + + dynamic "metadata" { + for_each = module.claude_self_hosted_runner.agent_metadata + content { + display_name = metadata.value.display_name + key = metadata.value.key + interval = metadata.value.interval + timeout = metadata.value.timeout + script = metadata.value.script + } + } +} +``` + +## What the module does + +- Writes `$HOME/.claude/wrapper.sh` at agent start. The wrapper appends `--permission-mode bypassPermissions` after `"$@"` so unattended sessions never stall on a tool-approval prompt; Claude Code's flag parser is last-occurrence-wins, so this overrides the server-supplied permission mode. +- Sets up the runner's required environment (`CLAUDE_POOL_SECRET`, `CLAUDE_CAPACITY`, `GIT_BOT_TOKEN`, `CODER_SELF_TOKEN`, `CODER_WORKSPACE_ID`) via `coder_env` resources on the agent. 
+- Spawns a `setsid nohup` supervisor that runs the runner in the foreground. When the runner exits on drain, the supervisor POSTs `/api/v2/workspaces/{id}/builds` with `{"transition":"delete"}` to self-evict, so Coder's prebuild reconciler can queue a replacement. +- Wires up a git askpass helper (via `core.askPass`) if `git_bot_token` is supplied so the runner's child claude can `git push` without baking credentials into the image. +- If the parent template mounts the host Docker socket at `/var/run/docker.sock` and the gid does not match the in-container `docker` group, chgrps the socket so the workspace user can use it without sudo. + +## Self-eviction security model + +The `self_evict_token` input is minted by the parent template via the `Mastercard/restapi` provider at template build time, against an admin bootstrap token that lives in Terraform state and is never injected into the workspace. The minted token is scoped to `workspace:delete + workspace:read + template:read + user:read` and allow-listed to this single workspace's UUID. Beyond the reads those scopes grant, a leaked copy can do exactly one thing: delete this one workspace. No read of peer prebuilds, no SSH, no external auth, no git creds. + +The supervisor uses raw `curl` against `/api/v2/workspaces/{id}/builds`, not the `coder delete` CLI. The CLI fetches workspace resources first, which fails against the scoped token whose allow-list intersection excludes peer workspaces. diff --git a/registry/coder-labs/modules/claude-self-hosted-runner/main.tf b/registry/coder-labs/modules/claude-self-hosted-runner/main.tf new file mode 100644 index 00000000..e24db94b --- /dev/null +++ b/registry/coder-labs/modules/claude-self-hosted-runner/main.tf @@ -0,0 +1,175 @@ +terraform { + required_version = ">= 1.5" + + required_providers { + coder = { + source = "coder/coder" + version = ">= 2.13" + } + } +} + +variable "agent_id" { + type = string + description = "The ID of a Coder agent." 
+} + +variable "workspace_id" { + type = string + description = "data.coder_workspace.me.id from the parent template. Used by the supervisor to self-evict via the workspace builds endpoint." +} + +variable "pool_secret" { + type = string + description = "Claude Code self-hosted runner pool secret (from claude.ai)." + sensitive = true +} + +variable "self_evict_token" { + type = string + description = "Per-workspace, scope-restricted Coder API token. Scope = workspace:delete + workspace:read + template:read + user:read, allow_list = this workspace's UUID. A leaked copy can only delete this one workspace. The parent template mints it via the Mastercard/restapi provider at build time." + sensitive = true +} + +variable "git_bot_token" { + type = string + description = "Optional git PAT for the bot identity. Wired through GIT_ASKPASS so the runner's child claude can push without baking credentials into the image." + sensitive = true + default = "" +} + +variable "capacity" { + type = number + description = "Maximum sessions the runner serves at once. The runner locks to one Anthropic user; this caps parallelism within that user's queue." + default = 4 + validation { + condition = var.capacity >= 1 && var.capacity <= 16 + error_message = "capacity must be between 1 and 16." + } +} + +variable "runner_binary_path" { + type = string + description = "Path to the `claude self-hosted-runner` binary inside the workspace." + default = "/usr/local/bin/claude" +} + +variable "claude_binary_path" { + type = string + description = "Path to the Claude Code binary the wrapper execs for each session." + default = "/opt/claude/claude" +} + +variable "order" { + type = number + description = "Order of the runner script in the agent UI." 
+ default = null +} + +resource "coder_env" "pool_secret" { + agent_id = var.agent_id + name = "CLAUDE_POOL_SECRET" + value = var.pool_secret +} + +resource "coder_env" "capacity" { + agent_id = var.agent_id + name = "CLAUDE_CAPACITY" + value = tostring(var.capacity) +} + +resource "coder_env" "git_bot_token" { + agent_id = var.agent_id + name = "GIT_BOT_TOKEN" + value = var.git_bot_token +} + +resource "coder_env" "self_token" { + agent_id = var.agent_id + name = "CODER_SELF_TOKEN" + value = var.self_evict_token +} + +resource "coder_env" "workspace_id" { + agent_id = var.agent_id + name = "CODER_WORKSPACE_ID" + value = var.workspace_id +} + +resource "coder_script" "claude_runner" { + agent_id = var.agent_id + display_name = "Claude self-hosted runner" + icon = "/icon/code.svg" + run_on_start = true + start_blocks_login = false + script = templatefile("${path.module}/scripts/run.sh", { + CLAUDE_BINARY_PATH = var.claude_binary_path + RUNNER_BINARY_PATH = var.runner_binary_path + }) +} + +# Agent metadata items. The parent splats this list into a +# `dynamic "metadata"` block on its own `coder_agent` because nested +# blocks cannot be injected from a module. Scraped from the runner's +# local /healthz and /metrics endpoints; this is the only window a +# Coder admin has into who the Anthropic pool has bound this workspace +# to (the runner does not expose the locked user's email over its +# local endpoints; that lives in claude.ai > Self-hosted runner pools). +output "agent_metadata" { + description = "List of agent metadata items the parent template should splat into a `dynamic \"metadata\"` block on its coder_agent." 
+ value = [ + { + display_name = "Lock status" + key = "0_lock_status" + interval = 10 + timeout = 5 + script = <<-EOT + val=$(curl -fsS http://127.0.0.1:8080/metrics 2>/dev/null \ + | awk '/^claude_code_self_hosted_runner_locked_account[[:space:]]/ {print $2; exit}') + if [ "$val" = "1" ]; then + printf 'locked' + else + printf 'unlocked' + fi + EOT + }, + { + display_name = "Active sessions" + key = "1_active_sessions" + interval = 5 + timeout = 5 + script = <<-EOT + active=$(curl -fsS http://127.0.0.1:8080/healthz 2>/dev/null \ + | jq -r '.active_sessions // empty') + if [ -z "$active" ]; then echo '?'; exit 0; fi + printf '%s / %s' "$active" "$${CLAUDE_CAPACITY:-1}" + EOT + }, + { + display_name = "Runner ID" + key = "2_runner_id" + interval = 30 + timeout = 5 # script prints "(starting)" whenever /healthz is unreachable or reports no runner_id + script = <<-EOT + id=$(curl -fsS http://127.0.0.1:8080/healthz 2>/dev/null | jq -r '.runner_id // empty') + if [ -n "$id" ]; then printf '%s' "$id"; else printf '(starting)'; fi + EOT + }, + { + display_name = "Last Anthropic poll" + key = "3_last_poll" + interval = 15 + timeout = 5 + script = <<-EOT + age=$(curl -fsS http://127.0.0.1:8080/healthz 2>/dev/null \ + | jq -r '.last_poll_age_ms // empty') + if [ -z "$age" ]; then echo '?'; exit 0; fi + if [ "$age" -lt 30000 ]; then + printf 'ok (%sms ago)' "$age" + else + printf 'stale (%ss ago)' $((age/1000)) + fi + EOT + }, + ] +} diff --git a/registry/coder-labs/modules/claude-self-hosted-runner/main.tftest.hcl b/registry/coder-labs/modules/claude-self-hosted-runner/main.tftest.hcl new file mode 100644 index 00000000..d1818bdf --- /dev/null +++ b/registry/coder-labs/modules/claude-self-hosted-runner/main.tftest.hcl @@ -0,0 +1,123 @@ +run "plan_with_required_vars" { + command = plan + + variables { + agent_id = "test-agent" + workspace_id = "test-workspace" + pool_secret = "test-pool-secret" + self_evict_token = "test-self-token" + } + + assert { + condition = length(resource.coder_env.pool_secret.value) > 0 + error_message = "pool_secret env should be set" + } + + assert { + condition = resource.coder_env.capacity.value 
== "4" + error_message = "default capacity should be 4" + } + + assert { + condition = resource.coder_script.claude_runner.display_name == "Claude self-hosted runner" + error_message = "expected the runner coder_script display_name" + } +} + +run "custom_capacity_and_binary_paths" { + command = plan + + variables { + agent_id = "test-agent" + workspace_id = "test-workspace" + pool_secret = "test-pool-secret" + self_evict_token = "test-self-token" + capacity = 8 + claude_binary_path = "/custom/claude" + runner_binary_path = "/custom/runner" + } + + assert { + condition = resource.coder_env.capacity.value == "8" + error_message = "capacity input should flow into CLAUDE_CAPACITY env" + } + + assert { + condition = strcontains(resource.coder_script.claude_runner.script, "/custom/claude") + error_message = "claude_binary_path should appear in the rendered script" + } + + assert { + condition = strcontains(resource.coder_script.claude_runner.script, "/custom/runner") + error_message = "runner_binary_path should appear in the rendered script" + } +} + +run "git_bot_token_optional" { + command = plan + + variables { + agent_id = "test-agent" + workspace_id = "test-workspace" + pool_secret = "test-pool-secret" + self_evict_token = "test-self-token" + } + + assert { + condition = resource.coder_env.git_bot_token.value == "" + error_message = "git_bot_token should default to empty string" + } +} + +run "capacity_validation_rejects_zero" { + command = plan + + variables { + agent_id = "test-agent" + workspace_id = "test-workspace" + pool_secret = "test-pool-secret" + self_evict_token = "test-self-token" + capacity = 0 + } + + expect_failures = [ + var.capacity, + ] +} + +run "capacity_validation_rejects_high" { + command = plan + + variables { + agent_id = "test-agent" + workspace_id = "test-workspace" + pool_secret = "test-pool-secret" + self_evict_token = "test-self-token" + capacity = 17 + } + + expect_failures = [ + var.capacity, + ] +} + +run 
"agent_metadata_output_has_four_items" { + command = apply + + variables { + agent_id = "test-agent" + workspace_id = "test-workspace" + pool_secret = "test-pool-secret" + self_evict_token = "test-self-token" + } + + assert { + condition = length(output.agent_metadata) == 4 + error_message = "agent_metadata should expose four scraping items" + } + + assert { + condition = output.agent_metadata[0].key == "0_lock_status" + error_message = "first metadata item should be lock_status" + } +} diff --git a/registry/coder-labs/modules/claude-self-hosted-runner/scripts/run.sh b/registry/coder-labs/modules/claude-self-hosted-runner/scripts/run.sh new file mode 100644 index 00000000..79ccb1ef --- /dev/null +++ b/registry/coder-labs/modules/claude-self-hosted-runner/scripts/run.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +# Wires up everything the Claude Code self-hosted runner needs at agent +# start, then spawns a detached supervisor that keeps the runner alive +# and self-evicts on drain. +# +# Runtime env (set by coder_env in main.tf): +# CLAUDE_POOL_SECRET Anthropic pool secret (mandatory). +# CLAUDE_CAPACITY Max parallel sessions per runner (default 1). +# GIT_BOT_TOKEN Optional bot PAT for GIT_ASKPASS. +# CODER_SELF_TOKEN Per-workspace scope-restricted Coder API token. +# CODER_WORKSPACE_ID This workspace's UUID, used by self-eviction. +# CODER_AGENT_URL Set by the Coder agent itself. + +set -euo pipefail + +CLAUDE_BINARY_PATH='${CLAUDE_BINARY_PATH}' +RUNNER_BINARY_PATH='${RUNNER_BINARY_PATH}' + +if [ -z "$${CLAUDE_POOL_SECRET:-}" ]; then + echo "CLAUDE_POOL_SECRET is empty. Set the pool_secret input on the module." 
+ exit 1 +fi + +install -d -m 0700 "$HOME/.claude" + +# --- Bot git askpass ---------------------------------------------------- +if [ -n "$${GIT_BOT_TOKEN:-}" ]; then + install -d -m 0700 "$HOME/.git-creds" + cat > "$HOME/.git-creds/askpass.sh" << 'ASK' +#!/bin/sh +printf '%s' "$GIT_BOT_TOKEN" +ASK + chmod 0500 "$HOME/.git-creds/askpass.sh" + git config --global core.askPass "$HOME/.git-creds/askpass.sh" + git config --global credential.helper '' +fi + +# --- Host Docker socket gid fixup -------------------------------------- +if [ -S /var/run/docker.sock ]; then + sock_gid=$(stat -c %g /var/run/docker.sock) + docker_gid=$(getent group docker | cut -d: -f3 || true) + if [ -n "$${docker_gid:-}" ] && [ "$${sock_gid}" != "$${docker_gid}" ]; then + sudo chgrp "$${docker_gid}" /var/run/docker.sock 2> /dev/null || true + fi +fi + +# --- Pool secret on disk ----------------------------------------------- +POOL_SECRET_FILE="$HOME/.claude/pool-secret" +rm -f "$POOL_SECRET_FILE" +umask 077 +printf '%s' "$${CLAUDE_POOL_SECRET}" > "$POOL_SECRET_FILE" +chmod 0400 "$POOL_SECRET_FILE" + +# --- Wrapper script ----------------------------------------------------- +# Runner execs this once per session, appending its server-computed +# flags. Claude Code's flag parser is last-occurrence-wins, so flags +# after "$@" win. Force --permission-mode bypassPermissions so +# unattended sessions never stall on a tool-approval prompt. +WRAPPER="$HOME/.claude/wrapper.sh" +{ + echo '#!/bin/bash' + echo "exec $${CLAUDE_BINARY_PATH} \"\$@\" --permission-mode bypassPermissions" +} > "$WRAPPER" +chmod 0755 "$WRAPPER" + +# --- Supervisor -------------------------------------------------------- +# Runs the runner in the foreground; on runner exit POSTs a delete +# build to self-evict. Raw curl, not `coder delete`: the CLI fetches +# workspace resources first, which fails with the per-workspace +# scoped token whose allow-list excludes peer prebuilds. 
+# +# Unquoted heredoc: the outer shell expands only RUNNER_BINARY_PATH here. +# Every other dollar sign is backslash-escaped, so the supervisor reads +# CODER_SELF_TOKEN, CODER_AGENT_URL, etc. at runtime, under setsid. +SUPERVISOR="$HOME/.claude/supervisor.sh" +cat > "$SUPERVISOR" << SUP +#!/usr/bin/env bash +set -uo pipefail +exec >>"\$HOME/.claude/supervisor.log" 2>&1 +echo "[supervisor] start \$(date -Is)" + +$${RUNNER_BINARY_PATH} self-hosted-runner \\ + --pool-secret-file "\$HOME/.claude/pool-secret" \\ + --capacity "\$${CLAUDE_CAPACITY:-1}" \\ + --log-file "\$HOME/.claude/runner.log" \\ + --exec-path "\$HOME/.claude/wrapper.sh" +echo "[supervisor] runner exited rc=\$? \$(date -Is)" + +if [ -z "\$${CODER_SELF_TOKEN:-}" ]; then + echo "[supervisor] CODER_SELF_TOKEN is empty; skipping self-eviction." + exit 0 +fi + +http_code=\$(curl -s -o /tmp/evict.out -w "%%{http_code}" \\ + -X POST \\ + -H "Coder-Session-Token: \$CODER_SELF_TOKEN" \\ + -H "Content-Type: application/json" \\ + -d '{"transition":"delete"}' \\ + "\$CODER_AGENT_URL/api/v2/workspaces/\$CODER_WORKSPACE_ID/builds") +if [ "\$http_code" = "201" ]; then + echo "[supervisor] self-eviction queued (HTTP 201)." +else + echo "[supervisor] self-eviction failed (HTTP \$http_code): \$(head -c 300 /tmp/evict.out)" +fi +SUP +chmod 0700 "$SUPERVISOR" + +# Detach with setsid + nohup. The supervisor reopens stdout/stderr to +# its own logfile; redirect all standard fds here to /dev/null so this +# script's exit doesn't drag the supervisor with it. +setsid nohup "$SUPERVISOR" < /dev/null > /dev/null 2>&1 & +disown + +echo "Runner spawned as detached supervisor (pid=$!). See ~/.claude/supervisor.log."