Skip to content

Commit f6fce49

Browse files
ryan-williams and claude committed
Implement SSH-based runner setup for Lambda Labs
Lambda doesn't support cloud-init userdata, so we:

1. Launch instance via API
2. Wait for instance to be active with IP
3. SSH in, export env vars, curl script from GitHub, execute

Changes:

- Remove template substitution (user-script.sh.templ deleted)
- Add `execute_setup_via_ssh()` method to StartLambdaLabs
- Simplify runner-setup.sh (remove AWS metadata, CloudWatch, etc.)
- Rewrite tests to use `responses` library instead of boto3/moto
- Update test_main.py for Lambda env vars

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent fc39f59 commit f6fce49

8 files changed

Lines changed: 371 additions & 1152 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ build-backend = "setuptools.build_meta"
1818
where = ["src"]
1919

2020
[tool.setuptools.package-data]
21-
lambda_gha = ["*.templ", "templates/*.templ", "templates/*.sh"]
21+
lambda_gha = ["templates/*.sh", "scripts/*.sh"]
2222

2323
[tool.pytest.ini_options]
2424
markers = ["slow: marks test as slow"]

src/lambda_gha/__main__.py

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -94,16 +94,26 @@ def main():
9494
print(f"Waiting for {len(instance_ids)} instance(s) to be ready...")
9595
details = starter.wait_until_ready(instance_ids)
9696

97-
# TODO: SSH into instances and run setup scripts
98-
# For now, we'll need to handle this differently since Lambda
99-
# doesn't support cloud-init userdata. Options:
100-
# 1. SSH in from this action and run setup
101-
# 2. Use a pre-baked AMI with runner pre-installed
102-
# 3. Have a separate "setup" step in the workflow
103-
97+
# SSH into each instance and run setup
10498
for instance_id, meta in mapping.items():
10599
instance_details = details.get(instance_id, {})
106-
print(f"Instance {instance_id}: IP={instance_details.get('ip')}, label={meta['labels']}")
100+
ip = instance_details.get("ip")
101+
if not ip:
102+
raise RuntimeError(f"No IP address for instance {instance_id}")
103+
104+
print(f"Instance {instance_id}: IP={ip}, label={meta['labels']}")
105+
106+
# Add instance IP to env vars
107+
env_vars = meta["env_vars"]
108+
env_vars["LAMBDA_INSTANCE_IP"] = ip
109+
110+
# Execute setup via SSH
111+
starter.execute_setup_via_ssh(
112+
instance_id=instance_id,
113+
ip=ip,
114+
env_vars=env_vars,
115+
action_sha=meta["action_sha"],
116+
)
107117

108118
# Output mapping for GitHub Actions
109119
starter.set_instance_mapping(mapping)

src/lambda_gha/scripts/runner-setup.sh

Lines changed: 44 additions & 202 deletions
Original file line number | Diff line number | Diff line change
@@ -13,38 +13,12 @@ if [ "$debug" = "true" ] || [ "$debug" = "True" ] || [ "$debug" = "trace" ] || [
1313
set -x
1414
fi
1515

16-
# Determine home directory early since it's needed by shared functions
17-
if [ -z "$homedir" ] || [ "$homedir" = "AUTO" ]; then
18-
# Try to find the default non-root user's home directory
19-
for user in ubuntu ec2-user centos admin debian fedora alpine arch; do
20-
if id "$user" &>/dev/null; then
21-
homedir="/home/$user"
22-
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Auto-detected homedir: $homedir" | tee -a /var/log/runner-setup.log
23-
break
24-
fi
25-
done
26-
27-
# Fallback if no standard user found
28-
if [ -z "$homedir" ] || [ "$homedir" = "AUTO" ]; then
29-
homedir=$(getent passwd | awk -F: '$3 >= 1000 && $3 < 65534 && $6 ~ /^\/home\// {print $6}' | while read dir; do
30-
if [ -d "$dir" ]; then
31-
echo "$dir"
32-
break
33-
fi
34-
done)
35-
if [ -z "$homedir" ]; then
36-
homedir="/home/ec2-user" # Ultimate fallback
37-
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Using fallback homedir: $homedir" | tee -a /var/log/runner-setup.log
38-
else
39-
owner=$(stat -c "%U" "$homedir" 2>/dev/null || stat -f "%Su" "$homedir" 2>/dev/null)
40-
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Detected homedir: $homedir (owner: $owner)" | tee -a /var/log/runner-setup.log
41-
fi
42-
fi
43-
else
44-
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Using specified homedir: $homedir" | tee -a /var/log/runner-setup.log
45-
fi
16+
# Lambda instances are Ubuntu-based with ubuntu user
17+
homedir="${homedir:-/home/ubuntu}"
4618
export homedir
4719

20+
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Using homedir: $homedir" | tee -a /var/log/runner-setup.log
21+
4822
# Set common paths
4923
BIN_DIR=/usr/local/bin
5024
RUNNER_STATE_DIR=/var/run/github-runner
@@ -55,18 +29,23 @@ echo "[$(date '+%Y-%m-%d %H:%M:%S')] Fetching shared functions from GitHub (SHA:
5529
FUNCTIONS_URL="https://raw.githubusercontent.com/Open-Athena/lambda-gha/${action_sha}/src/lambda_gha/templates/shared-functions.sh"
5630
if ! curl -sSL "$FUNCTIONS_URL" -o /tmp/shared-functions.sh && ! wget -q "$FUNCTIONS_URL" -O /tmp/shared-functions.sh; then
5731
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to download shared functions" | tee -a /var/log/runner-setup.log
58-
shutdown -h now
32+
# Terminate via Lambda API
33+
curl -s -X POST -H "Authorization: Bearer $LAMBDA_API_KEY" -H "Content-Type: application/json" \
34+
-d "{\"instance_ids\": [\"$LAMBDA_INSTANCE_ID\"]}" \
35+
"https://cloud.lambdalabs.com/api/v1/instance-operations/terminate" || true
5936
exit 1
6037
fi
6138

6239
# Write shared functions that will be used by multiple scripts
6340
cat > $BIN_DIR/runner-common.sh << EOSF
6441
# Auto-generated shared functions and variables
65-
# Set homedir for scripts that source this file
6642
homedir="$homedir"
6743
debug="$debug"
6844
RUNNER_STATE_DIR="$RUNNER_STATE_DIR"
69-
export homedir debug RUNNER_STATE_DIR
45+
LAMBDA_API_KEY="$LAMBDA_API_KEY"
46+
LAMBDA_INSTANCE_ID="$LAMBDA_INSTANCE_ID"
47+
LAMBDA_INSTANCE_IP="$LAMBDA_INSTANCE_IP"
48+
export homedir debug RUNNER_STATE_DIR LAMBDA_API_KEY LAMBDA_INSTANCE_ID LAMBDA_INSTANCE_IP
7049
7150
EOSF
7251

@@ -76,8 +55,8 @@ cat /tmp/shared-functions.sh >> $BIN_DIR/runner-common.sh
7655
chmod +x $BIN_DIR/runner-common.sh
7756
source $BIN_DIR/runner-common.sh
7857

79-
logger "EC2-GHA: Starting userdata script"
80-
trap 'logger "EC2-GHA: Script failed at line $LINENO with exit code $?"' ERR
58+
logger "lambda-gha: Starting userdata script"
59+
trap 'logger "lambda-gha: Script failed at line $LINENO with exit code $?"' ERR
8160
trap 'terminate_instance "Setup script failed with error on line $LINENO"' ERR
8261
# Handle watchdog termination signal
8362
trap 'if [ -f $RUNNER_STATE_DIR/watchdog-terminate ]; then terminate_instance "No runners registered within timeout"; else terminate_instance "Script terminated"; fi' TERM
@@ -109,122 +88,28 @@ fi
10988
exec >> /var/log/runner-setup.log 2>&1
11089
log "Starting runner setup"
11190

112-
# Fetch instance metadata for labeling and logging
113-
INSTANCE_TYPE=$(get_metadata "instance-type")
114-
INSTANCE_ID=$(get_metadata "instance-id")
115-
REGION=$(get_metadata "placement/region")
116-
AZ=$(get_metadata "placement/availability-zone")
117-
log "Instance metadata: Type=${INSTANCE_TYPE} ID=${INSTANCE_ID} Region=${REGION} AZ=${AZ}"
91+
# Lambda instance info from environment (no metadata service)
92+
INSTANCE_ID="${LAMBDA_INSTANCE_ID:-unknown}"
93+
INSTANCE_IP="${LAMBDA_INSTANCE_IP:-unknown}"
94+
log "Lambda instance: ID=${INSTANCE_ID} IP=${INSTANCE_IP}"
11895

11996
# Set up maximum lifetime timeout - instance will terminate after this time regardless of job status
12097
MAX_LIFETIME_MINUTES=$max_instance_lifetime
12198
log "Setting up maximum lifetime timeout: ${MAX_LIFETIME_MINUTES} minutes"
122-
# Use ; instead of && so shutdown runs even if echo fails (e.g., disk full)
123-
# Try multiple shutdown methods as fallbacks
12499
nohup bash -c "
125100
sleep ${MAX_LIFETIME_MINUTES}m
126101
echo '[$(date)] Maximum lifetime reached' 2>/dev/null || true
127-
# Try normal shutdown
128-
shutdown -h now 2>/dev/null || {
129-
# If shutdown fails, try halt
130-
halt -f 2>/dev/null || {
131-
# If halt fails, try sysrq if available (Linux only)
132-
if [ -w /proc/sysrq-trigger ]; then
133-
echo 1 > /proc/sys/kernel/sysrq 2>/dev/null
134-
echo o > /proc/sysrq-trigger 2>/dev/null
135-
fi
136-
# Last resort: force immediate reboot
137-
reboot -f 2>/dev/null || true
138-
}
139-
}
102+
# Terminate via Lambda API
103+
curl -s -X POST -H 'Authorization: Bearer $LAMBDA_API_KEY' -H 'Content-Type: application/json' \
104+
-d '{\"instance_ids\": [\"$LAMBDA_INSTANCE_ID\"]}' \
105+
'https://cloud.lambdalabs.com/api/v1/instance-operations/terminate' || true
140106
" > /var/log/max-lifetime.log 2>&1 &
141107

142-
# Configure CloudWatch Logs if a log group is specified
143-
if [ "$cloudwatch_logs_group" != "" ]; then
144-
log "Installing CloudWatch agent"
145-
146-
# Detect architecture for CloudWatch agent
147-
ARCH=$(uname -m)
148-
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
149-
CW_ARCH="arm64"
150-
else
151-
CW_ARCH="amd64"
152-
fi
153-
154-
if command -v dpkg >/dev/null 2>&1; then
155-
wait_for_dpkg_lock
156-
wget -q https://s3.amazonaws.com/amazoncloudwatch-agent/ubuntu/${CW_ARCH}/latest/amazon-cloudwatch-agent.deb
157-
dpkg -i -E ./amazon-cloudwatch-agent.deb
158-
rm amazon-cloudwatch-agent.deb
159-
elif command -v rpm >/dev/null 2>&1; then
160-
# Note: For RPM-based systems, the path structure might differ
161-
wget -q https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/${CW_ARCH}/latest/amazon-cloudwatch-agent.rpm
162-
rpm -U ./amazon-cloudwatch-agent.rpm
163-
rm amazon-cloudwatch-agent.rpm
164-
fi
165-
166-
# Build CloudWatch config
167-
cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << EOF
168-
{
169-
"agent": {
170-
"run_as_user": "cwagent"
171-
},
172-
"logs": {
173-
"logs_collected": {
174-
"files": {
175-
"collect_list": [
176-
{ "file_path": "/var/log/runner-setup.log" , "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/runner-setup" , "timezone": "UTC" },
177-
{ "file_path": "/var/log/runner-debug.log" , "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/runner-debug" , "timezone": "UTC" },
178-
{ "file_path": "/tmp/job-started-hook.log" , "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/job-started" , "timezone": "UTC" },
179-
{ "file_path": "/tmp/job-completed-hook.log" , "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/job-completed", "timezone": "UTC" },
180-
{ "file_path": "/tmp/termination-check.log" , "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/termination" , "timezone": "UTC" },
181-
{ "file_path": "/tmp/runner-*-config.log" , "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/runner-config", "timezone": "UTC" },
182-
{ "file_path": "$homedir/_diag/Runner_**.log", "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/runner-diag" , "timezone": "UTC" },
183-
{ "file_path": "$homedir/_diag/Worker_**.log", "log_group_name": "$cloudwatch_logs_group", "log_stream_name": "{instance_id}/worker-diag" , "timezone": "UTC" }
184-
]
185-
}
186-
}
187-
}
188-
}
189-
EOF
190-
191-
if ! /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s; then
192-
log_error "Failed to start CloudWatch agent"
193-
terminate_instance "CloudWatch agent startup failed"
194-
fi
195-
196-
log "CloudWatch agent started successfully"
197-
fi
198-
199-
# Configure SSH access if public key provided (useful for debugging)
200-
if [ -n "$ssh_pubkey" ]; then
201-
log "Configuring SSH access"
202-
# Determine the default user based on the home directory owner
203-
DEFAULT_USER=$(stat -c "%U" "$homedir" 2>/dev/null || echo "root")
204-
mkdir -p "$homedir/.ssh"
205-
chmod 700 "$homedir/.ssh"
206-
echo "$ssh_pubkey" >> "$homedir/.ssh/authorized_keys"
207-
chmod 600 "$homedir/.ssh/authorized_keys"
208-
if [ "$DEFAULT_USER" != "root" ]; then
209-
chown -R "$DEFAULT_USER:$DEFAULT_USER" "$homedir/.ssh"
210-
fi
211-
log "SSH key added for user $DEFAULT_USER"
212-
fi
213-
214108
log "Working directory: $homedir"
215109
cd "$homedir"
216110

217-
# Run any pre-runner script provided by the user
218-
if [ -n "$script" ]; then
219-
echo "$script" > pre-runner-script.sh
220-
log "Running pre-runner script"
221-
source pre-runner-script.sh
222-
fi
223111
export RUNNER_ALLOW_RUNASROOT=1
224112

225-
# Number of runners to configure on this instance
226-
RUNNERS_PER_INSTANCE=$runners_per_instance
227-
228113
# Download GitHub Actions runner binary
229114
ARCH=$(uname -m)
230115
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
@@ -296,6 +181,9 @@ Type=oneshot
296181
Environment="RUNNER_GRACE_PERIOD=$runner_grace_period"
297182
Environment="RUNNER_INITIAL_GRACE_PERIOD=$runner_initial_grace_period"
298183
Environment="RUNNER_POLL_INTERVAL=$runner_poll_interval"
184+
Environment="LAMBDA_API_KEY=$LAMBDA_API_KEY"
185+
Environment="LAMBDA_INSTANCE_ID=$LAMBDA_INSTANCE_ID"
186+
Environment="LAMBDA_INSTANCE_IP=$LAMBDA_INSTANCE_IP"
299187
ExecStart=$BIN_DIR/check-runner-termination.sh
300188
EOF
301189

@@ -314,85 +202,39 @@ systemctl daemon-reload
314202
systemctl enable runner-termination-check.timer
315203
systemctl start runner-termination-check.timer
316204

317-
# Build metadata labels (these will be added to the runner labels)
318-
METADATA_LABELS=",${INSTANCE_ID},${INSTANCE_TYPE}"
319-
# Add instance name as a label if provided
320-
if [ -n "$instance_name" ]; then
321-
INSTANCE_NAME_LABEL=$(echo "$instance_name" | tr ' /' '-' | tr -cd '[:alnum:]-_#')
322-
METADATA_LABELS="${METADATA_LABELS},${INSTANCE_NAME_LABEL}"
323-
fi
205+
# Build metadata labels
206+
METADATA_LABELS=",${INSTANCE_ID}"
324207

325-
log "Setting up $RUNNERS_PER_INSTANCE runner(s)"
208+
log "Setting up runner"
326209

327-
# Export functions for subprocesses (variables already exported from runner-common.sh)
210+
# Export functions for subprocesses
328211
export -f configure_runner
329212
export -f log
330213
export -f log_error
331-
export -f get_metadata
332-
export -f flush_cloudwatch_logs
333214
export -f deregister_all_runners
334215
export -f debug_sleep_and_shutdown
216+
export -f terminate_lambda_instance
335217
export -f wait_for_dpkg_lock
336218

337-
# Parse space-delimited tokens and pipe-delimited labels
338-
IFS=' ' read -ra tokens <<< "$runner_tokens"
339-
IFS='|' read -ra labels <<< "$runner_labels"
340-
341-
num_runners=${#tokens[@]}
342-
log "Configuring $num_runners runner(s) in parallel"
343-
344-
# Start configuration for each runner in parallel
345-
pids=()
346-
for i in ${!tokens[@]}; do
347-
token=${tokens[$i]}
348-
label=${labels[$i]:-}
349-
if [ -z "$token" ]; then
350-
log_error "No token for runner $i"
351-
continue
352-
fi
353-
(
354-
# Override ERR trap in subshell to prevent global side effects
355-
trap 'echo "Subshell error on line $LINENO" >&2; exit 1' ERR
356-
configure_runner $i "$token" "${label}$METADATA_LABELS" "$homedir" "$repo" "$INSTANCE_ID" "$runner_grace_period" "$runner_initial_grace_period"
357-
echo $? > /tmp/runner-$i-status
358-
) &
359-
pids+=($!)
360-
log "Started configuration for runner $i (PID: ${pids[-1]})"
361-
done
362-
363-
# Wait for all background jobs to complete
364-
log "Waiting for all runner configurations to complete..."
365-
failed=0
366-
succeeded=0
367-
for i in ${!pids[@]}; do
368-
wait ${pids[$i]}
369-
if [ -f /tmp/runner-$i-status ]; then
370-
status=$(cat /tmp/runner-$i-status)
371-
rm -f /tmp/runner-$i-status
372-
if [ "$status" != "0" ]; then
373-
log_error "Runner $i configuration failed"
374-
failed=$((failed + 1))
375-
else
376-
succeeded=$((succeeded + 1))
377-
fi
378-
fi
379-
done
219+
# Single runner setup (Lambda doesn't need multi-runner complexity for now)
220+
token="$runner_token"
221+
labels="$runner_labels"
380222

381-
# Allow partial success - only terminate if ALL runners failed
382-
if [ $succeeded -eq 0 ] && [ $failed -gt 0 ]; then
383-
terminate_instance "All runners failed to register"
384-
elif [ $failed -gt 0 ]; then
385-
log "WARNING: $failed runner(s) failed, but $succeeded succeeded. Continuing with partial capacity."
223+
if [ -z "$token" ]; then
224+
log_error "No runner token provided"
225+
terminate_instance "No runner token"
386226
fi
387227

388-
if [ $succeeded -gt 0 ]; then
389-
log "$succeeded runner(s) registered and started successfully"
390-
touch $RUNNER_STATE_DIR/registered
391-
else
392-
log_error "No runners registered successfully"
393-
terminate_instance "No runners registered successfully"
228+
configure_runner 0 "$token" "${labels}${METADATA_LABELS}" "$homedir" "$repo" "$INSTANCE_ID" "$runner_grace_period" "$runner_initial_grace_period"
229+
result=$?
230+
231+
if [ $result -ne 0 ]; then
232+
terminate_instance "Runner failed to register"
394233
fi
395234

235+
log "Runner registered and started successfully"
236+
touch $RUNNER_STATE_DIR/registered
237+
396238
# Kill registration watchdog now that runners are registered
397239
if [ -f $RUNNER_STATE_DIR/watchdog.pid ]; then
398240
WATCHDOG_PID=$(cat $RUNNER_STATE_DIR/watchdog.pid)

0 commit comments

Comments
 (0)