@@ -13,38 +13,12 @@ if [ "$debug" = "true" ] || [ "$debug" = "True" ] || [ "$debug" = "trace" ] || [
1313 set -x
1414fi
1515
16- # Determine home directory early since it's needed by shared functions
17- if [ -z " $homedir " ] || [ " $homedir " = " AUTO" ]; then
18- # Try to find the default non-root user's home directory
19- for user in ubuntu ec2-user centos admin debian fedora alpine arch; do
20- if id " $user " & > /dev/null; then
21- homedir=" /home/$user "
22- echo " [$( date ' +%Y-%m-%d %H:%M:%S' ) ] Auto-detected homedir: $homedir " | tee -a /var/log/runner-setup.log
23- break
24- fi
25- done
26-
27- # Fallback if no standard user found
28- if [ -z " $homedir " ] || [ " $homedir " = " AUTO" ]; then
29- homedir=$( getent passwd | awk -F: ' $3 >= 1000 && $3 < 65534 && $6 ~ /^\/home\// {print $6}' | while read dir; do
30- if [ -d " $dir " ]; then
31- echo " $dir "
32- break
33- fi
34- done)
35- if [ -z " $homedir " ]; then
36- homedir=" /home/ec2-user" # Ultimate fallback
37- echo " [$( date ' +%Y-%m-%d %H:%M:%S' ) ] Using fallback homedir: $homedir " | tee -a /var/log/runner-setup.log
38- else
39- owner=$( stat -c " %U" " $homedir " 2> /dev/null || stat -f " %Su" " $homedir " 2> /dev/null)
40- echo " [$( date ' +%Y-%m-%d %H:%M:%S' ) ] Detected homedir: $homedir (owner: $owner )" | tee -a /var/log/runner-setup.log
41- fi
42- fi
43- else
44- echo " [$( date ' +%Y-%m-%d %H:%M:%S' ) ] Using specified homedir: $homedir " | tee -a /var/log/runner-setup.log
45- fi
16+ # Lambda instances are Ubuntu-based with ubuntu user
17+ homedir=" ${homedir:-/ home/ ubuntu} "
4618export homedir
4719
20+ echo " [$( date ' +%Y-%m-%d %H:%M:%S' ) ] Using homedir: $homedir " | tee -a /var/log/runner-setup.log
21+
4822# Set common paths
4923BIN_DIR=/usr/local/bin
5024RUNNER_STATE_DIR=/var/run/github-runner
@@ -55,18 +29,23 @@ echo "[$(date '+%Y-%m-%d %H:%M:%S')] Fetching shared functions from GitHub (SHA:
5529FUNCTIONS_URL=" https://raw.githubusercontent.com/Open-Athena/lambda-gha/${action_sha} /src/lambda_gha/templates/shared-functions.sh"
5630if ! curl -sSL " $FUNCTIONS_URL " -o /tmp/shared-functions.sh && ! wget -q " $FUNCTIONS_URL " -O /tmp/shared-functions.sh; then
5731 echo " [$( date ' +%Y-%m-%d %H:%M:%S' ) ] ERROR: Failed to download shared functions" | tee -a /var/log/runner-setup.log
58- shutdown -h now
32+ # Terminate via Lambda API
33+ curl -s -X POST -H " Authorization: Bearer $LAMBDA_API_KEY " -H " Content-Type: application/json" \
34+ -d " {\" instance_ids\" : [\" $LAMBDA_INSTANCE_ID \" ]}" \
35+ " https://cloud.lambdalabs.com/api/v1/instance-operations/terminate" || true
5936 exit 1
6037fi
6138
6239# Write shared functions that will be used by multiple scripts
6340cat > $BIN_DIR /runner-common.sh << EOSF
6441# Auto-generated shared functions and variables
65- # Set homedir for scripts that source this file
6642homedir="$homedir "
6743debug="$debug "
6844RUNNER_STATE_DIR="$RUNNER_STATE_DIR "
69- export homedir debug RUNNER_STATE_DIR
45+ LAMBDA_API_KEY="$LAMBDA_API_KEY "
46+ LAMBDA_INSTANCE_ID="$LAMBDA_INSTANCE_ID "
47+ LAMBDA_INSTANCE_IP="$LAMBDA_INSTANCE_IP "
48+ export homedir debug RUNNER_STATE_DIR LAMBDA_API_KEY LAMBDA_INSTANCE_ID LAMBDA_INSTANCE_IP
7049
7150EOSF
7251
@@ -76,8 +55,8 @@ cat /tmp/shared-functions.sh >> $BIN_DIR/runner-common.sh
7655chmod +x $BIN_DIR /runner-common.sh
7756source $BIN_DIR /runner-common.sh
7857
79- logger " EC2-GHA : Starting userdata script"
80- trap ' logger "EC2-GHA : Script failed at line $LINENO with exit code $?"' ERR
58+ logger " lambda-gha : Starting userdata script"
59+ trap ' logger "lambda-gha : Script failed at line $LINENO with exit code $?"' ERR
8160trap ' terminate_instance "Setup script failed with error on line $LINENO"' ERR
8261# Handle watchdog termination signal
8362trap ' if [ -f $RUNNER_STATE_DIR/watchdog-terminate ]; then terminate_instance "No runners registered within timeout"; else terminate_instance "Script terminated"; fi' TERM
10988exec >> /var/log/runner-setup.log 2>&1
11089log " Starting runner setup"
11190
112- # Fetch instance metadata for labeling and logging
113- INSTANCE_TYPE=$( get_metadata " instance-type" )
114- INSTANCE_ID=$( get_metadata " instance-id" )
115- REGION=$( get_metadata " placement/region" )
116- AZ=$( get_metadata " placement/availability-zone" )
117- log " Instance metadata: Type=${INSTANCE_TYPE} ID=${INSTANCE_ID} Region=${REGION} AZ=${AZ} "
91+ # Lambda instance info from environment (no metadata service)
92+ INSTANCE_ID=" ${LAMBDA_INSTANCE_ID:- unknown} "
93+ INSTANCE_IP=" ${LAMBDA_INSTANCE_IP:- unknown} "
94+ log " Lambda instance: ID=${INSTANCE_ID} IP=${INSTANCE_IP} "
11895
11996# Set up maximum lifetime timeout - instance will terminate after this time regardless of job status
12097MAX_LIFETIME_MINUTES=$max_instance_lifetime
12198log " Setting up maximum lifetime timeout: ${MAX_LIFETIME_MINUTES} minutes"
122- # Use ; instead of && so shutdown runs even if echo fails (e.g., disk full)
123- # Try multiple shutdown methods as fallbacks
12499nohup bash -c "
125100 sleep ${MAX_LIFETIME_MINUTES} m
126101 echo '[$( date) ] Maximum lifetime reached' 2>/dev/null || true
127- # Try normal shutdown
128- shutdown -h now 2>/dev/null || {
129- # If shutdown fails, try halt
130- halt -f 2>/dev/null || {
131- # If halt fails, try sysrq if available (Linux only)
132- if [ -w /proc/sysrq-trigger ]; then
133- echo 1 > /proc/sys/kernel/sysrq 2>/dev/null
134- echo o > /proc/sysrq-trigger 2>/dev/null
135- fi
136- # Last resort: force immediate reboot
137- reboot -f 2>/dev/null || true
138- }
139- }
102+ # Terminate via Lambda API
103+ curl -s -X POST -H 'Authorization: Bearer $LAMBDA_API_KEY ' -H 'Content-Type: application/json' \
104+ -d '{\" instance_ids\" : [\" $LAMBDA_INSTANCE_ID \" ]}' \
105+ 'https://cloud.lambdalabs.com/api/v1/instance-operations/terminate' || true
140106" > /var/log/max-lifetime.log 2>&1 &
141107
142- # Configure CloudWatch Logs if a log group is specified
143- if [ " $cloudwatch_logs_group " != " " ]; then
144- log " Installing CloudWatch agent"
145-
146- # Detect architecture for CloudWatch agent
147- ARCH=$( uname -m)
148- if [ " $ARCH " = " aarch64" ] || [ " $ARCH " = " arm64" ]; then
149- CW_ARCH=" arm64"
150- else
151- CW_ARCH=" amd64"
152- fi
153-
154- if command -v dpkg > /dev/null 2>&1 ; then
155- wait_for_dpkg_lock
156- wget -q https://s3.amazonaws.com/amazoncloudwatch-agent/ubuntu/${CW_ARCH} /latest/amazon-cloudwatch-agent.deb
157- dpkg -i -E ./amazon-cloudwatch-agent.deb
158- rm amazon-cloudwatch-agent.deb
159- elif command -v rpm > /dev/null 2>&1 ; then
160- # Note: For RPM-based systems, the path structure might differ
161- wget -q https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/${CW_ARCH} /latest/amazon-cloudwatch-agent.rpm
162- rpm -U ./amazon-cloudwatch-agent.rpm
163- rm amazon-cloudwatch-agent.rpm
164- fi
165-
166- # Build CloudWatch config
167- cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << EOF
168- {
169- "agent": {
170- "run_as_user": "cwagent"
171- },
172- "logs": {
173- "logs_collected": {
174- "files": {
175- "collect_list": [
176- { "file_path": "/var/log/runner-setup.log" , "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/runner-setup" , "timezone": "UTC" },
177- { "file_path": "/var/log/runner-debug.log" , "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/runner-debug" , "timezone": "UTC" },
178- { "file_path": "/tmp/job-started-hook.log" , "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/job-started" , "timezone": "UTC" },
179- { "file_path": "/tmp/job-completed-hook.log" , "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/job-completed", "timezone": "UTC" },
180- { "file_path": "/tmp/termination-check.log" , "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/termination" , "timezone": "UTC" },
181- { "file_path": "/tmp/runner-*-config.log" , "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/runner-config", "timezone": "UTC" },
182- { "file_path": "$homedir /_diag/Runner_**.log", "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/runner-diag" , "timezone": "UTC" },
183- { "file_path": "$homedir /_diag/Worker_**.log", "log_group_name": "$cloudwatch_logs_group ", "log_stream_name": "{instance_id}/worker-diag" , "timezone": "UTC" }
184- ]
185- }
186- }
187- }
188- }
189- EOF
190-
191- if ! /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s; then
192- log_error " Failed to start CloudWatch agent"
193- terminate_instance " CloudWatch agent startup failed"
194- fi
195-
196- log " CloudWatch agent started successfully"
197- fi
198-
199- # Configure SSH access if public key provided (useful for debugging)
200- if [ -n " $ssh_pubkey " ]; then
201- log " Configuring SSH access"
202- # Determine the default user based on the home directory owner
203- DEFAULT_USER=$( stat -c " %U" " $homedir " 2> /dev/null || echo " root" )
204- mkdir -p " $homedir /.ssh"
205- chmod 700 " $homedir /.ssh"
206- echo " $ssh_pubkey " >> " $homedir /.ssh/authorized_keys"
207- chmod 600 " $homedir /.ssh/authorized_keys"
208- if [ " $DEFAULT_USER " != " root" ]; then
209- chown -R " $DEFAULT_USER :$DEFAULT_USER " " $homedir /.ssh"
210- fi
211- log " SSH key added for user $DEFAULT_USER "
212- fi
213-
214108log " Working directory: $homedir "
215109cd " $homedir "
216110
217- # Run any pre-runner script provided by the user
218- if [ -n " $script " ]; then
219- echo " $script " > pre-runner-script.sh
220- log " Running pre-runner script"
221- source pre-runner-script.sh
222- fi
223111export RUNNER_ALLOW_RUNASROOT=1
224112
225- # Number of runners to configure on this instance
226- RUNNERS_PER_INSTANCE=$runners_per_instance
227-
228113# Download GitHub Actions runner binary
229114ARCH=$( uname -m)
230115if [ " $ARCH " = " aarch64" ] || [ " $ARCH " = " arm64" ]; then
@@ -296,6 +181,9 @@ Type=oneshot
296181Environment="RUNNER_GRACE_PERIOD=$runner_grace_period "
297182Environment="RUNNER_INITIAL_GRACE_PERIOD=$runner_initial_grace_period "
298183Environment="RUNNER_POLL_INTERVAL=$runner_poll_interval "
184+ Environment="LAMBDA_API_KEY=$LAMBDA_API_KEY "
185+ Environment="LAMBDA_INSTANCE_ID=$LAMBDA_INSTANCE_ID "
186+ Environment="LAMBDA_INSTANCE_IP=$LAMBDA_INSTANCE_IP "
299187ExecStart=$BIN_DIR /check-runner-termination.sh
300188EOF
301189
@@ -314,85 +202,39 @@ systemctl daemon-reload
314202systemctl enable runner-termination-check.timer
315203systemctl start runner-termination-check.timer
316204
317- # Build metadata labels (these will be added to the runner labels)
318- METADATA_LABELS=" ,${INSTANCE_ID} ,${INSTANCE_TYPE} "
319- # Add instance name as a label if provided
320- if [ -n " $instance_name " ]; then
321- INSTANCE_NAME_LABEL=$( echo " $instance_name " | tr ' /' ' -' | tr -cd ' [:alnum:]-_#' )
322- METADATA_LABELS=" ${METADATA_LABELS} ,${INSTANCE_NAME_LABEL} "
323- fi
205+ # Build metadata labels
206+ METADATA_LABELS=" ,${INSTANCE_ID} "
324207
325- log " Setting up $RUNNERS_PER_INSTANCE runner(s) "
208+ log " Setting up runner"
326209
327- # Export functions for subprocesses (variables already exported from runner-common.sh)
210+ # Export functions for subprocesses
328211export -f configure_runner
329212export -f log
330213export -f log_error
331- export -f get_metadata
332- export -f flush_cloudwatch_logs
333214export -f deregister_all_runners
334215export -f debug_sleep_and_shutdown
216+ export -f terminate_lambda_instance
335217export -f wait_for_dpkg_lock
336218
337- # Parse space-delimited tokens and pipe-delimited labels
338- IFS=' ' read -ra tokens <<< " $runner_tokens"
339- IFS=' |' read -ra labels <<< " $runner_labels"
340-
341- num_runners=${# tokens[@]}
342- log " Configuring $num_runners runner(s) in parallel"
343-
344- # Start configuration for each runner in parallel
345- pids=()
346- for i in ${! tokens[@]} ; do
347- token=${tokens[$i]}
348- label=${labels[$i]:- }
349- if [ -z " $token " ]; then
350- log_error " No token for runner $i "
351- continue
352- fi
353- (
354- # Override ERR trap in subshell to prevent global side effects
355- trap ' echo "Subshell error on line $LINENO" >&2; exit 1' ERR
356- configure_runner $i " $token " " ${label} $METADATA_LABELS " " $homedir " " $repo " " $INSTANCE_ID " " $runner_grace_period " " $runner_initial_grace_period "
357- echo $? > /tmp/runner-$i -status
358- ) &
359- pids+=($! )
360- log " Started configuration for runner $i (PID: ${pids[-1]} )"
361- done
362-
363- # Wait for all background jobs to complete
364- log " Waiting for all runner configurations to complete..."
365- failed=0
366- succeeded=0
367- for i in ${! pids[@]} ; do
368- wait ${pids[$i]}
369- if [ -f /tmp/runner-$i -status ]; then
370- status=$( cat /tmp/runner-$i -status)
371- rm -f /tmp/runner-$i -status
372- if [ " $status " != " 0" ]; then
373- log_error " Runner $i configuration failed"
374- failed=$(( failed + 1 ))
375- else
376- succeeded=$(( succeeded + 1 ))
377- fi
378- fi
379- done
219+ # Single runner setup (Lambda doesn't need multi-runner complexity for now)
220+ token=" $runner_token "
221+ labels=" $runner_labels "
380222
381- # Allow partial success - only terminate if ALL runners failed
382- if [ $succeeded -eq 0 ] && [ $failed -gt 0 ]; then
383- terminate_instance " All runners failed to register"
384- elif [ $failed -gt 0 ]; then
385- log " WARNING: $failed runner(s) failed, but $succeeded succeeded. Continuing with partial capacity."
223+ if [ -z " $token " ]; then
224+ log_error " No runner token provided"
225+ terminate_instance " No runner token"
386226fi
387227
388- if [ $succeeded -gt 0 ]; then
389- log " $succeeded runner(s) registered and started successfully"
390- touch $RUNNER_STATE_DIR /registered
391- else
392- log_error " No runners registered successfully"
393- terminate_instance " No runners registered successfully"
228+ configure_runner 0 " $token " " ${labels}${METADATA_LABELS} " " $homedir " " $repo " " $INSTANCE_ID " " $runner_grace_period " " $runner_initial_grace_period "
229+ result=$?
230+
231+ if [ $result -ne 0 ]; then
232+ terminate_instance " Runner failed to register"
394233fi
395234
235+ log " Runner registered and started successfully"
236+ touch $RUNNER_STATE_DIR /registered
237+
396238# Kill registration watchdog now that runners are registered
397239if [ -f $RUNNER_STATE_DIR /watchdog.pid ]; then
398240 WATCHDOG_PID=$( cat $RUNNER_STATE_DIR /watchdog.pid)
0 commit comments