From 94e815ac681ba5836ce07cda894d53d3dd900afd Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 22 Jul 2025 01:47:55 -0400 Subject: [PATCH 01/13] initial implementation --- .github/workflows/runner.yml | 243 +++++++++++++++++++++++++++ README.md | 311 ++++++++++++++++++++++++++++++++++- 2 files changed, 553 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/runner.yml diff --git a/.github/workflows/runner.yml b/.github/workflows/runner.yml new file mode 100644 index 0000000..d1b4acb --- /dev/null +++ b/.github/workflows/runner.yml @@ -0,0 +1,243 @@ +name: EC2 Runner +# +# Environment variables (set at org/repo level): +# EC2_IMAGE_ID - Default AMI ID +# EC2_INSTANCE_TYPE - Default instance type +# EC2_HOME_DIR - Default home directory +# EC2_KEY_NAME - Default SSH key pair name +# EC2_SECURITY_GROUP_ID - Default security group ID +# +# Priority: inputs > vars > defaults + +on: + workflow_call: + secrets: + AWS_ROLE: + description: "AWS role ARN to assume for EC2 operations" + required: true + GH_SA_TOKEN: + description: "GitHub token with permissions to manage self-hosted runners" + required: true + SSH_PUBKEY: + description: "SSH public key to add to authorized_keys (optional)" + required: false + inputs: + aws_image_id: + description: "AWS AMI ID to use" + required: false + type: string + default: "ami-00096836009b16a22" # Deep Learning OSS Nvidia Driver AMI GPU PyTorch + aws_instance_type: + description: "AWS instance type" + required: false + type: string + default: "g4dn.xlarge" + aws_home_dir: + description: "Home directory on the AWS instance" + required: false + type: string + default: "/home/ubuntu" + shutdown_poll_wait: + description: "Minutes to wait for runner setup before monitoring for termination" + required: false + type: number + default: 3 + aws_key_name: + description: "Name of the EC2 key pair to use for SSH access" + required: false + type: string + aws_security_group_id: + description: "AWS security group ID (defaults to 
account default if not specified)" + required: false + type: string + poll_interval: + description: "Interval in seconds to check if GHA process is still running" + required: false + type: number + default: 15 + ssh_pubkey: + description: "SSH public key to add to authorized_keys (optional input)" + required: false + type: string + outputs: + instance: + description: "Instance ID for runs-on" + value: ${{ jobs.start-ec2-runner.outputs.instance }} + +permissions: + id-token: write # Required for AWS OIDC + +jobs: + start-ec2-runner: + runs-on: ubuntu-latest + outputs: + instance: ${{ steps.aws-start.outputs.label }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_ROLE }} + role-session-name: github-actions-session + aws-region: us-east-1 + + - name: Create cloud runner + id: aws-start + uses: Open-Athena/start-aws-gha-runner@dev + with: + aws_image_id: ${{ inputs.aws_image_id || vars.EC2_IMAGE_ID || 'ami-00096836009b16a22' }} + aws_instance_type: ${{ inputs.aws_instance_type || vars.EC2_INSTANCE_TYPE || 'g4dn.xlarge' }} + aws_home_dir: ${{ inputs.aws_home_dir || vars.EC2_HOME_DIR || '/home/ubuntu' }} + aws_key_name: ${{ inputs.aws_key_name || vars.EC2_KEY_NAME }} + aws_security_group_id: ${{ inputs.aws_security_group_id || vars.EC2_SECURITY_GROUP_ID }} + aws_tags: | + [ + { "Key": "Name", "Value": "gha#${{ github.run_id }}" }, + { "Key": "workflow", "Value": "${{ github.workflow }}" }, + { "Key": "repository", "Value": "${{ github.repository }}" }, + { "Key": "gha_url", "Value": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" } + ] + # Uses AWS account defaults for subnet and security group + aws_userdata: | + # Instance is already configured to terminate on shutdown via launch parameter + + # Create log file for debugging + exec > >(tee -a /var/log/runner-setup.log) + exec 2>&1 + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting runner setup 
userdata script" + + # Configure SSH access + # Get the home directory for the default user + if id ec2-user &>/dev/null; then + USER_HOME="/home/ec2-user" + DEFAULT_USER="ec2-user" + elif id ubuntu &>/dev/null; then + USER_HOME="/home/ubuntu" + DEFAULT_USER="ubuntu" + else + USER_HOME="/root" + DEFAULT_USER="root" + fi + + # Create .ssh directory if it doesn't exist + mkdir -p "$USER_HOME/.ssh" + chmod 700 "$USER_HOME/.ssh" + touch "$USER_HOME/.ssh/authorized_keys" + chmod 600 "$USER_HOME/.ssh/authorized_keys" + + # Add default SSH public key from secret if provided + if [ -n "${{ secrets.SSH_PUBKEY }}" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Adding default SSH public key from secret" + echo "${{ secrets.SSH_PUBKEY }}" >> "$USER_HOME/.ssh/authorized_keys" + fi + + # Add input SSH public key if provided + if [ -n "${{ inputs.ssh_pubkey }}" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Adding SSH public key from workflow input" + echo "${{ inputs.ssh_pubkey }}" >> "$USER_HOME/.ssh/authorized_keys" + fi + + # Fix ownership + chown -R "$DEFAULT_USER:$DEFAULT_USER" "$USER_HOME/.ssh" + + if [ -s "$USER_HOME/.ssh/authorized_keys" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] SSH access configured with $(wc -l < $USER_HOME/.ssh/authorized_keys) key(s)" + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] No SSH keys configured" + fi + + # Create self-termination service + cat > /etc/systemd/system/github-runner-cleanup.service << 'EOF' + [Unit] + Description=GitHub Runner Self-Termination + After=network.target + + [Service] + Type=simple + ExecStart=/usr/local/bin/github-runner-cleanup.sh + Restart=always + RestartSec=60 + User=root + + [Install] + WantedBy=multi-user.target + EOF + + # Create cleanup script + cat > /usr/local/bin/github-runner-cleanup.sh << 'EOF' + #!/bin/bash + + # Log function + log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> /var/log/github-runner-cleanup.log + } + + log "GitHub Runner cleanup service started" + POLL_INTERVAL=${{ 
inputs.poll_interval }} + log "Poll interval set to $POLL_INTERVAL seconds" + + # Smart wait for runner to be installed + WAIT_MINUTES=${{ inputs.shutdown_poll_wait }} + MAX_WAIT_SECONDS=$((WAIT_MINUTES * 60)) + WAITED=0 + CHECK_INTERVAL=10 + + log "Waiting up to $WAIT_MINUTES minutes for runner setup (checking every ${CHECK_INTERVAL}s)..." + + # Check periodically during wait time if runner is already up + while [ $WAITED -lt $MAX_WAIT_SECONDS ]; do + if pgrep -f "Runner.Listener" > /dev/null 2>&1 || [ -f /var/run/github-runner-started ]; then + log "Runner process detected after $WAITED seconds, starting monitoring" + break + fi + + sleep $CHECK_INTERVAL + WAITED=$((WAITED + CHECK_INTERVAL)) + + # Log progress every 30 seconds + if [ $((WAITED % 30)) -eq 0 ] && [ $WAITED -lt $MAX_WAIT_SECONDS ]; then + log "Still waiting for runner... ${WAITED}s/${MAX_WAIT_SECONDS}s" + fi + done + + if [ $WAITED -ge $MAX_WAIT_SECONDS ]; then + log "Wait period completed (${MAX_WAIT_SECONDS}s), starting monitoring" + fi + + while true; do + # Check if runner process exists or is being configured + if pgrep -f "Runner.Listener" > /dev/null 2>&1 || pgrep -f "config.sh" > /dev/null 2>&1 || pgrep -f "run.sh" > /dev/null 2>&1; then + log "Runner process detected, continuing monitoring" + else + # Double-check after a delay + log "Runner process not detected, waiting $POLL_INTERVAL seconds before verification" + sleep $POLL_INTERVAL + + if ! pgrep -f "Runner.Listener" > /dev/null 2>&1 && ! pgrep -f "config.sh" > /dev/null 2>&1 && ! 
pgrep -f "run.sh" > /dev/null 2>&1; then + log "Runner confirmed stopped, initiating self-termination" + + # Shutdown instance (will terminate due to launch configuration) + shutdown -h now + + exit 0 + else + log "False alarm, runner still active" + fi + fi + + sleep $POLL_INTERVAL + done + EOF + + chmod +x /usr/local/bin/github-runner-cleanup.sh + + # Enable and start the service + systemctl daemon-reload + systemctl enable github-runner-cleanup.service + + # Create a marker file to indicate userdata completion + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Userdata script completed" >> /var/log/runner-setup.log + touch /var/run/runner-userdata-complete + + systemctl start github-runner-cleanup.service + env: + GH_PAT: ${{ secrets.GH_SA_TOKEN }} diff --git a/README.md b/README.md index 428d430..90f8f09 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,312 @@ # Open-Athena/ec2 +Auto-terminating EC2 GHA runner. -Placeholder: self-terminating EC2 runner for GitHub Actions with minimal boilerplate. +📖 **Demo**: See [ec2-runner-demo](https://github.com/Open-Athena/ec2-runner-demo) for a complete working example. + +## Features + +- 🚀 Starts EC2 instances on-demand for GitHub Actions jobs +- 🧹 Self-terminates when job completes (no separate stop job needed!) +- 🔑 Uses GitHub OIDC for AWS authentication (no long-lived credentials) +- ⚡ Single reusable workflow call +- 🎯 GPU-optimized AMI by default +- 💰 Defaults to [`g4dn.xlarge`](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) - the cheapest EC2 GPU instance we're aware of +- 🔒 Uses Open-Athena's fork of start-aws-gha-runner for `userdata` support + +## Setup + +### 1. Configure AWS IAM Role + +Your AWS role needs permissions to: +- Launch EC2 instances +- Pass IAM roles +- Manage GitHub Actions runners + +The role must trust GitHub's OIDC provider. 
See [GitHub's documentation](https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/configuring-openid-connect-in-amazon-web-services) for setup. + +### 2. Set Organization Secrets + +In your GitHub organization settings, create these secrets: +- `AWS_ROLE`: ARN of your AWS IAM role (e.g., `arn:aws:iam::123456789012:role/GitHubActionsRole`) +- `GH_SA_TOKEN`: GitHub token with permissions to manage self-hosted runners + +### 3. Create Approval Label (Optional) + +If you want to allow trusted external contributors to run GPU tests: +1. Go to your repository's Issues tab +2. Click Labels → New label +3. Create a label named `gpu` (or your custom name) +4. Maintainers can apply this label to PRs to authorize GPU runs + +## Minimal Example + +Just 16 lines to run GPU tests on EC2: + +```yaml +name: Minimal GPU EC2 runner test +on: + workflow_dispatch: +permissions: + id-token: write # Required for AWS OIDC authentication + contents: read # Required for actions/checkout +jobs: + ec2: + uses: Open-Athena/ec2/.github/workflows/runner.yml@main + secrets: inherit + gpu-test: + needs: ec2 + runs-on: ${{ needs.ec2.outputs.instance }} + steps: + - run: nvidia-smi # g4dn.xlarge! +``` + +That's it! The EC2 instance starts, runs your job, and automatically terminates when done. + +## Full Example + +For more control over instance configuration: + +```yaml +name: GPU Tests + +on: [push, pull_request] + +jobs: + ec2: + uses: Open-Athena/ec2/.github/workflows/runner.yml@main + secrets: inherit # Requires `AWS_ROLE`, `GH_SA_TOKEN` + with: + aws_instance_type: "g4dn.xlarge" # Optional, defaults to g4dn.xlarge + + test: + needs: ec2 + runs-on: ${{ needs.ec2.outputs.instance }} + steps: + - uses: actions/checkout@v4 + + - name: Verify GPU availability + run: nvidia-smi + + # No stop job needed - instance self-terminates! 
+``` + +## Configuration + +### Workflow Inputs + +| Input | Description | Default | +|-------|-------------|---------| +| `aws_instance_type` | EC2 instance type | [`g4dn.xlarge`](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) (cheapest GPU instance) | +| `aws_image_id` | AMI ID | `ami-00096836009b16a22` (Deep Learning AMI) | +| `aws_home_dir` | Home directory path | `/home/ubuntu` | +| `aws_key_name` | EC2 key pair name for SSH access | - | +| `aws_security_group_id` | Security group ID (must allow SSH if using) | - | +| `ssh_pubkey` | Additional SSH public key to authorize | - | +| `shutdown_poll_wait` | Minutes to wait for runner setup before monitoring | `3` | +| `poll_interval` | Seconds between runner process checks | `15` | + +### Environment Variables + +Set these as organization or repository variables for defaults: + +- `EC2_IMAGE_ID` - Default AMI ID +- `EC2_INSTANCE_TYPE` - Default instance type +- `EC2_KEY_NAME` - Default SSH key pair name +- `EC2_SECURITY_GROUP_ID` - Default security group ID + +Priority: workflow inputs > environment variables > hardcoded defaults + +## How It Works + +1. The workflow starts an EC2 instance using the specified configuration +2. A GitHub Actions runner is automatically installed and registered +3. Your job runs on the EC2 instance +4. A systemd service monitors the runner process +5. When the runner stops (job completes), the instance self-terminates automatically + +## Security + +### Fork Protection +This workflow includes built-in protection against unauthorized EC2 launches from forked repositories. EC2 instances will only start for: +- Direct pushes to your repository +- Manual workflow dispatches +- Pull requests from branches within your repository (not forks) +- Pull requests from forks that have the approval label (default: `gpu`) applied by a maintainer + +This prevents unauthorized external contributors from using your AWS resources through pull requests. 
+ +### Label-Based Approval for External Contributors +For trusted external contributors, maintainers can approve GPU testing by: +1. Adding the approval label (default: `gpu`) to a pull request +2. The workflow will then run with EC2 instances +3. The label should be removed after the run completes + +To customize the label name: +```yaml +jobs: + ec2: + uses: Open-Athena/ec2/.github/workflows/runner.yml@v1 + secrets: inherit + with: + approval_label: "run-benchmarks" # Custom label name +``` + +To disable label-based approval entirely: +```yaml +with: + approval_label: "" # Empty string disables external PR approval +``` + +This follows the same pattern used by scikit-learn and other major open source projects for expensive CI resources. + +## SSH Debugging + +### Initial Setup + +1. **Create an EC2 key pair**: +```bash +# Create key pair and save private key +aws ec2 create-key-pair --key-name gha --key-type ed25519 \ + | jq -r .KeyMaterial > ~/.ssh/gha.pem +chmod 600 ~/.ssh/gha.pem + +# View the public key +ssh-keygen -y -f ~/.ssh/gha.pem +``` + +2. **Create a security group with SSH access**: +```bash +# Create security group +SECURITY_GROUP_ID=$(aws ec2 create-security-group \ + --group-name gha-runner-ssh \ + --description "GitHub Actions runner with SSH access" \ + --query 'GroupId' --output text) + +# Allow SSH from anywhere (or restrict --cidr to your IP) +aws ec2 authorize-security-group-ingress \ + --group-id $SECURITY_GROUP_ID \ + --protocol tcp \ + --port 22 \ + --cidr 0.0.0.0/0 + +echo "Security Group ID: $SECURITY_GROUP_ID" +``` + +3. **Set organization/repository variables**: +```bash +# Set at org level +gh variable set EC2_KEY_NAME --org Open-Athena --body "gha" +gh variable set EC2_SECURITY_GROUP_ID --org Open-Athena --body "$SECURITY_GROUP_ID" + +# Or at repo level +gh variable set EC2_KEY_NAME --body "gha" +gh variable set EC2_SECURITY_GROUP_ID --body "$SECURITY_GROUP_ID" +``` + +### Connecting to Running Instances + +1. 
**Find the latest instance**: +```bash +# Get the most recent running instance with gha# name +INSTANCE_INFO=$(aws ec2 describe-instances \ + --filters "Name=tag:Name,Values=gha#*" \ + "Name=instance-state-name,Values=running" \ + --query 'sort_by(Reservations[].Instances[], &LaunchTime)[-1].[PublicDnsName,InstanceId,Tags[?Key==`Name`].Value|[0]]' \ + --output text) + +INSTANCE_DNS=$(echo "$INSTANCE_INFO" | cut -f1) +INSTANCE_ID=$(echo "$INSTANCE_INFO" | cut -f2) +INSTANCE_NAME=$(echo "$INSTANCE_INFO" | cut -f3) + +echo "Connecting to $INSTANCE_NAME ($INSTANCE_ID)" +ssh -i ~/.ssh/gha.pem ubuntu@$INSTANCE_DNS +``` + +2. **Optional: Configure SSH client for easier access**: +```bash +# Add to ~/.ssh/config +Host gha + User ubuntu + IdentitiesOnly yes + IdentityFile ~/.ssh/gha.pem + StrictHostKeyChecking no + ForwardAgent yes + +# Then connect with: +ssh gha -o HostName=$INSTANCE_DNS +``` + +### Key Debugging Locations + +Once connected to the instance: + +```bash +# 1. Self-termination daemon logs (most important!) +sudo tail -f /var/log/github-runner-cleanup.log +# Shows: startup time, poll interval, wait duration, process monitoring + +# 2. Daemon status +sudo systemctl status github-runner-cleanup.service +# Shows: if service is running, PID, memory usage + +# 3. Runner setup logs +sudo cat /var/log/runner-setup.log +# Shows: SSH key installation, userdata completion + +# 4. Full userdata execution log +sudo cat /var/log/cloud-init-output.log +# Shows: complete boot process, runner download/registration + +# 5. GitHub Actions runner logs +ls -la /home/ubuntu/_diag/ +cat /home/ubuntu/_diag/Runner_*.log +# Shows: runner connection status, job execution + +# 6. Check running processes +ps aux | grep -E "Runner|runner" +# Shows: if runner is actively processing a job + +# 7. 
View the cleanup script itself +sudo cat /usr/local/bin/github-runner-cleanup.sh +``` + +## Troubleshooting + +### Instance doesn't terminate + +If the instance doesn't terminate after the workflow completes: + +1. **Connect to the instance** using AWS Systems Manager: + ```bash + aws ssm start-session --target i-xxxxx --region us-east-1 + ``` + +2. **Check the cleanup service logs**: + ```bash + sudo journalctl -u github-runner-cleanup + sudo cat /var/log/github-runner-cleanup.log + ``` + +3. **Verify the service status**: + ```bash + sudo systemctl status github-runner-cleanup + ``` + +4. **Check termination behavior**: + ```bash + aws ec2 describe-instance-attribute \ + --instance-id $(ec2-metadata --instance-id | cut -d " " -f 2) \ + --attribute instanceInitiatedShutdownBehavior \ + --region us-east-1 + ``` + +**Note**: The cleanup service waits for the runner to start (default 3 minutes, configurable via `shutdown_poll_wait`). It checks every 10 seconds during this period and begins monitoring immediately when the runner is detected. Once monitoring begins, it checks every 15 seconds if the runner is still active. When the job completes and the runner stops, the instance typically terminates within 30-45 seconds. 
+ +### Runner not connecting +- Verify `GH_SA_TOKEN` has correct permissions +- Check security group allows outbound HTTPS +- Ensure the AMI is compatible with GitHub Actions runner + +## License + +MIT From 711957673822238ff63cf52c29de84b5b0660a7b Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 22 Jul 2025 22:45:36 -0400 Subject: [PATCH 02/13] update README --- README.md | 40 +--------------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/README.md b/README.md index 90f8f09..ae00a6a 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,6 @@ In your GitHub organization settings, create these secrets: - `AWS_ROLE`: ARN of your AWS IAM role (e.g., `arn:aws:iam::123456789012:role/GitHubActionsRole`) - `GH_SA_TOKEN`: GitHub token with permissions to manage self-hosted runners -### 3. Create Approval Label (Optional) - -If you want to allow trusted external contributors to run GPU tests: -1. Go to your repository's Issues tab -2. Click Labels → New label -3. Create a label named `gpu` (or your custom name) -4. Maintainers can apply this label to PRs to authorize GPU runs ## Minimal Example @@ -126,38 +119,7 @@ Priority: workflow inputs > environment variables > hardcoded defaults ## Security -### Fork Protection -This workflow includes built-in protection against unauthorized EC2 launches from forked repositories. EC2 instances will only start for: -- Direct pushes to your repository -- Manual workflow dispatches -- Pull requests from branches within your repository (not forks) -- Pull requests from forks that have the approval label (default: `gpu`) applied by a maintainer - -This prevents unauthorized external contributors from using your AWS resources through pull requests. - -### Label-Based Approval for External Contributors -For trusted external contributors, maintainers can approve GPU testing by: -1. Adding the approval label (default: `gpu`) to a pull request -2. The workflow will then run with EC2 instances -3. 
The label should be removed after the run completes - -To customize the label name: -```yaml -jobs: - ec2: - uses: Open-Athena/ec2/.github/workflows/runner.yml@v1 - secrets: inherit - with: - approval_label: "run-benchmarks" # Custom label name -``` - -To disable label-based approval entirely: -```yaml -with: - approval_label: "" # Empty string disables external PR approval -``` - -This follows the same pattern used by scikit-learn and other major open source projects for expensive CI resources. +This workflow uses GitHub OIDC for AWS authentication, eliminating the need for long-lived credentials. Ensure your AWS IAM role is properly configured to trust only your GitHub organization/repository. ## SSH Debugging From 373c2fd64d0f33c4bee65856bcdffcdf6f6626f9 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 24 Jul 2025 00:07:08 -0400 Subject: [PATCH 03/13] CR: `case`, `start_aws_gha_runner/start.py` xref --- .github/workflows/runner.yml | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/workflows/runner.yml b/.github/workflows/runner.yml index d1b4acb..3ce4774 100644 --- a/.github/workflows/runner.yml +++ b/.github/workflows/runner.yml @@ -107,16 +107,20 @@ jobs: # Configure SSH access # Get the home directory for the default user - if id ec2-user &>/dev/null; then - USER_HOME="/home/ec2-user" - DEFAULT_USER="ec2-user" - elif id ubuntu &>/dev/null; then - USER_HOME="/home/ubuntu" - DEFAULT_USER="ubuntu" - else - USER_HOME="/root" - DEFAULT_USER="root" - fi + case "$(id -un 2>/dev/null)" in + ec2-user) + USER_HOME="/home/ec2-user" + DEFAULT_USER="ec2-user" + ;; + ubuntu) + USER_HOME="/home/ubuntu" + DEFAULT_USER="ubuntu" + ;; + *) + USER_HOME="/root" + DEFAULT_USER="root" + ;; + esac # Create .ssh directory if it doesn't exist mkdir -p "$USER_HOME/.ssh" @@ -215,7 +219,7 @@ jobs: if ! pgrep -f "Runner.Listener" > /dev/null 2>&1 && ! pgrep -f "config.sh" > /dev/null 2>&1 && ! 
pgrep -f "run.sh" > /dev/null 2>&1; then log "Runner confirmed stopped, initiating self-termination" - # Shutdown instance (will terminate due to launch configuration) + # Shutdown instance (will terminate due to launch configuration; see start_aws_gha_runner/start.py) shutdown -h now exit 0 From b2cdbd8df57c6865c6bcd7d0ffc34d18f7877917 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 24 Jul 2025 00:12:47 -0400 Subject: [PATCH 04/13] update security section --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index ae00a6a..0040b4d 100644 --- a/README.md +++ b/README.md @@ -119,8 +119,21 @@ Priority: workflow inputs > environment variables > hardcoded defaults ## Security +### AWS Authentication This workflow uses GitHub OIDC for AWS authentication, eliminating the need for long-lived credentials. Ensure your AWS IAM role is properly configured to trust only your GitHub organization/repository. +### Public repos: set "Require approval for all external contributors", don't approve +If you want to use this action on a public repo, you need to protect against external contributors triggering workflows with access to your `AWS_ROLE` secret. + +Recommended settings / protocol: +1. In your repository settings, enable "Require approval for all external contributors" under "Actions" → "General". +2. **Never directly approve workflow runs** from external contributors. Instead: + - Create a temporary branch from the external contributor's commit. + - Trigger the workflow on this temporary branch, using a `workflow_dispatch` event (from the web UI or `gh` CLI) + - Delete the branch when done. + +This ensures external contributors never gain persistent workflow execution rights. 
+ ## SSH Debugging ### Initial Setup From 0813864a7a9af97beb704ee70f961646c54ec495 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 24 Jul 2025 14:27:13 -0400 Subject: [PATCH 05/13] convert AWS_ROLE, SSH_PUBKEY to inputs/variables (rather than secrets) --- .github/workflows/runner.yml | 39 +++++++++++++++++++++--------------- README.md | 15 ++++++++++---- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/.github/workflows/runner.yml b/.github/workflows/runner.yml index 3ce4774..2dc9c4d 100644 --- a/.github/workflows/runner.yml +++ b/.github/workflows/runner.yml @@ -1,27 +1,27 @@ name: EC2 Runner # -# Environment variables (set at org/repo level): +# Environment variables (can be set at org/repo level): +# AWS_ROLE - AWS role ARN for EC2 operations (required if not passed as input) # EC2_IMAGE_ID - Default AMI ID # EC2_INSTANCE_TYPE - Default instance type # EC2_HOME_DIR - Default home directory # EC2_KEY_NAME - Default SSH key pair name # EC2_SECURITY_GROUP_ID - Default security group ID +# SSH_PUBKEY - Default SSH public key to add to instances # # Priority: inputs > vars > defaults on: workflow_call: secrets: - AWS_ROLE: - description: "AWS role ARN to assume for EC2 operations" - required: true GH_SA_TOKEN: description: "GitHub token with permissions to manage self-hosted runners" required: true - SSH_PUBKEY: - description: "SSH public key to add to authorized_keys (optional)" - required: false inputs: + aws_role: + description: "AWS role ARN to assume for EC2 operations" + required: false + type: string aws_image_id: description: "AWS AMI ID to use" required: false @@ -43,7 +43,7 @@ on: type: number default: 3 aws_key_name: - description: "Name of the EC2 key pair to use for SSH access" + description: "Name of an EC2 key pair to use for SSH access (optional)" required: false type: string aws_security_group_id: @@ -56,7 +56,7 @@ on: type: number default: 15 ssh_pubkey: - description: "SSH public key to add to authorized_keys (optional 
input)" + description: "SSH public key to add to authorized_keys (optional)" required: false type: string outputs: @@ -73,16 +73,24 @@ jobs: outputs: instance: ${{ steps.aws-start.outputs.label }} steps: + - name: Check AWS_ROLE configuration + run: | + if [ -z "${{ inputs.aws_role || vars.AWS_ROLE }}" ]; then + echo "ERROR: AWS_ROLE must be provided either as an input or as a repository/organization variable" + echo "Please set 'aws_role' input or 'AWS_ROLE' variable" + exit 1 + fi + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: ${{ secrets.AWS_ROLE }} + role-to-assume: ${{ inputs.aws_role || vars.AWS_ROLE }} role-session-name: github-actions-session aws-region: us-east-1 - name: Create cloud runner id: aws-start - uses: Open-Athena/start-aws-gha-runner@dev + uses: Open-Athena/start-aws-gha-runner@v1 with: aws_image_id: ${{ inputs.aws_image_id || vars.EC2_IMAGE_ID || 'ami-00096836009b16a22' }} aws_instance_type: ${{ inputs.aws_instance_type || vars.EC2_INSTANCE_TYPE || 'g4dn.xlarge' }} @@ -96,7 +104,6 @@ jobs: { "Key": "repository", "Value": "${{ github.repository }}" }, { "Key": "gha_url", "Value": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" } ] - # Uses AWS account defaults for subnet and security group aws_userdata: | # Instance is already configured to terminate on shutdown via launch parameter @@ -128,10 +135,10 @@ jobs: touch "$USER_HOME/.ssh/authorized_keys" chmod 600 "$USER_HOME/.ssh/authorized_keys" - # Add default SSH public key from secret if provided - if [ -n "${{ secrets.SSH_PUBKEY }}" ]; then - echo "[$(date '+%Y-%m-%d %H:%M:%S')] Adding default SSH public key from secret" - echo "${{ secrets.SSH_PUBKEY }}" >> "$USER_HOME/.ssh/authorized_keys" + # Add default SSH public key from variable if provided + if [ -n "${{ vars.SSH_PUBKEY }}" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Adding default SSH public key from variable" + echo "${{ 
vars.SSH_PUBKEY }}" >> "$USER_HOME/.ssh/authorized_keys" fi # Add input SSH public key if provided diff --git a/README.md b/README.md index 0040b4d..badd4ff 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,16 @@ Your AWS role needs permissions to: The role must trust GitHub's OIDC provider. See [GitHub's documentation](https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/configuring-openid-connect-in-amazon-web-services) for setup. -### 2. Set Organization Secrets +### 2. Configure Secrets and Variables -In your GitHub organization settings, create these secrets: -- `AWS_ROLE`: ARN of your AWS IAM role (e.g., `arn:aws:iam::123456789012:role/GitHubActionsRole`) +#### Required Secret: - `GH_SA_TOKEN`: GitHub token with permissions to manage self-hosted runners +#### Required Variable (or pass as input): +- `AWS_ROLE`: ARN of your AWS IAM role (e.g., `arn:aws:iam::123456789012:role/GitHubActionsRole`) + +Set these in your GitHub organization or repository settings. 
+ ## Minimal Example @@ -67,7 +71,7 @@ on: [push, pull_request] jobs: ec2: uses: Open-Athena/ec2/.github/workflows/runner.yml@main - secrets: inherit # Requires `AWS_ROLE`, `GH_SA_TOKEN` + secrets: inherit # Requires `GH_SA_TOKEN` with: aws_instance_type: "g4dn.xlarge" # Optional, defaults to g4dn.xlarge @@ -89,6 +93,7 @@ jobs: | Input | Description | Default | |-------|-------------|---------| +| `aws_role` | AWS role ARN for EC2 operations | `vars.AWS_ROLE` | | `aws_instance_type` | EC2 instance type | [`g4dn.xlarge`](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) (cheapest GPU instance) | | `aws_image_id` | AMI ID | `ami-00096836009b16a22` (Deep Learning AMI) | | `aws_home_dir` | Home directory path | `/home/ubuntu` | @@ -102,10 +107,12 @@ jobs: Set these as organization or repository variables for defaults: +- `AWS_ROLE` - AWS role ARN for EC2 operations (required if not passed as input) - `EC2_IMAGE_ID` - Default AMI ID - `EC2_INSTANCE_TYPE` - Default instance type - `EC2_KEY_NAME` - Default SSH key pair name - `EC2_SECURITY_GROUP_ID` - Default security group ID +- `SSH_PUBKEY` - Default SSH public key to add to instances Priority: workflow inputs > environment variables > hardcoded defaults From ab0b087bf4704afc32e0fcac554830cd44369d3e Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 24 Jul 2025 14:30:01 -0400 Subject: [PATCH 06/13] Mention `vars` fallbacks in `inputs.description`s --- .github/workflows/runner.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/runner.yml b/.github/workflows/runner.yml index 2dc9c4d..5ff2456 100644 --- a/.github/workflows/runner.yml +++ b/.github/workflows/runner.yml @@ -19,21 +19,21 @@ on: required: true inputs: aws_role: - description: "AWS role ARN to assume for EC2 operations" + description: "AWS role ARN to assume for EC2 operations (falls back to vars.AWS_ROLE)" required: false type: string aws_image_id: - description: "AWS AMI ID to use" + description: "AWS 
AMI ID to use (falls back to vars.EC2_IMAGE_ID)" required: false type: string default: "ami-00096836009b16a22" # Deep Learning OSS Nvidia Driver AMI GPU PyTorch aws_instance_type: - description: "AWS instance type" + description: "AWS instance type (falls back to vars.EC2_INSTANCE_TYPE)" required: false type: string default: "g4dn.xlarge" aws_home_dir: - description: "Home directory on the AWS instance" + description: "Home directory on the AWS instance (falls back to vars.EC2_HOME_DIR)" required: false type: string default: "/home/ubuntu" @@ -43,11 +43,11 @@ on: type: number default: 3 aws_key_name: - description: "Name of an EC2 key pair to use for SSH access (optional)" + description: "Name of an EC2 key pair to use for SSH access (falls back to vars.EC2_KEY_NAME)" required: false type: string aws_security_group_id: - description: "AWS security group ID (defaults to account default if not specified)" + description: "AWS security group ID (falls back to vars.EC2_SECURITY_GROUP_ID)" required: false type: string poll_interval: @@ -56,7 +56,7 @@ on: type: number default: 15 ssh_pubkey: - description: "SSH public key to add to authorized_keys (optional)" + description: "SSH public key to add to authorized_keys (falls back to vars.SSH_PUBKEY)" required: false type: string outputs: From 16402660f1f9d7df3558ad26b73660623771c9ba Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 10:35:23 -0400 Subject: [PATCH 07/13] rm redundant default-input-value passing --- .github/workflows/runner.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/runner.yml b/.github/workflows/runner.yml index 5ff2456..5a8cd26 100644 --- a/.github/workflows/runner.yml +++ b/.github/workflows/runner.yml @@ -92,9 +92,9 @@ jobs: id: aws-start uses: Open-Athena/start-aws-gha-runner@v1 with: - aws_image_id: ${{ inputs.aws_image_id || vars.EC2_IMAGE_ID || 'ami-00096836009b16a22' }} - aws_instance_type: ${{ inputs.aws_instance_type || 
vars.EC2_INSTANCE_TYPE || 'g4dn.xlarge' }} - aws_home_dir: ${{ inputs.aws_home_dir || vars.EC2_HOME_DIR || '/home/ubuntu' }} + aws_image_id: ${{ inputs.aws_image_id || vars.EC2_IMAGE_ID }} + aws_instance_type: ${{ inputs.aws_instance_type || vars.EC2_INSTANCE_TYPE }} + aws_home_dir: ${{ inputs.aws_home_dir || vars.EC2_HOME_DIR }} aws_key_name: ${{ inputs.aws_key_name || vars.EC2_KEY_NAME }} aws_security_group_id: ${{ inputs.aws_security_group_id || vars.EC2_SECURITY_GROUP_ID }} aws_tags: | From afea2c163c1a872af14979b2c23effffb8efdc01 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 10:35:25 -0400 Subject: [PATCH 08/13] update readme --- README.md | 188 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 123 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index badd4ff..2346515 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,140 @@ # Open-Athena/ec2 Auto-terminating EC2 GHA runner. -📖 **Demo**: See [ec2-runner-demo](https://github.com/Open-Athena/ec2-runner-demo) for a complete working example. +Demo: [ec2-runner-demo](https://github.com/Open-Athena/ec2-runner-demo) (currently private; stay tuned)! ## Features - 🚀 Starts EC2 instances on-demand for GitHub Actions jobs -- 🧹 Self-terminates when job completes (no separate stop job needed!) +- 🧹 Self-terminates when job completes (no separate stop job needed) - 🔑 Uses GitHub OIDC for AWS authentication (no long-lived credentials) - ⚡ Single reusable workflow call -- 🎯 GPU-optimized AMI by default -- 💰 Defaults to [`g4dn.xlarge`](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) - the cheapest EC2 GPU instance we're aware of -- 🔒 Uses Open-Athena's fork of start-aws-gha-runner for `userdata` support +- 🎯 Defaults to [`g4dn.xlarge`] (cheapest EC2 GPU instance we're aware of) and `ami-00096836009b16a22` (`amazon/Deep Learning OSS Nvidia Driver AMI GPU PyTorch 2.4.1 (Ubuntu 22.04) 20250302`) ## Setup ### 1. 
Configure AWS IAM Role -Your AWS role needs permissions to: -- Launch EC2 instances -- Pass IAM roles -- Manage GitHub Actions runners +Here's an example [Pulumi] recipe to create the necessary AWS IAM role: + +
+Pulumi example + +Update `ORGS_REPOS_UPDATEME` below with the repos/orgs you want the role to be accessible from: + +```python +"""Create AWS_ROLE used to launch EC2 instances in GitHub Actions workflows.""" + +import pulumi +import pulumi_aws as aws + + +current = aws.get_caller_identity() + +# Create IAM OIDC provider for GitHub Actions +# fingerprint instructions: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_providers_create_oidc_verify-thumbprint.html +# +# ```console +# $ d=token.actions.githubusercontent.com +# $ openssl s_client -servername $d -showcerts -connect $d:443 -The role must trust GitHub's OIDC provider. See [GitHub's documentation](https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/configuring-openid-connect-in-amazon-web-services) for setup. +The role must be able to launch, tag, describe, and shutdown instances, and should be integrated with GitHub's OIDC provider (see [GitHub's documentation](https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/configuring-openid-connect-in-amazon-web-services) for more info). ### 2. Configure Secrets and Variables -#### Required Secret: -- `GH_SA_TOKEN`: GitHub token with permissions to manage self-hosted runners +#### Required Secret: `GH_SA_TOKEN` +This workflow requires a GitHub token with admin permissions to the repo it's run within, because the underlying `gha-runner` [calls `/actions/runners/registration-token`][call], whose [docs] state: + +> Authenticated users must have admin access to the repository to use this endpoint. 
+ +[call]: https://github.com/Open-Athena/gha-runner/blob/v1/src/gha_runner/gh.py#L144-L146 +[docs]: https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository #### Required Variable (or pass as input): -- `AWS_ROLE`: ARN of your AWS IAM role (e.g., `arn:aws:iam::123456789012:role/GitHubActionsRole`) +- `AWS_ROLE`: ARN of your AWS IAM role (e.g. `arn:aws:iam::123456789012:role/GitHubActionsRole`) -Set these in your GitHub organization or repository settings. +Set this in your GitHub organization or repository settings by e.g.: +```bash +gh variable set AWS_ROLE --body "arn:aws:iam::123456789012:role/GitHubActionsRole" +``` ## Minimal Example -Just 16 lines to run GPU tests on EC2: +Here's a minimal workflow that exercises a GPU on an EC2 instance: ```yaml name: Minimal GPU EC2 runner test @@ -57,35 +154,7 @@ jobs: - run: nvidia-smi # g4dn.xlarge! ``` -That's it! The EC2 instance starts, runs your job, and automatically terminates when done. - -## Full Example - -For more control over instance configuration: - -```yaml -name: GPU Tests - -on: [push, pull_request] - -jobs: - ec2: - uses: Open-Athena/ec2/.github/workflows/runner.yml@main - secrets: inherit # Requires `GH_SA_TOKEN` - with: - aws_instance_type: "g4dn.xlarge" # Optional, defaults to g4dn.xlarge - - test: - needs: ec2 - runs-on: ${{ needs.ec2.outputs.instance }} - steps: - - uses: actions/checkout@v4 - - - name: Verify GPU availability - run: nvidia-smi - - # No stop job needed - instance self-terminates! -``` +This launches an EC2 instance, runs the `gpu-test` job on it, and automatically terminates when finished. ## Configuration @@ -121,25 +190,18 @@ Priority: workflow inputs > environment variables > hardcoded defaults 1. The workflow starts an EC2 instance using the specified configuration 2. A GitHub Actions runner is automatically installed and registered 3. Your job runs on the EC2 instance -4. 
A systemd service monitors the runner process -5. When the runner stops (job completes), the instance self-terminates automatically +4. A systemd service monitors the runner process on the instance +5. When the runner stops (job completes), the instance self-terminates ## Security ### AWS Authentication -This workflow uses GitHub OIDC for AWS authentication, eliminating the need for long-lived credentials. Ensure your AWS IAM role is properly configured to trust only your GitHub organization/repository. +This workflow assumes GitHub OIDC for AWS authentication, eliminating the need for long-lived credentials. Ensure your AWS IAM role is properly configured to trust only your GitHub organization/repository. ### Public repos: set "Require approval for all external contributors", don't approve -If you want to use this action on a public repo, you need to protect against external contributors triggering workflows with access to your `AWS_ROLE` secret. - -Recommended settings / protocol: -1. In your repository settings, enable "Require approval for all external contributors" under "Actions" → "General". -2. **Never directly approve workflow runs** from external contributors. Instead: - - Create a temporary branch from the external contributor's commit. - - Trigger the workflow on this temporary branch, using a `workflow_dispatch` event (from the web UI or `gh` CLI) - - Delete the branch when done. +If you want to use this action on a public repo, you should restrict external contributors from triggering workflows that can access your `AWS_ROLE` secret. -This ensures external contributors never gain persistent workflow execution rights. +The best way to do this is to enable "Require approval for all external contributors" under "Actions" → "General". ## SSH Debugging @@ -185,6 +247,8 @@ gh variable set EC2_KEY_NAME --body "gha" gh variable set EC2_SECURITY_GROUP_ID --body "$SECURITY_GROUP_ID" ``` +These can also be passed as `inputs` to the workflow. 
+ ### Connecting to Running Instances 1. **Find the latest instance**: @@ -282,13 +346,7 @@ If the instance doesn't terminate after the workflow completes: --region us-east-1 ``` -**Note**: The cleanup service waits for the runner to start (default 3 minutes, configurable via `shutdown_poll_wait`). It checks every 10 seconds during this period and begins monitoring immediately when the runner is detected. Once monitoring begins, it checks every 15 seconds if the runner is still active. When the job completes and the runner stops, the instance typically terminates within 30-45 seconds. - -### Runner not connecting -- Verify `GH_SA_TOKEN` has correct permissions -- Check security group allows outbound HTTPS -- Ensure the AMI is compatible with GitHub Actions runner - -## License +**Note**: The cleanup service waits for the runner to start (default 3 minutes, configurable via `shutdown_poll_wait`). It checks every 10 seconds during this period and begins monitoring immediately when the runner is detected. Once monitoring begins, it checks every 15 seconds if the runner is still active. When 2 consecutive checks are missed, the instance shuts down. 
-MIT +[`g4dn.xlarge`]: https://instances.vantage.sh/aws/ec2/g4dn.xlarge +[Pulumi]: https://www.pulumi.com From 0c9c716eb9e1d2f584f8fc33b28a16704ff9a3e0 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 10:40:00 -0400 Subject: [PATCH 09/13] rename some `aws_` inputs to `ec2_` For more specificity, and better consistency with corresponding env var names --- .github/workflows/runner.yml | 30 +++++++++++++++--------------- README.md | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/runner.yml b/.github/workflows/runner.yml index 5a8cd26..5b09062 100644 --- a/.github/workflows/runner.yml +++ b/.github/workflows/runner.yml @@ -22,34 +22,34 @@ on: description: "AWS role ARN to assume for EC2 operations (falls back to vars.AWS_ROLE)" required: false type: string - aws_image_id: + ec2_image_id: description: "AWS AMI ID to use (falls back to vars.EC2_IMAGE_ID)" required: false type: string default: "ami-00096836009b16a22" # Deep Learning OSS Nvidia Driver AMI GPU PyTorch - aws_instance_type: + ec2_instance_type: description: "AWS instance type (falls back to vars.EC2_INSTANCE_TYPE)" required: false type: string default: "g4dn.xlarge" - aws_home_dir: + ec2_home_dir: description: "Home directory on the AWS instance (falls back to vars.EC2_HOME_DIR)" required: false type: string default: "/home/ubuntu" - shutdown_poll_wait: - description: "Minutes to wait for runner setup before monitoring for termination" - required: false - type: number - default: 3 - aws_key_name: + ec2_key_name: description: "Name of an EC2 key pair to use for SSH access (falls back to vars.EC2_KEY_NAME)" required: false type: string - aws_security_group_id: + ec2_security_group_id: description: "AWS security group ID (falls back to vars.EC2_SECURITY_GROUP_ID)" required: false type: string + shutdown_poll_wait: + description: "Minutes to wait for runner setup before monitoring for termination" + required: false + type: number + 
default: 3 poll_interval: description: "Interval in seconds to check if GHA process is still running" required: false @@ -92,11 +92,11 @@ jobs: id: aws-start uses: Open-Athena/start-aws-gha-runner@v1 with: - aws_image_id: ${{ inputs.aws_image_id || vars.EC2_IMAGE_ID }} - aws_instance_type: ${{ inputs.aws_instance_type || vars.EC2_INSTANCE_TYPE }} - aws_home_dir: ${{ inputs.aws_home_dir || vars.EC2_HOME_DIR }} - aws_key_name: ${{ inputs.aws_key_name || vars.EC2_KEY_NAME }} - aws_security_group_id: ${{ inputs.aws_security_group_id || vars.EC2_SECURITY_GROUP_ID }} + aws_image_id: ${{ inputs.ec2_image_id || vars.EC2_IMAGE_ID }} + aws_instance_type: ${{ inputs.ec2_instance_type || vars.EC2_INSTANCE_TYPE }} + aws_home_dir: ${{ inputs.ec2_home_dir || vars.EC2_HOME_DIR }} + aws_key_name: ${{ inputs.ec2_key_name || vars.EC2_KEY_NAME }} + aws_security_group_id: ${{ inputs.ec2_security_group_id || vars.EC2_SECURITY_GROUP_ID }} aws_tags: | [ { "Key": "Name", "Value": "gha#${{ github.run_id }}" }, diff --git a/README.md b/README.md index 2346515..76a82e3 100644 --- a/README.md +++ b/README.md @@ -160,17 +160,17 @@ This launches an EC2 instance, runs the `gpu-test` job on it, and automatically ### Workflow Inputs -| Input | Description | Default | -|-------|-------------|---------| -| `aws_role` | AWS role ARN for EC2 operations | `vars.AWS_ROLE` | -| `aws_instance_type` | EC2 instance type | [`g4dn.xlarge`](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) (cheapest GPU instance) | -| `aws_image_id` | AMI ID | `ami-00096836009b16a22` (Deep Learning AMI) | -| `aws_home_dir` | Home directory path | `/home/ubuntu` | -| `aws_key_name` | EC2 key pair name for SSH access | - | -| `aws_security_group_id` | Security group ID (must allow SSH if using) | - | -| `ssh_pubkey` | Additional SSH public key to authorize | - | -| `shutdown_poll_wait` | Minutes to wait for runner setup before monitoring | `3` | -| `poll_interval` | Seconds between runner process checks | `15` | +| Input | 
Description | Default | +|-------------------------|-------------|---------| +| `aws_role` | AWS role ARN for EC2 operations | `vars.AWS_ROLE` | +| `ec2_instance_type` | EC2 instance type | [`g4dn.xlarge`](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) (cheapest GPU instance) | +| `ec2_image_id` | AMI ID | `ami-00096836009b16a22` (Deep Learning AMI) | +| `ec2_home_dir` | Home directory path | `/home/ubuntu` | +| `ec2_key_name` | EC2 key pair name for SSH access | - | +| `ec2_security_group_id` | Security group ID (must allow SSH if using) | - | +| `ssh_pubkey` | Additional SSH public key to authorize | - | +| `shutdown_poll_wait` | Minutes to wait for runner setup before monitoring | `3` | +| `poll_interval` | Seconds between runner process checks | `15` | ### Environment Variables From a89d8fefba4af2e4ec09b974d8353be8143ba8b1 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 10:50:10 -0400 Subject: [PATCH 10/13] add MIT LICENSE --- LICENSE | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c2c92f3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright 2025 Open Athena + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. From 31b7aff6c997b1432851a249c7e4d5dd90bc30f2 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 11:18:19 -0400 Subject: [PATCH 11/13] rm README reference to `aws ssm` --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 76a82e3..6eeba15 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ If you want to use this action on a public repo, you should restrict external co The best way to do this is to enable "Require approval for all external contributors" under "Actions" → "General". -## SSH Debugging +## SSH Debugging ### Initial Setup @@ -322,10 +322,9 @@ sudo cat /usr/local/bin/github-runner-cleanup.sh If the instance doesn't terminate after the workflow completes: -1. **Connect to the instance** using AWS Systems Manager: - ```bash - aws ssm start-session --target i-xxxxx --region us-east-1 - ``` +1. **Connect to the instance** using SSH + + (This requires having provided an `ec2_key_name` (or `ssh_pubkey`) and `ec2_security_group_id`; see [SSH Debugging](#ssh-debugging) above). 2. 
**Check the cleanup service logs**: ```bash From 1d1dcc6100ffd0ebeab8830c8d70b7210331c230 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 11:41:36 -0400 Subject: [PATCH 12/13] CR: more README updates --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6eeba15..bae4842 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,7 @@ Set these as organization or repository variables for defaults: - `AWS_ROLE` - AWS role ARN for EC2 operations (required if not passed as input) - `EC2_IMAGE_ID` - Default AMI ID - `EC2_INSTANCE_TYPE` - Default instance type +- `EC2_HOME_DIR` - Default home directory path (should match the default AMI) - `EC2_KEY_NAME` - Default SSH key pair name - `EC2_SECURITY_GROUP_ID` - Default security group ID - `SSH_PUBKEY` - Default SSH public key to add to instances @@ -187,11 +188,11 @@ Priority: workflow inputs > environment variables > hardcoded defaults ## How It Works -1. The workflow starts an EC2 instance using the specified configuration -2. A GitHub Actions runner is automatically installed and registered +1. The workflow launches an EC2 instance using the specified configuration +2. A GitHub Actions runner is installed and registered 3. Your job runs on the EC2 instance -4. A systemd service monitors the runner process on the instance -5. When the runner stops (job completes), the instance self-terminates +4. A `systemd` service monitors the runner process on the instance (polling every 15 seconds, by default) +5. 
When the runner stops (job completes, detected as 2 consecutive runner-process "health checks" failing), the instance self-terminates ## Security From 2a9e0f201516c2bc324148f70bdbba5307b6a1fa Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 28 Jul 2025 12:01:49 -0400 Subject: [PATCH 13/13] move to Apache 2 License --- LICENSE | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 198 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index c2c92f3..261eeb9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,201 @@ -Copyright 2025 Open Athena + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + 1. Definitions. -THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.