diff --git a/deploy-aws b/deploy-aws
index ca2c145..d8b4ac0 100755
--- a/deploy-aws
+++ b/deploy-aws
@@ -17,6 +17,7 @@
 # endregion
 
 import os
+import re
 import sys
 
 import click
@@ -100,6 +101,52 @@ class DeployAWSCommand(DeployCommand):
 
         return value
 
+    @staticmethod
+    def availability_zone_callback(ctx, param, value):
+        """
+        Called after parsing --availability-zone option.
+
+        Accepts a full AZ name ("us-west-2b"), a bare zone letter ("b"),
+        or an empty value (auto-select the first zone offering the instance type).
+
+        Returns the normalized (stripped, lower-cased) zone name, or "" for
+        auto-select. Raises click.BadParameter when a bare zone letter cannot
+        be resolved because the region is not known yet, or when the zone
+        does not belong to the selected region.
+        """
+
+        value = (value or "").strip().lower()
+
+        if not value:
+            return ""
+
+        # ctx.params only holds options click has already processed; options
+        # typed on the command line are processed in the order given, so the
+        # region may be missing here (e.g. "--az b --region us-west-2")
+        region = (ctx.params.get("region") or "").strip().lower()
+
+        # accept a bare zone suffix (e.g. "b") and prefix it with the region
+        if re.fullmatch(r"[a-z]", value):
+            if not region:
+                # never pass a bare letter on to Terraform as a zone name
+                raise click.BadParameter(
+                    colorize_error(
+                        f'Cannot expand zone letter "{value}": specify --region '
+                        "before --availability-zone or use the full zone name."
+                    )
+                )
+            value = f"{region}{value}"
+
+        # sanity check: the zone must be the region name plus a zone suffix
+        if region and (value == region or not value.startswith(region)):
+            raise click.BadParameter(
+                colorize_error(
+                    f'Availability zone "{value}" is not a zone in region "{region}".'
+                )
+            )
+
+        return value
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -144,6 +191,27 @@ class DeployAWSCommand(DeployCommand):
             ),
         )
 
+        # --availability-zone
+        self.params.insert(
+            # insert before --ingress-cidrs option (and after --region, so the
+            # region is prompted for before the availability zone)
+            self.param_index("ingress_cidrs"),
+            click.core.Option(
+                ("--availability-zone", "--az"),
+                default="",
+                show_default=True,
+                callback=DeployAWSCommand.availability_zone_callback,
+                prompt=colorize_prompt(
+                    "* Availability Zone (e.g. us-west-2b, or just \"b\"; "
+                    "leave empty to auto-select. Use this to route around "
+                    '"InsufficientInstanceCapacity" errors in a given zone)'
+                ),
+                help="AWS availability zone for the instance, e.g. 'us-west-2b'."
+                " Leave empty to auto-select the first zone that offers the"
+                " instance type. Useful to avoid per-zone GPU capacity shortages.",
+            ),
+        )
+
         # defaults
 
         self.params[self.param_index("from_image")].default = config[
@@ -179,6 +247,7 @@ class AWSDeployer(Deployer):
         self.create_tfvars(
             {
                 "region": self.params["region"],
+                "availability_zone": self.params.get("availability_zone", ""),
             }
         )
 
diff --git a/src/terraform/aws/isaac-workstation/main.tf b/src/terraform/aws/isaac-workstation/main.tf
index 4a98857..11bac00 100644
--- a/src/terraform/aws/isaac-workstation/main.tf
+++ b/src/terraform/aws/isaac-workstation/main.tf
@@ -9,12 +9,23 @@ data "aws_ec2_instance_type_offerings" "zones" {
   location_type = "availability-zone"
 }
 
+locals {
+  # availability zones that offer the requested instance type, sorted
+  offered_zones = sort(data.aws_ec2_instance_type_offerings.zones.locations)
+
+  # use the explicitly requested availability zone when provided, otherwise
+  # fall back to the first zone that offers the instance type. an explicit AZ
+  # lets the user route around per-zone GPU capacity shortages
+  # (InsufficientInstanceCapacity), which vary over time and by zone.
+  availability_zone = var.availability_zone != "" ? var.availability_zone : try(local.offered_zones[0], "not-available")
+}
+
 # create a subnet for the isaac-workstation instance
 resource "aws_subnet" "subnet" {
   # get a /24 block from vpc cidr
   cidr_block = cidrsubnet(var.vpc.cidr_block, 8, 3)
 
-  availability_zone = try(sort(data.aws_ec2_instance_type_offerings.zones.locations)[0], "not-available")
+  availability_zone = local.availability_zone
   vpc_id = var.vpc.id
   map_public_ip_on_launch = true
 
diff --git a/src/terraform/aws/isaac-workstation/variables.tf b/src/terraform/aws/isaac-workstation/variables.tf
index 7a21b03..2b99ca5 100644
--- a/src/terraform/aws/isaac-workstation/variables.tf
+++ b/src/terraform/aws/isaac-workstation/variables.tf
@@ -14,6 +14,13 @@ variable "region" {
   type = string
 }
 
+# optional availability zone (e.g. "us-west-2b"); empty = auto-select the
+# first zone that offers the instance type
+variable "availability_zone" {
+  default = ""
+  type = string
+}
+
 variable "from_image" {
   default = true
   type = bool
diff --git a/src/terraform/aws/main.tf b/src/terraform/aws/main.tf
index c5ae0ef..ccd9bbd 100644
--- a/src/terraform/aws/main.tf
+++ b/src/terraform/aws/main.tf
@@ -42,6 +42,7 @@ module "isaac_workstation" {
   instance_type = var.isaac_workstation_instance_type
   from_image = var.from_image
   region = var.region
+  availability_zone = var.availability_zone
   ssh_port = var.ssh_port
   deployment_name = var.deployment_name
   ingress_cidrs = var.ingress_cidrs
diff --git a/src/terraform/aws/variables.tf b/src/terraform/aws/variables.tf
index 8503608..3fd3373 100644
--- a/src/terraform/aws/variables.tf
+++ b/src/terraform/aws/variables.tf
@@ -13,6 +13,12 @@ variable "region" {
   type = string
 }
 
+# optional availability zone (e.g. "us-west-2b"); empty = auto-select
+variable "availability_zone" {
+  default = ""
+  type = string
+}
+
 variable "from_image" {
   default = false
   type = bool