Patch summary (text ahead of the first "diff --git" header is ignored by git apply and patch).

What the patch does:
  * declares the hashicorp/amazon Packer plugin (>= 1.2.0) in each template;
  * stops publishing the AMIs to everyone (ami_groups = ["all"] removed);
  * moves the base images from Ubuntu focal 20.04 to jammy 22.04;
  * installs the NVIDIA driver from apt (nvidia-driver-<version>-server) instead of the
    .run installer, and nvidia-container-toolkit instead of nvidia-container-runtime;
  * installs poetry and launchpadlib from apt instead of pip.

Review amendments folded into the added lines:
  * nvidia-drivers.sh: refresh the apt index before the first install;
  * nvidia-drivers.sh: use "curl -fsSL" for the repository list as well, so an HTTP error
    is not piped into /etc/apt/sources.list.d/;
  * nvidia-drivers.sh: "gpg --batch --yes" so an existing keyring is overwritten without a prompt;
  * nvidia-drivers.sh: quote the command/variable expansions in package names;
  * packer_build.yml: say what the "570" value means.
  Hunk line counts were recomputed for these amendments; the "index" post-image hashes
  of the two amended files are therefore stale.

NOTE(review): leading indentation and blank context lines were restored by hand; if a hunk
fails to match, apply with "git apply --ignore-whitespace" and confirm against the tree.

diff --git a/.github/workflows/packer_build.yml b/.github/workflows/packer_build.yml
index 7e8f3523..69e441ff 100644
--- a/.github/workflows/packer_build.yml
+++ b/.github/workflows/packer_build.yml
@@ -56,6 +56,5 @@ jobs:
         env:
           PKR_VAR_source_image_family: jenkins-stock-agent
           PKR_VAR_image_prefix: jenkins-gpu-agent
-          # This driver version below is the latest recommended given our requirements (T4 GPU, CUDA 12.2, Linux X64). Furthermore, the corresponding run script is compatible with Linux kernel version 5.15.
-          PKR_VAR_nvidia_driver_version: "535.183.01"
-          PKR_VAR_nvidia_driver_base_url: "https://us.download.nvidia.com/tesla"
+          # Driver branch only: nvidia-drivers.sh installs the apt package nvidia-driver-<version>-server.
+          PKR_VAR_nvidia_driver_version: "570"
diff --git a/packer/base-images/aws/base-image.pkr.hcl b/packer/base-images/aws/base-image.pkr.hcl
index 8aae2358..a64451ba 100644
--- a/packer/base-images/aws/base-image.pkr.hcl
+++ b/packer/base-images/aws/base-image.pkr.hcl
@@ -1,3 +1,12 @@
+packer {
+  required_plugins {
+    amazon = {
+      version = ">= 1.2.0"
+      source = "github.com/hashicorp/amazon"
+    }
+  }
+}
+
 variable "buildtime" {
   default = "{{isotime \"200601021504\"}}"
 }
@@ -7,7 +16,6 @@ source "amazon-ebs" "aws_base_image" {
   secret_key = var.aws_secret_key
   communicator = "ssh"
   ami_name = "${var.image_prefix}-x64-v${var.buildtime}"
-  ami_groups = ["all"]
   tags = {
     image_family = "${var.image_prefix}-x64"
   }
@@ -15,7 +23,7 @@ source "amazon-ebs" "aws_base_image" {
   source_ami_filter {
     filters = {
       virtualization-type = "hvm"
-      name = "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"
+      name = "ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"
       root-device-type = "ebs"
     }
     owners = ["099720109477"]
@@ -30,7 +38,6 @@ source "amazon-ebs" "aws_base_image_arm" {
   secret_key = var.aws_secret_key
   communicator = "ssh"
   ami_name = "${var.image_prefix}-arm-v${var.buildtime}"
-  ami_groups = ["all"]
   tags = {
     image_family = "${var.image_prefix}-arm"
   }
@@ -38,7 +45,7 @@ source "amazon-ebs" "aws_base_image_arm" {
   source_ami_filter {
     filters = {
       virtualization-type = "hvm"
-      name = "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-arm64-server-*"
+      name = "ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-arm64-server-*"
       root-device-type = "ebs"
     }
     owners = ["099720109477"]
diff --git a/packer/jenkins-agents/gpu/gpu-jenkins-agent.pkr.hcl b/packer/jenkins-agents/gpu/gpu-jenkins-agent.pkr.hcl
index 69521107..71b66aac 100644
--- a/packer/jenkins-agents/gpu/gpu-jenkins-agent.pkr.hcl
+++ b/packer/jenkins-agents/gpu/gpu-jenkins-agent.pkr.hcl
@@ -1,3 +1,12 @@
+packer {
+  required_plugins {
+    amazon = {
+      version = ">= 1.2.0"
+      source = "github.com/hashicorp/amazon"
+    }
+  }
+}
+
 variable "buildtime" {
   default = "{{isotime \"200601021504\"}}"
 }
@@ -7,7 +16,6 @@ source "amazon-ebs" "jenkins_gpu_image" {
   access_key = var.aws_access_key
   secret_key = var.aws_secret_key
   communicator = "ssh"
-  ami_groups = ["all"]
   ami_name = "${var.image_prefix}-x64-v${var.buildtime}"
   tags = {
     image_family = "${var.image_prefix}-x64"
@@ -43,8 +51,7 @@ build {

   provisioner "shell" {
     environment_vars = [
-      "NVIDIA_DRIVER_VERSION=${var.nvidia_driver_version}",
-      "BASE_URL=${var.nvidia_driver_base_url}"
+      "NVIDIA_DRIVER_VERSION=${var.nvidia_driver_version}"
     ]
     execute_command = "echo 'ubuntu' | {{.Vars}} sudo -S -E bash '{{.Path}}'"
     scripts = ["${path.root}/../../scripts/nvidia-drivers.sh", "${path.root}/vulkan-setup.sh"]
diff --git a/packer/jenkins-agents/gpu/variables.pkr.hcl b/packer/jenkins-agents/gpu/variables.pkr.hcl
index 020b4d61..79b064df 100644
--- a/packer/jenkins-agents/gpu/variables.pkr.hcl
+++ b/packer/jenkins-agents/gpu/variables.pkr.hcl
@@ -14,6 +14,3 @@ variable "source_image_family" {

 variable "nvidia_driver_version" {
 }
-
-variable "nvidia_driver_base_url"{
-}
diff --git a/packer/jenkins-agents/stock/configure-jenkins.sh b/packer/jenkins-agents/stock/configure-jenkins.sh
index 4d22fccf..627727c4 100644
--- a/packer/jenkins-agents/stock/configure-jenkins.sh
+++ b/packer/jenkins-agents/stock/configure-jenkins.sh
@@ -12,8 +12,7 @@ usermod -aG docker ubuntu

 #Build essentials like make and gcc
 apt install -y build-essential
-apt install -y python3 python3-pip
-pip install -q poetry launchpadlib
+apt install -y python3 python3-pip python3-poetry python3-launchpadlib
 apt install -y openjdk-17-jre-headless
 apt-mark hold openjdk-17-jre-headless
 #echo new cron into cron file
diff --git a/packer/jenkins-agents/stock/stock-jenkins.pkr.hcl b/packer/jenkins-agents/stock/stock-jenkins.pkr.hcl
index e97bb630..ac2dd2e4 100644
--- a/packer/jenkins-agents/stock/stock-jenkins.pkr.hcl
+++ b/packer/jenkins-agents/stock/stock-jenkins.pkr.hcl
@@ -1,3 +1,12 @@
+packer {
+  required_plugins {
+    amazon = {
+      version = ">= 1.2.0"
+      source = "github.com/hashicorp/amazon"
+    }
+  }
+}
+
 variable "buildtime" {
   default = "{{isotime \"200601021504\"}}"
 }
@@ -7,7 +16,6 @@ source "amazon-ebs" "jenkins_stock_image" {
   secret_key = var.aws_secret_key
   communicator = "ssh"
   ami_name = "${var.image_prefix}-x64-v${var.buildtime}"
-  ami_groups = ["all"]
   tags = {
     image_family = "${var.image_prefix}-x64"
   }
@@ -30,7 +38,6 @@ source "amazon-ebs" "jenkins_stock_image_arm" {
   secret_key = var.aws_secret_key
   communicator = "ssh"
   ami_name = "${var.image_prefix}-arm-v${var.buildtime}"
-  ami_groups = ["all"]
   tags = {
     image_family = "${var.image_prefix}-arm"
   }
diff --git a/packer/scripts/nvidia-drivers.sh b/packer/scripts/nvidia-drivers.sh
index ee960c5c..590c7fb9 100644
--- a/packer/scripts/nvidia-drivers.sh
+++ b/packer/scripts/nvidia-drivers.sh
@@ -1,18 +1,23 @@
 set -eux

-# install Nvidia drivers and dependencies
-curl -fSsl -O $BASE_URL/$NVIDIA_DRIVER_VERSION/NVIDIA-Linux-x86_64-$NVIDIA_DRIVER_VERSION.run
-sh NVIDIA-Linux-x86_64-$NVIDIA_DRIVER_VERSION.run -s
-#get the Nvidia container runtime
-APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
-curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | \
-  apt-key add -
-distribution=$(. /etc/os-release;echo ${ID}${VERSION_ID})
-curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | \
-  tee /etc/apt/sources.list.d/nvidia-container-runtime.list
-apt update -y
-apt install -y nvidia-container-runtime
+# Refresh the package index before the first install; the lists in the source image may be stale.
+apt update -y
+# Install NVIDIA driver via apt (pre-built kernel modules, compatible with HWE kernels)
+apt install -y "linux-headers-$(uname -r)"
+apt install -y "nvidia-driver-${NVIDIA_DRIVER_VERSION}-server"
+
+# install nvidia-container-toolkit (replaces deprecated nvidia-container-runtime)
+# --batch --yes: overwrite an existing keyring without prompting.
+# curl -f: fail on HTTP errors instead of piping an error page into the apt source list.
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
+  gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+apt update -y
+apt install -y nvidia-container-toolkit

 cat << EOF > /etc/docker/daemon.json
 {
   "log-driver": "json-file",
@@ -20,13 +25,8 @@ cat << EOF > /etc/docker/daemon.json
     "max-size": "100m",
     "max-file": "2"
   },
-  "default-runtime": "nvidia",
-  "runtimes": {
-    "nvidia": {
-      "path": "/usr/bin/nvidia-container-runtime",
-      "runtimeArgs": []
-    }
-  },
   "storage-driver": "overlay2"
 }
 EOF
+
+# Register the NVIDIA runtime with Docker and make it the default runtime.
+nvidia-ctk runtime configure --runtime=docker --set-as-default