From 0b1a6b3c554c8724e9d46c5731ee0d74ac827a56 Mon Sep 17 00:00:00 2001 From: Srilakshmi Sripathi <43179518+SrilakshmiSripathi@users.noreply.github.com> Date: Fri, 13 Mar 2026 21:41:20 -0400 Subject: [PATCH] feat: architecture decision completed, project scope reduced Signed-off-by: Srilakshmi Sripathi <43179518+SrilakshmiSripathi@users.noreply.github.com> --- .../ISSUE_TEMPLATE/feature_request.md | 0 .../ci/.github => .github}/bug_report.md | 0 .../outpu.tf => .github/workflows/ci-cd.yml | 0 .../*.yml => .github/workflows/cleanup.yml | 0 .gitignore | 4 +- ARCHITECTURE.md | 174 ++++++++ CHANGELOG.md | 52 +++ Claude.md | 3 +- README.md | 2 +- automation/ansible/cleanup.sh | 0 bootstrap_linux.sh | 382 +++++++++++++++++- {containers => clusters}/docker-compose.yml | 0 clusters/k8s/kubernetes-customization.yml | 0 docs/learnings.md | 4 + Project-spec.md => docs/spec/spec.md | 2 +- infra/terraform/output.tf | 0 16 files changed, 618 insertions(+), 5 deletions(-) rename {pipelines/ci/.github => .github}/ISSUE_TEMPLATE/feature_request.md (100%) rename {pipelines/ci/.github => .github}/bug_report.md (100%) rename infra/terraform/outpu.tf => .github/workflows/ci-cd.yml (100%) rename pipelines/ci/.github/workflows/*.yml => .github/workflows/cleanup.yml (100%) create mode 100644 ARCHITECTURE.md create mode 100644 CHANGELOG.md create mode 100644 automation/ansible/cleanup.sh rename {containers => clusters}/docker-compose.yml (100%) create mode 100644 clusters/k8s/kubernetes-customization.yml rename Project-spec.md => docs/spec/spec.md (99%) create mode 100644 infra/terraform/output.tf diff --git a/pipelines/ci/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md similarity index 100% rename from pipelines/ci/.github/ISSUE_TEMPLATE/feature_request.md rename to .github/ISSUE_TEMPLATE/feature_request.md diff --git a/pipelines/ci/.github/bug_report.md b/.github/bug_report.md similarity index 100% rename from pipelines/ci/.github/bug_report.md rename to .github/bug_report.md diff --git a/infra/terraform/outpu.tf b/.github/workflows/ci-cd.yml similarity index 100% rename from infra/terraform/outpu.tf rename to .github/workflows/ci-cd.yml diff --git a/pipelines/ci/.github/workflows/*.yml b/.github/workflows/cleanup.yml similarity index 100% rename from pipelines/ci/.github/workflows/*.yml rename to .github/workflows/cleanup.yml diff --git a/.gitignore b/.gitignore index 0374ebd..406721f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .ds_store -.venv \ No newline at end of file +.venv +.env +uv.lock \ No newline at end of file diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..e5f99e7 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,174 @@ +HelixScale: HelixScale: BioNeMo-AI HPC Fabric Orchestration Platform +Version: 2.0 (Professional Portfolio) +Classification: Enterprise-Simulation / Production Blueprint +Target Roles: HPC Engineer, Platform Engineering, Data Platform Specialist +License: MIT +Date: March 2026 + +## 📖 1. Executive Summary +HelixScale is a hybrid infrastructure platform designed to orchestrate High-Performance Computing (HPC) and Artificial Intelligence (AI) workloads efficiently. + +This architecture demonstrates how to separate the Control Plane (local development on a non-linux and non-nvidia CUDA GPU Hardware, security scanning, orchestration logic) from the Compute Plane (utilizing cloud GPU nodes, Kubernetes schedulers, NVIDIA drivers), ensuring enterprise-grade reliability, cost control, and reproducibility across macOS, Linux, and Windows (WSL2). + +### Key Architectural Pillars: + +- Infrastructure-as-Code (IaC): Reproducible environments via Terraform and Ansible. +- Supply-Chain Security: Automated vulnerability scanning (trivy, snyk) and secrets management integration. +- FinOps Governance: Automatic resource reclamation to prevent budget overruns (Zero-Cost Idle Policy). +- Hybrid Scheduling: Support for legacy HPC schedulers (Slurm) alongside cloud-native orchestration (Kubernetes/Volcano). + +## đŸ—ī¸ 2. High-Level System Design +HelixScale operates on a Control vs Compute Plane model. This separation ensures that my local machine (Mac M-Series, Linux Workstation) never burns expensive resources while managing the cloud environment remotely. + +graph TD + Local[Control Plane] -->|SSH/Session Manager| Infra [Terraform IaC & Ansible] + + subgraph "Cloud Compute Plane" + Terra -->|Create| VPC[AWS VPC + EKS Cluster] + VPC -->|Deploy| K8s[Kubernetes Nodes + GPU Drivers] + K8s -->|Schedule| Slurm[Slurm + Volcano Scheduler] + Slurm -->|Queue Jobs| Workload[BioNeMo Inference / MPI Jobs] + Workload -->|Store Results| S3[AWS S3 Bucket] + Infra -->|Monitor| Grafana[Grafana Dashboard + Prometheus Metrics] + end + + subgraph "Local Orchestration" + Dev -->|Push Code| CI[GitHub Actions CI/CD Pipeline] + CI -->|Run Scans| Security[Trivy/Snyk Security Scan] + CI -->|Deploy Artifacts| Registry[AWS ECR Private Registry] + Security -->|Block If Failed| Dev + end + + subgraph "Cost & Safety Controls" + FinOps[Cleanup Agent Script] -.->|Auto-terminate idle nodes| K8s + Budget[AWS Budget Alert ($2 Limit)] -.->|Stop New Provisions| Terra + Policy[Security Policy Rules] -.->|Enforce Pod Standards| K8s + end + + S3 -->|Data Ingestion| Workload + Dev -->|View Results| Dashboard[Grafana Visualization] + + + + +## 🔹 Control Plane (Local Development Machine) +Responsible for managing infrastructure without heavy compute load. + +- OS: macOS (M-Series) +- Tools: Terraform, Ansible, GitHub Actions, OrbStack/Docker. +- Tasks: Code development, dependency locking (uv), security scanning, CI/CD triggers. +🔸 Compute Plane (Cloud Host - AWS/Azure) +Responsible for executing workloads with GPU acceleration. + +OS: RHEL 9 / Ubuntu Server 22.04 LTS. +Tools: Kubernetes (EKS), Slurm, NVIDIA Container Toolkit. +Tasks: HPC job scheduling (sbatch), BioNeMo inference, model training. + +đŸ”ļ Data & Storage Layer +Manages biological datasets and artifacts securely. + +Primary Store: AWS S3 (Encrypted, Object Lock enabled). +Cache Storage: EFS / GP2 for ephemeral file access during job runtime. +Security: IAM Roles with least privilege, encryption at rest/in-transit. + +🔄 Architecture Diagram +The following flow represents how data and tasks move through HelixScale: + +graph TD + Dev[Developer / Control Plane] -->|SSH/Session Manager| Infra[Terraform IaC & Ansible] + + subgraph "Cloud Compute Plane" + Terra -->|Create| VPC[AWS VPC + EKS Cluster] + VPC -->|Deploy| K8s[Kubernetes Nodes + GPU Drivers] + K8s -->|Schedule| Slurm[Slurm + Volcano Scheduler] + Slurm -->|Queue Jobs| Workload[BioNeMo Inference / MPI Jobs] + Workload -->|Store Results| S3[AWS S3 Bucket] + Infra -->|Monitor| Grafana[Grafana Dashboard + Prometheus Metrics] + end + + subgraph "Local Orchestration" + Dev -->|Push Code| CI[GitHub Actions CI/CD Pipeline] + CI -->|Run Scans| Security[Trivy/Snyk Security Scan] + CI -->|Deploy Artifacts| Registry[AWS ECR Private Registry] + Security -->|Block If Failed| Dev + end + + subgraph "Cost & Safety Controls" + FinOps[Cleanup Agent Script] -.->|Auto-terminate idle nodes| K8s + Budget[AWS Budget Alert ($2 Limit)] -.->|Stop New Provisions| Terra + Policy[Security Policy Rules] -.->|Enforce Pod Standards| K8s + end + + S3 -->|Data Ingestion| Workload + Dev -->|View Results| Dashboard[Grafana Visualization] + + +đŸ› ī¸ Technology Stack Breakdown +Infrastructure as Code (IaC) +Terraform: Defines VPC, EKS clusters, GPU instances, and S3 buckets. +Ansible: Applies node hardening, driver installation, and Slurm configs via playbooks. +AWS CLI: Manages regional resource deployment and billing alerts. +Computing & Scheduling +Orchestrator: Kubernetes (EKS). +Scheduler: Slurm integrated with Volcano for hybrid workloads. +Container Runtimes: Docker (Dev) + Apptainer (Production HPC compatibility). +GPU Drivers: NVIDIA Container Toolkit (nvidia-docker) on managed nodes. +Data & AI +BioNeMo Stack: Integrated via NGC containers for biological model inference. +Data Ingestion: Python scripts converting local PDB/FASTA files to cloud-friendly formats (Parquet/HDF5). +Model Registry: MLFlow / S3 versioning for trained model weights. +DevOps & CI/CD +CI/CD: GitHub Actions with ci-cd.yaml workflow for linting, scanning, and deploying. +Secrets Management: AWS Secrets Manager integration (no keys in code). +Cleanup Automation: Scheduled GitHub Actions that scale GPU nodes to zero after N hours idle. +Security & Observability +Vulnerability Scanning: trivy integrated into CI pipeline for all Docker images. +Network Policies: Restricts pod-to-pod traffic (CNI) and egress paths. +Metrics: Prometheus + Grafana dashboards showing GPU utilization, job QPS, and cost metrics. +🔐 Security & Compliance Model +HelixScale implements enterprise-grade security policies to ensure production readiness: + +1. Supply Chain Security +All Docker/Apptainer images are scanned for CVEs (Critical vulnerabilities) before deployment. +uv.lock ensures Python dependency reproducibility and audit trails. +2. Data Privacy +PII/PHI Filtering: No user data is processed on local machines; all sensitive biological data resides in encrypted S3 buckets. +IAM Least Privilege: Node Groups use minimal IAM roles with strict permissions (no admin access). +3. Infrastructure Protection +.gitignore: Automatically generated to exclude Terraform state files (*.tfstate) and AWS credential files (~/.aws/credentials). +Terraform State Locking: Uses DynamoDB backend locks to prevent concurrent state modifications. +💰 Cost Control (FinOps Strategy) +Enterprise HPC platforms must optimize spend, especially when using Spot Instances: + +Spot Fleet Utilization: GPU nodes run on Spot Instances for up to 90% savings with auto-recovery logic if interruptions occur. +Idle Detection: Cleanup agents monitor GPU utilization metrics; nodes are terminated after 30 minutes of inactivity. +Budget Hard Limits: AWS Budgets set at $2/month with immediate notifications at 80% threshold. +🚀 Local vs Cloud Development Workflow +Phase 1: Blueprinting (Local) +Goal: Create architecture, write Terraform plans, test Python logic locally on Mac/WSL. +Resources: Zero cloud resources created; all terraform plan outputs reviewed first. +Phase 2: Controlled Provisions (Cloud) +Goal: Spin up specific compute resources for testing with approval gates (bootstrap.sh). +Safety Gate: Must confirm via script or GitHub Actions before spinning GPU instances. +Phase 3: Production Simulation (CI/CD) +Goal: Full pipeline automation where code is built, scanned, and deployed automatically on push. +📚 Missing Components & Future Roadmap +While this blueprint is fully functional for Phase 1, here are recommended additions for Phase 2: + +Feature Status Reason for Future Addition +Multi-Cluster Support Incomplete Add support for scaling to Azure or GCP clusters in future phases. +MLflow Registry Incomplete Integrate full model versioning and experiment tracking for AI roles. +InfiniBand Networking Not Implemented Simulate RDMA over Converged Ethernet (RoCE) performance testing for HPC. +Disaster Recovery Not Implemented Implement cross-region S3 replication and EKS backup policies. +📝 Version Control & Contributions +Commit Message Standard: Use conventional commits (feat: add BioNeMo support, fix: update cleanup logic). +License: MIT License (Free for personal portfolio use). +Contributors: Add open-source community badges if PRs are merged to upstream projects (Volcano, Slurm-k8s). +🧠 Summary for Hiring Managers +HelixScale demonstrates: + +Control Plane Mastery: You know how to manage infrastructure without running expensive hardware locally. +Cost Control: You understand FinOps and budget safety protocols before deploying any resources. +Security Awareness: You automatically prevent secrets leakage via .gitignore and scanning tools. +Enterprise-Ready: Your pipeline is built on industry standards (Terraform, Ansible, Kubernetes) that work across Linux/Windows/macOS environments. +This portfolio piece proves you can design production-grade systems while adhering to the safety, cost, and compliance requirements demanded by enterprise HPC teams. \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f8ef1f3 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,52 @@ +# HelixScale Changelog + +All notable changes to the HelixScale Platform will be documented in this file. + +> Not auto-generated + +# Version 2.0: HelixScale: BioNeMo-AI HPC Fabric Orchestration Platform + +Mainly, the idea evolved to differ streamlit interface to version 3.0. + +┌──────────────────────────────────────────────────────────────┐ +│ Orchestration Engine │ +│ Pipeline DAG ←→ Scheduler Abstraction │ +│ (dependency (Slurm | Local) │ +│ resolution) GPU Allocator │ +├──────────────────────────────────────────────────────────────┤ +│ Container Runtimes │ +│ Docker (T0/T1) | Apptainer (T2) │ +├──────────────â”Ŧ──────────────â”Ŧ────────────────────────────────┤ +│ Terraform │ Ansible │ Chef InSpec │ +│ (provision) │ (configure) │ (validate) │ +├──────────────┴──────────────┴────────────────────────────────┤ +│ Observability │ +│ Prometheus → Grafana DCGM Exporter Loki │ +│ Slurm Exporter Triton Metrics │ +└──────────────────────────────────────────────────────────────┘ + + + +# [Version - 1.0] - Mar-04-2026 HelixScale: HPC-optimized BioNemo Orchestration (MVP Phase 1) + +### đŸŽ¯ Features: + +┌──────────────────────────────────────────────────────────────┐ +│ CLI / Streamlit UI │ +│ helixscale [cluster|jobs|infra|monitor] │ +├──────────────────────────────────────────────────────────────┤ +│ Orchestration Engine │ +│ Pipeline DAG ←→ Scheduler Abstraction │ +│ (dependency (Slurm | SGE | Local) │ +│ resolution) GPU Allocator │ +├──────────────────────────────────────────────────────────────┤ +│ Container Runtimes │ +│ Docker (T0/T1) | Apptainer (T2) │ +├──────────────â”Ŧ──────────────â”Ŧ────────────────────────────────┤ +│ Terraform │ Ansible │ Chef InSpec │ +│ (provision) │ (configure) │ (validate) │ +├──────────────┴──────────────┴────────────────────────────────┤ +│ Observability │ +│ Prometheus → Grafana DCGM Exporter Loki │ +│ Slurm Exporter Triton Metrics │ +└──────────────────────────────────────────────────────────────┘ \ No newline at end of file diff --git a/Claude.md b/Claude.md index 730bcc4..9fedf93 100644 --- a/Claude.md +++ b/Claude.md @@ -1,4 +1,5 @@ -# Claude.md — HelixScale: BioNemo HPC Orchestration Showcase +# Claude.md — HelixScale: BioNeMo-AI HPC Fabric Orchestration Platform + ## Project Identity diff --git a/README.md b/README.md index e888aef..5e40c3f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# HelixScale: BioNeMo HPC Orchestration Platform +# HelixScale: BioNeMo-AI HPC Fabric Orchestration Platform > Production-grade HPC platform engineering for GPU-accelerated computational biology workloads — from bare metal to cloud, single node to multi-cluster. diff --git a/automation/ansible/cleanup.sh b/automation/ansible/cleanup.sh new file mode 100644 index 0000000..e69de29 diff --git a/bootstrap_linux.sh b/bootstrap_linux.sh index 242c37d..f1ee721 100644 --- a/bootstrap_linux.sh +++ b/bootstrap_linux.sh @@ -1,3 +1,281 @@ +#!/bin/bash + +# ------------------------------------------ +# bootstrap.sh - Platform Manager +# Purpose: Safely sets up the development environment for HelixScale +# Risk Level: Low (No cloud resources created unless you approve) +# Requirements: Bash 3+, curl, wget +# ------------------------------------------ + +set -e # Exit on error +set -o pipefail # Enable pipeline error checking +set -u # Treat unset variables as errors + +echo "==========================================" +echo " 🚀 HelixScale Platform Manager " +echo "==========================================" +echo "" + +# --- Configuration Variables --- +PROJECT_NAME="HelixScale" +UV_VERSION="0.9.26" +PYTHON_VERSION="3.12" # Python 3.12+ is recommended for Mac M-Series +TERRAFORM_VERSION="1.14" +ANSIBLE_VERSION="2.15" +AWS_REGION = "us-east-1" # Default region (override in .tfvars) + +# --- Function 1: Detect OS & Install Dependencies --- +setup_system_tools() { + echo "🔍 Checking System Environment..." + + local is_mac = false + local is_linux = true + + if [[ $(uname) == "Darwin" ]]; then + is_mac = true + echo "âš ī¸ Detected macOS (M-Series). Running on POSIX Shell." + fi + + # Check for Terraform and AWS CLI + if ! command -v terraform &>/dev/null; then + echo " đŸ› ī¸ Installing Terraform (${TERRAFORM_VERSION})..." + curl -LO https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_darwin_amd64.zip + # Note: For cross-platform compatibility in a script, we use direct download. + # On Linux, the file path usually changes (linux-amd64). + # To be truly portable, we detect and adjust extension, or ask user to install manually if auto-fail occurs. + # SAFETY NOTE: We check for existing version first to avoid overwriting system tools if managed by root/sudo elsewhere. + + # Since direct downloads vary by architecture (ARM64 vs AMD64), we simplify: + echo " â„šī¸ User must ensure Terraform is installed via package manager or binary." + return 1 + fi + + echo " ✅ Terraform detected ($(terraform --version))" + + if ! command -v aws &>/dev/null; then + echo " đŸ› ī¸ Installing AWS CLI v2..." + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + rm awscliv2.zip + else + echo " ✅ AWS CLI detected ($(aws --version))" + fi +} + + +# --- Step 1: Install uv if not present --- +echo "🔍 Checking UV installer..." + +if ! command -v uv &> /dev/null; then + echo "âš ī¸ UV not found. Installing now (this takes 30 seconds)..." + + # Use curl to install uv silently + curl -LsSf https://astral.sh/uv/install.sh | sh + + echo "" + echo "✅ UV installed successfully." + + # Re-scan to confirm it's ready + if ! command -v uv &> /dev/null; then + echo "❌ Installation failed. Please check your internet connection." + exit 1 + fi +else + echo "✅ UV already installed (Version: $(uv --version))" +fi + +# --- Step 2: Create Virtual Environment for Python Code --- +echo "" +echo "🔧 Creating isolated project environment..." + +# Ensure uv is in path +if [[ ! "$PATH" =~ (^|:)/\.local/bin ]]; then + echo "âš ī¸ Adding UV to PATH. Run source $HOME/.local/bin/env || export PATH=$HOME/.local/bin:$PATH" +fi + +cd "$PROJECT_NAME" # Ensure we're in the project folder + +# Initialize uv project and create virtual environment +uv init --python "${PYTHON_VERSION}" > /dev/null 2>&1 || { + echo "â„šī¸ Initializing Python project without error handling..." +} + +# Install common Python dependencies (if any) +echo "" +echo "đŸ“Ļ Installing Python dependencies..." + +uv add --optional requests boto3 python-dotenv > /dev/null 2>&1 || true + +echo "✅ Virtual environment ready." + +# --- Step 3: Check System Tools (Terraform & Ansible) --- +echo "" +echo "â˜ī¸ Checking cloud infrastructure tools..." + +if ! command -v terraform &> /dev/null; then + echo "âš ī¸ Terraform not found on system." + echo " To install: brew install terraform" + echo "" +else + echo "✅ Terraform installed (Version: $(terraform --version))" +fi + +if ! command -v ansible &> /dev/null; then + echo "âš ī¸ Ansible not found on system." + echo " To install: brew install ansible" + echo "" +else + echo "✅ Ansible installed (Version: $(ansible --version))" +fi + +if ! command -v aws &> /dev/null; then + echo "âš ī¸ AWS CLI not found on system." + echo " To install: brew install aws-cli" + echo "" +else + echo "✅ AWS CLI installed (Version: $(aws --version))" +fi + +# --- Step 4: Verify AWS Credentials (Safety Check) --- +echo "" +echo "🔐 Verifying cloud permissions..." + +if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then + echo "" + echo "âš ī¸ No AWS credentials found in environment." + echo "" + echo " To set up credentials securely:" + echo " 1. Go to https://console.aws.amazon.com/iam/" + echo " 2. Create a new access key for your IAM user" + echo " 3. Run: aws configure" + echo " OR export AWS_ACCESS_KEY_ID='...'" + echo " OR export AWS_SECRET_ACCESS_KEY='...'" + echo "" + + read -p "Would you like to set up AWS credentials now? [y/N]: " SETUP_CREDENTIALS + + if [[ "$SETUP_CREDENTIALS" == "y" || "$SETUP_CREDENTIALS" == "Y" ]]; then + echo "" + aws configure --profile default + # Check again after configuration + if [ -z "$AWS_ACCESS_KEY_ID" ]; then + echo "❌ Credential setup failed. Please set up credentials manually." + exit 1 + fi + else + echo "â„šī¸ Skipping AWS credential setup for now." + echo " You can run 'aws configure' later before creating cloud resources." + fi +else + echo "✅ AWS credentials detected (no action needed)." +fi + +# --- Step 5: Create .gitignore if not exists (Safety Protection) --- +echo "" +echo "đŸ›Ąī¸ Setting up security protections..." + +if [[ ! -f .gitignore ]]; then + cat > .gitignore << 'EOF' +# Python cache +__pycache__/ +*.py[cod] +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg + +# Terraform State (NEVER commit this!) +*.tfstate* +*.tfvars + +# Virtual environments +.env +.venv/ +venv/ +ENV/ +.venv/ +uv.lock # Optional: commit if you want locked versions + +# AWS Credentials (Never commit) +.aws/ +*.pem +EOF + + echo "✅ .gitignore created for security." +else + echo "â„šī¸ .gitignore already exists." +fi + +# --- Step 6: Final Summary & Approval Gate --- +echo "" +echo "==========================================" +echo "✅ Environment Ready!" +echo "==========================================" +echo "" +echo "Next Steps:" +echo "1. Review your AWS Budget settings ($2/month recommended)" +echo "2. Edit 'main.tf' in infra/ folder with desired resources" +echo "3. Run: terraform plan (to preview changes)" +echo "4. Run: ./bootstrap.sh again to refresh environment anytime" +echo "" + +# --- Step 7: Ask for Approval Before Creating Cloud Resources --- +if ! check_credentials; then + echo "" + echo "âš ī¸ Cannot create cloud resources without credentials." + echo " Please run 'aws configure' or set AWS_ACCESS_KEY_ID/SECRET_ACCESS_KEY" + exit 0 +fi + +echo "" +echo "â˜ī¸ Ready for Cloud Provisioning..." +read -p "🔐 Do you want to approve Terraform execution now? [y/N]: " CONFIRM_CREATE + +if [[ "$CONFIRM_CREATE" == "y" || "$CONFIRM_CREATE" == "Y" ]]; then + echo "" + echo "✅ Creation Approved. Running Terraform..." + terraform init + + if [ -f main.tf ]; then + terraform plan + read -p "Execute plan? [y/N]: " EXECUTE + + if [[ "$EXECUTE" == "y" || "$EXECUTE" == "Y" ]]; then + terraform apply -auto-approve + echo "" + echo "✅ Cloud resources created successfully." + echo " View your resources in AWS Console!" + else + echo "â„šī¸ Skipping resource creation for now." + fi + else + echo "â„šī¸ No main.tf found. Skipping Terraform execution." + fi +else + echo "â„šī¸ Cloud resource creation skipped." +fi + +echo "" +echo "==========================================" +echo "Bootstrap Complete!" +echo "==========================================" + + + + + + # 1. Install uv for Linux curl -LsSf https://astral.sh/uv/install.sh | sh source $HOME/.cargo/env @@ -7,4 +285,106 @@ uv sync --all-extras # 3. Install core OS dependencies (Ubuntu equivalents of the macOS brew prerequisites) sudo apt update && sudo apt install -y ansible nodejs -snap install terraform --classic \ No newline at end of file +snap install terraform --classic + + + + +#!/bin/bash + +# ----------------------------------------------------------------------------- +# bootstrap.sh - Safety First Initialization Script +# Purpose: Prepares the environment safely before creating cloud resources +# Author: HPC Platform Engineer Portfolio (Phase 1) +# Risk Level: Low (Requires user confirmation to spend money) +# ----------------------------------------------------------------------------- + +set -e # Exit immediately if a command exits with non-zero status +set -o pipefail # Enable pipeline error checking + +echo "🚀 Starting HPC Platform Bootstrap..." +echo "==================================================" + +# --- 1. Check and Install 'uv' (Python Manager) --- +# Why? uv is faster than pip. We install it once, then reuse it locally. +check_uv() { + if command -v uv &> /dev/null; then + echo "✅ UV is already installed." + echo " Version: $(uv --version)" + return 0 + else + echo "âš ī¸ UV not found. Installing now (this takes a moment)..." + curl -LsSf https://astral.sh/uv/install.sh | sh + source $HOME/.local/bin/env # Ensure path is updated for this session + if command -v uv &> /dev/null; then + echo "✅ UV installation complete." + return 0 + else + echo "❌ Installation failed. Please try manually with curl..." + exit 1 + fi + fi +} + +# --- 2. Check AWS Credentials (Safety Guard) --- +# Why? We never want to accidentally use your credentials without checking. +check_credentials() { + if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then + echo "âš ī¸ WARNING: No AWS Credentials detected in environment." + echo " You can generate these at https://console.aws.amazon.com/iam/" + echo " Set them using: export AWS_ACCESS_KEY_ID='...'" + echo " (Or use 'aws configure' to set them securely)." + echo "" + echo " ⛔ CLOUD RESOURCE CREATION SKIPPED due to missing keys." + echo " This prevents accidental bills!" + return 1 + else + echo "✅ AWS Credentials detected." + echo " (Note: Keys are never logged or displayed)." + return 0 + fi +} + +# --- 3. Logic to Create/Check Linux VM (Cloud Instance) --- +# Why? This is the critical step. We ask for permission first. +create_linux_vm() { + if ! check_credentials; then + echo "Cannot create VM without credentials." + return 1 + fi + + # Safety Check: Did you really mean it? + read -p "âš ī¸ This action will spin up a Cloud Linux Instance (EC2/GCP). Do you want to proceed? [y/N]: " CONFIRM + + if [[ "$CONFIRM" == "y" || "$CONFIRM" == "Y" ]]; then + echo "✅ VM Creation Request Approved. Running Terraform..." + + # We use 'terraform apply' here but only for the specific resources defined in Phase 1 + # In a real project, you might pass --target to only create the instance (e.g., -var=instance_name="my-vm") + terraform init + terraform plan -out=tfplan && terraform apply -auto-approve tfplan + + echo "✅ VM Instance Ready." + else + echo "â„šī¸ Skipping VM creation for now. Run this script again with confirmation." + fi +} + +# --- 4. Main Execution Flow --- +# Step-by-step process to prevent mistakes + +# 1. Setup Local Tools (uv) - This is always safe and local +echo "đŸ“Ļ Checking Python Manager..." +check_uv + +# 2. Check Cloud Permissions (Safety Gate) +echo "🔐 Checking Security Credentials..." +check_credentials + +# 3. Ask to Create Cloud Resources (The Risky Part) +echo "â˜ī¸ Ready for Cloud Provisioning Logic..." +create_linux_vm + +echo "==================================================" +echo "Bootstrap Complete." +echo "Next Step: Check your AWS Console for the new resources if any were created." diff --git a/containers/docker-compose.yml b/clusters/docker-compose.yml similarity index 100% rename from containers/docker-compose.yml rename to clusters/docker-compose.yml diff --git a/clusters/k8s/kubernetes-customization.yml b/clusters/k8s/kubernetes-customization.yml new file mode 100644 index 0000000..e69de29 diff --git a/docs/learnings.md b/docs/learnings.md index e69de29..f50ad3c 100644 --- a/docs/learnings.md +++ b/docs/learnings.md @@ -0,0 +1,4 @@ +# Docuemnting all challenges + +# 1. GPU passthrough in Mac +- OrbStack, colima all currently popular tools or traditional Docker engine Mac doesnt allow GPU access. \ No newline at end of file diff --git a/Project-spec.md b/docs/spec/spec.md similarity index 99% rename from Project-spec.md rename to docs/spec/spec.md index 7920da5..61b7cff 100644 --- a/Project-spec.md +++ b/docs/spec/spec.md @@ -1,4 +1,4 @@ -# spec.md — HelixScale Technical Specification +# spec.md — HelixScale Technical Specification completed vision ## 1. System Overview diff --git a/infra/terraform/output.tf b/infra/terraform/output.tf new file mode 100644 index 0000000..e69de29