diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..c7a6df3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,46 @@ +version: 2 +updates: + - package-ecosystem: gomod + directory: /api-server + schedule: + interval: weekly + open-pull-requests-limit: 5 + commit-message: + prefix: "deps(go)" + + - package-ecosystem: npm + directory: /web-ui + schedule: + interval: weekly + open-pull-requests-limit: 5 + commit-message: + prefix: "deps(web-ui)" + + - package-ecosystem: npm + directory: /website + schedule: + interval: weekly + open-pull-requests-limit: 5 + commit-message: + prefix: "deps(website)" + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + commit-message: + prefix: "ci" + + - package-ecosystem: docker + directory: /api-server + schedule: + interval: weekly + commit-message: + prefix: "deps(docker)" + + - package-ecosystem: docker + directory: /web-ui + schedule: + interval: weekly + commit-message: + prefix: "deps(docker)" diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index d1abc5c..db62134 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -27,7 +27,7 @@ jobs: uses: actions/checkout@v4 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version: '1.24' cache-dependency-path: api-server/go.sum @@ -53,7 +53,7 @@ jobs: uses: actions/checkout@v4 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version: '1.24' cache-dependency-path: api-server/go.sum diff --git a/.github/workflows/cleanup-docs-version.yml b/.github/workflows/cleanup-docs-version.yml new file mode 100644 index 0000000..6c5473e --- /dev/null +++ b/.github/workflows/cleanup-docs-version.yml @@ -0,0 +1,137 @@ +name: Cleanup Documentation Version + +on: + release: + types: [deleted] + +permissions: + contents: write + +concurrency: + group: docs-versioning + cancel-in-progress: false 
+ +jobs: + remove-version: + name: Remove Documentation Version + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: main + + - name: Extract version from tag + id: version + run: | + # Extract version from tag (v0.0.3 -> 0.0.3) + VERSION=${GITHUB_REF#refs/tags/v} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Removing documentation version: $VERSION" + + - name: Check if version exists + id: check + run: | + VERSION=${{ steps.version.outputs.version }} + if grep -q "\"${VERSION}\"" website/versions.json; then + echo "exists=true" >> $GITHUB_OUTPUT + echo "✅ Version ${VERSION} found in versions.json" + else + echo "exists=false" >> $GITHUB_OUTPUT + echo "⚠️ Version ${VERSION} not found, nothing to remove" + fi + + - name: Install jq + if: steps.check.outputs.exists == 'true' + run: | + sudo apt-get update + sudo apt-get install -y jq + + - name: Remove version from versions.json + if: steps.check.outputs.exists == 'true' + run: | + VERSION=${{ steps.version.outputs.version }} + + echo "Current versions.json:" + cat website/versions.json + + # Remove version from array using jq + jq --arg ver "${VERSION}" 'del(.[] | select(. 
== $ver))' website/versions.json > website/versions.json.tmp + mv website/versions.json.tmp website/versions.json + + echo "" + echo "Updated versions.json:" + cat website/versions.json + + - name: Remove versioned directories + if: steps.check.outputs.exists == 'true' + run: | + VERSION=${{ steps.version.outputs.version }} + + # Remove versioned documentation directory + if [ -d "website/versioned_docs/version-${VERSION}" ]; then + rm -rf "website/versioned_docs/version-${VERSION}" + echo "✅ Removed versioned_docs/version-${VERSION}/" + fi + + # Remove versioned sidebars file + if [ -f "website/versioned_sidebars/version-${VERSION}-sidebars.json" ]; then + rm -f "website/versioned_sidebars/version-${VERSION}-sidebars.json" + echo "✅ Removed versioned_sidebars/version-${VERSION}-sidebars.json" + fi + + echo "" + echo "Files removed:" + git status --short + + - name: Configure Git + if: steps.check.outputs.exists == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and push changes + if: steps.check.outputs.exists == 'true' + run: | + VERSION=${{ steps.version.outputs.version }} + + git add website/versions.json + git add website/versioned_docs/ + git add website/versioned_sidebars/ + + git commit -m "docs: remove version ${VERSION} [skip ci] + + Auto-cleanup documentation version after release deletion ${GITHUB_REF} + + - Removed version ${VERSION} from versions.json + - Deleted versioned_docs/version-${VERSION}/ + - Deleted versioned_sidebars/version-${VERSION}-sidebars.json" + + git push origin HEAD:main + + echo "✅ Changes pushed to main branch" + echo "📚 Documentation version ${VERSION} has been removed" + + - name: Summary + run: | + VERSION=${{ steps.version.outputs.version }} + EXISTS=${{ steps.check.outputs.exists }} + + echo "## Documentation Cleanup Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Version**: ${VERSION}" >> 
$GITHUB_STEP_SUMMARY + echo "**Release**: ${GITHUB_REF}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [ "$EXISTS" == "true" ]; then + echo "✅ **Status**: Documentation version removed successfully" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Removed files**:" >> $GITHUB_STEP_SUMMARY + echo "- \`versions.json\` (updated)" >> $GITHUB_STEP_SUMMARY + echo "- \`versioned_docs/version-${VERSION}/\`" >> $GITHUB_STEP_SUMMARY + echo "- \`versioned_sidebars/version-${VERSION}-sidebars.json\`" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ **Status**: Version not found, nothing to remove" >> $GITHUB_STEP_SUMMARY + fi diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 428461d..ef0f15c 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -19,13 +19,17 @@ concurrency: cancel-in-progress: false jobs: - build-docs: - name: Build Docusaurus + deploy-docs: + name: Build and Deploy Documentation runs-on: ubuntu-latest + steps: - - name: Checkout code + - name: Checkout repository uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Setup Node.js uses: actions/setup-node@v4 with: @@ -43,75 +47,22 @@ jobs: cd website npm run build - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: docusaurus-build - path: website/build/ - - deploy-docs: - name: Deploy to GitHub Pages - needs: build-docs - runs-on: ubuntu-latest - steps: - - name: Checkout gh-pages branch (or create it) - uses: actions/checkout@v4 - with: - ref: gh-pages - fetch-depth: 0 - continue-on-error: true - - - name: Initialize gh-pages if checkout failed - run: | - if [ ! -d ".git" ]; then - echo "gh-pages branch doesn't exist, creating it..." 
- git init - git checkout -b gh-pages - git remote add origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git - fi - - - name: Download docs build - uses: actions/download-artifact@v4 - with: - name: docusaurus-build - path: docs-temp/ - - - name: Preserve Helm charts and deploy docs + - name: Prepare deployment run: | - # Backup existing charts/ directory if it exists - if [ -d "charts" ]; then - echo "Backing up existing charts/ directory" - mv charts charts-backup - fi + # Create deployment directory + mkdir -p _site - # Clear everything except charts-backup and .git - find . -maxdepth 1 ! -name charts-backup ! -name .git ! -name . ! -name .. -exec rm -rf {} + + # Copy Docusaurus build output + cp -r website/build/* _site/ - # Move Docusaurus build to root - if [ -d "docs-temp" ] && [ "$(ls -A docs-temp)" ]; then - mv docs-temp/* docs-temp/.* . 2>/dev/null || mv docs-temp/* . - rm -rf docs-temp - else - echo "Warning: docs-temp directory is empty or doesn't exist" - fi + # Ensure .nojekyll exists (should already be in build) + touch _site/.nojekyll - # Restore charts/ directory - if [ -d "charts-backup" ]; then - echo "Restoring charts/ directory" - mv charts-backup charts - fi - - - name: Configure Git - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: '_site' - - name: Commit and push - run: | - git add -A - if ! 
git diff --cached --quiet; then - git commit -m "Deploy documentation from commit ${{ github.sha }}" - git push origin gh-pages --force - else - echo "No changes to commit" - fi + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8fb8205..d9c407a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,9 +15,74 @@ env: IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/bison jobs: + gate: + name: Test & Lint Gate + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version: '1.24' + cache-dependency-path: api-server/go.sum + + - name: Go vet + working-directory: api-server + run: go vet ./... + + - name: Go fmt check + working-directory: api-server + run: | + if [ -n "$(gofmt -l .)" ]; then + echo "Go code is not formatted:"; gofmt -d .; exit 1 + fi + + - name: Go build + working-directory: api-server + run: go build ./... + + - name: Go test (race + coverage) + working-directory: api-server + run: | + go test -race -coverprofile=coverage.out ./... + echo "### API coverage" >> "$GITHUB_STEP_SUMMARY" + go tool cover -func=coverage.out | tail -1 >> "$GITHUB_STEP_SUMMARY" + + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: npm + cache-dependency-path: web-ui/package-lock.json + + - name: Web install + working-directory: web-ui + # Remove the lockfile before install so platform-specific optional deps + # (e.g. @rollup/rollup-linux-x64-gnu) resolve on the Linux runner. A lock + # generated on another platform omits them and triggers npm bug #4828. + # This mirrors the web-ui Dockerfile. 
+ run: | + rm -f package-lock.json + npm install --no-audit --no-fund + + - name: Web lint + working-directory: web-ui + run: npm run lint + + - name: Web test + working-directory: web-ui + run: npx vitest run + + - name: Web build + working-directory: web-ui + run: npm run build + prepare: name: Prepare Release runs-on: ubuntu-latest + needs: gate outputs: version: ${{ steps.extract_version.outputs.version }} steps: @@ -81,6 +146,12 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Set GHCR organization + id: ghcr + run: | + # Convert GitHub username to lowercase for GHCR + echo "owner=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT + - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: @@ -92,7 +163,7 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.IMAGE_PREFIX }}/${{ matrix.component }} + images: ghcr.io/${{ steps.ghcr.outputs.owner }}/bison/${{ matrix.component }} tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} @@ -171,14 +242,19 @@ jobs: ### 🚀 Installation - #### Using Helm Repository (Recommended) + **Requirements:** Helm >= 3.8.0, Kubernetes >= 1.22 + + #### Method 1: From GHCR (Recommended) \`\`\`bash - helm repo add bison https://${{ github.repository_owner }}.github.io/Bison/ - helm repo update - helm install my-bison bison/bison --version ${VERSION} + # Install directly from GitHub Container Registry + helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version ${VERSION} + + # Or pull first, then install + helm pull oci://ghcr.io/supermarioyl/charts/bison --version ${VERSION} + helm install my-bison bison-${VERSION}.tgz \`\`\` - #### Using GitHub Release + #### Method 2: From GitHub Releases \`\`\`bash wget https://github.com/${{ github.repository }}/releases/download/v${VERSION}/bison-${VERSION}.tgz helm install my-bison bison-${VERSION}.tgz @@ -187,14 +263,21 @@ jobs: ### 🐳 Docker 
Images \`\`\`bash - docker pull ghcr.io/${{ github.repository_owner }}/bison/api-server:${VERSION} - docker pull ghcr.io/${{ github.repository_owner }}/bison/web-ui:${VERSION} + docker pull ghcr.io/supermarioyl/bison/api-server:${VERSION} + docker pull ghcr.io/supermarioyl/bison/web-ui:${VERSION} \`\`\` ### 📦 What's Changed **Full Changelog**: https://github.com/${{ github.repository }}/compare/v${VERSION}...v${VERSION} + ### 📚 Documentation + + Version-specific documentation will be available shortly at: + - 📖 [https://bison.lei6393.com/docs/${VERSION}/](https://bison.lei6393.com/docs/${VERSION}/) + + > Note: Documentation versioning happens automatically after release. Allow a few minutes for the docs site to update. + --- 🤖 Generated with [GitHub Actions](https://github.com/features/actions) EOF @@ -211,106 +294,99 @@ jobs: prerelease: false token: ${{ secrets.GITHUB_TOKEN }} - publish-helm-repo: - name: Publish to Helm Repository (GitHub Pages) + commit-versions: + name: Commit Version Updates to Main runs-on: ubuntu-latest needs: [prepare, create-release] + permissions: + contents: write + steps: - - name: Checkout code + - name: Checkout repository uses: actions/checkout@v4 with: + ref: main fetch-depth: 0 - - name: Download Helm chart + - name: Download updated files uses: actions/download-artifact@v4 with: - name: helm-chart + name: updated-files path: . - - name: Install Helm - uses: azure/setup-helm@v4 - with: - version: 'latest' - - name: Configure Git run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - - name: Publish to GitHub Pages (charts/ subdirectory) - env: - VERSION: ${{ needs.prepare.outputs.version }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Commit and push version updates run: | - # Clone or create gh-pages branch - if git ls-remote --exit-code --heads origin gh-pages; then - echo "gh-pages branch exists, cloning..." 
- git clone --single-branch --branch gh-pages https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git gh-pages - else - echo "gh-pages branch does not exist, creating..." - mkdir gh-pages - cd gh-pages - git init - git checkout -b gh-pages - git remote add origin https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git - cd .. - fi - - # Create charts/ directory if it doesn't exist - mkdir -p gh-pages/charts - - # Copy Helm chart to gh-pages/charts/ - cp bison-${VERSION}.tgz gh-pages/charts/ + VERSION=${{ needs.prepare.outputs.version }} - # Generate or update index.yaml in charts/ directory - cd gh-pages/charts - helm repo index . --url https://${{ github.repository_owner }}.github.io/Bison/charts/ + # Check if there are changes to commit + if git diff --quiet deploy/charts/bison/Chart.yaml web-ui/package.json; then + echo "⚠️ No changes to commit" + exit 0 + fi - # Create README in charts/ directory if it doesn't exist - if [ ! -f README.md ]; then - cat < README.md - # Bison Helm Chart Repository + git add deploy/charts/bison/Chart.yaml + git add web-ui/package.json - ## Usage + git commit -m "chore: bump version to ${VERSION} [skip ci] - Add the Helm repository: + Auto-updated version files from release ${GITHUB_REF} - \`\`\`bash - helm repo add bison https://${{ github.repository_owner }}.github.io/Bison/charts/ - helm repo update - \`\`\` + - Updated Chart.yaml version to ${VERSION} + - Updated package.json version to ${VERSION}" - Search for available charts: + git push origin HEAD:main - \`\`\`bash - helm search repo bison - \`\`\` + echo "✅ Version updates committed to main branch" - Install the chart: + - name: Summary + run: | + VERSION=${{ needs.prepare.outputs.version }} + echo "## Version Update Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Version**: ${VERSION}" >> $GITHUB_STEP_SUMMARY + echo "**Release**: ${GITHUB_REF}" >> $GITHUB_STEP_SUMMARY + echo "" >> 
$GITHUB_STEP_SUMMARY + echo "✅ **Status**: Version files committed to main branch" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Updated files**:" >> $GITHUB_STEP_SUMMARY + echo "- \`deploy/charts/bison/Chart.yaml\`" >> $GITHUB_STEP_SUMMARY + echo "- \`web-ui/package.json\`" >> $GITHUB_STEP_SUMMARY - \`\`\`bash - helm install my-bison bison/bison --version ${VERSION} - \`\`\` + publish-helm-repo: + name: Publish to Helm Repository (GHCR) + runs-on: ubuntu-latest + needs: [prepare, create-release] + permissions: + contents: read + packages: write - ## Available Versions + steps: + - name: Checkout repository + uses: actions/checkout@v4 - See [index.yaml](./index.yaml) for all available versions. - EOF - fi + - name: Download Helm chart + uses: actions/download-artifact@v4 + with: + name: helm-chart + path: . - # Return to gh-pages root - cd .. + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: 'latest' - # Configure Git - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" + - name: Log in to GitHub Container Registry + run: echo "${{ secrets.GITHUB_TOKEN }}" | helm registry login ghcr.io -u ${{ github.actor }} --password-stdin - # Commit and push - git add charts/ - if git diff --cached --quiet; then - echo "No changes to commit" - else - git commit -m "Release Helm chart v${VERSION}" - git push origin gh-pages - echo "Successfully published Helm chart to GitHub Pages at /charts/" - fi + - name: Push Helm chart to GHCR + env: + VERSION: ${{ needs.prepare.outputs.version }} + run: | + # Convert GitHub username to lowercase for GHCR + REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + helm push bison-${VERSION}.tgz oci://ghcr.io/${REPO_OWNER}/charts diff --git a/.github/workflows/version-docs.yml b/.github/workflows/version-docs.yml new file mode 100644 index 0000000..1314da3 --- /dev/null +++ b/.github/workflows/version-docs.yml @@ 
-0,0 +1,119 @@ +name: Version Documentation + +on: + release: + types: [published] + +permissions: + contents: write + +concurrency: + group: docs-versioning + cancel-in-progress: false + +jobs: + create-version: + name: Create Documentation Version + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: main + + - name: Extract version from tag + id: version + run: | + # Extract version from tag (v0.0.3 -> 0.0.3) + VERSION=${GITHUB_REF#refs/tags/v} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Creating documentation version: $VERSION" + + - name: Check if version already exists + id: check + run: | + VERSION=${{ steps.version.outputs.version }} + if grep -q "\"${VERSION}\"" website/versions.json; then + echo "exists=true" >> $GITHUB_OUTPUT + echo "⚠️ Version ${VERSION} already exists in versions.json" + else + echo "exists=false" >> $GITHUB_OUTPUT + echo "✅ Version ${VERSION} not found, will create" + fi + + - name: Setup Node.js + if: steps.check.outputs.exists == 'false' + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: website/package-lock.json + + - name: Install dependencies + if: steps.check.outputs.exists == 'false' + run: | + cd website + npm ci + + - name: Create documentation version + if: steps.check.outputs.exists == 'false' + run: | + VERSION=${{ steps.version.outputs.version }} + cd website + + echo "Running: npm run docusaurus docs:version ${VERSION}" + npm run docusaurus docs:version ${VERSION} + + echo "✅ Documentation version ${VERSION} created" + echo "" + echo "Files created/modified:" + git status --short + + - name: Configure Git + if: steps.check.outputs.exists == 'false' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and push changes + if: steps.check.outputs.exists == 'false' + run: | + VERSION=${{ 
steps.version.outputs.version }} + + git add website/versions.json + git add website/versioned_docs/ + git add website/versioned_sidebars/ + + git commit -m "docs: add version ${VERSION} [skip ci] + + Auto-generated documentation version from release ${GITHUB_REF} + + - Added version ${VERSION} to versions.json + - Created versioned_docs/version-${VERSION}/ + - Created versioned_sidebars/version-${VERSION}-sidebars.json" + + git push origin HEAD:main + + echo "✅ Changes pushed to main branch" + echo "📚 Documentation will be available at: https://bison.lei6393.com/docs/${VERSION}/" + + - name: Summary + run: | + VERSION=${{ steps.version.outputs.version }} + EXISTS=${{ steps.check.outputs.exists }} + + echo "## Documentation Versioning Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Version**: ${VERSION}" >> $GITHUB_STEP_SUMMARY + echo "**Release**: ${GITHUB_REF}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [ "$EXISTS" == "true" ]; then + echo "⚠️ **Status**: Version already exists, skipped" >> $GITHUB_STEP_SUMMARY + else + echo "✅ **Status**: Documentation version created successfully" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "📚 **View Documentation**: [https://bison.lei6393.com/docs/${VERSION}/](https://bison.lei6393.com/docs/${VERSION}/)" >> $GITHUB_STEP_SUMMARY + fi diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a33766..799207f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,147 @@ All notable changes to the Bison project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.27] - 2026-06-19 + +### Added — Supply-chain hygiene + +- Added `.github/dependabot.yml` to track updates weekly for Go modules, the web-ui and website npm trees, GitHub Actions, and the Docker base images. 
+- Added `.dockerignore` for api-server and web-ui so build contexts exclude `.git`, `node_modules`, `dist`, coverage and editor files — smaller, more reproducible image builds. + +## [0.0.26] - 2026-06-19 + +### Added — Availability & network hardening templates (opt-in) + +- Optional **PodDisruptionBudget** (`apiServer.podDisruptionBudget` / `webUI.podDisruptionBudget`) keeps a minimum replica available during voluntary disruptions (node drains). +- Optional **HorizontalPodAutoscaler** (`apiServer.autoscaling` / `webUI.autoscaling`); when enabled, the Deployment no longer pins `replicas` so the HPA owns scaling. +- Optional **NetworkPolicy** (`networkPolicy.enabled`) restricting api-server ingress to web-ui pods and the release namespace. +- All three are disabled by default; `values.schema.json` extended to type-check the new keys. + +## [0.0.25] - 2026-06-19 + +### Added — Chart validation + +- Added `values.schema.json` so Helm validates value types at install/upgrade time (e.g. a string `replicaCount` or non-boolean `enabled` now fails fast instead of being silently mis-applied). The schema type-checks known keys while remaining lenient about additions. +- Added `kubeVersion: ">=1.22.0-0"` to `Chart.yaml` so unsupported clusters are rejected up front. + +## [0.0.24] - 2026-06-19 + +### Changed — Frontend re-render reduction + +- `AuthProvider` and `ThemeProvider` now memoize their context value (`useMemo`) instead of allocating a fresh object every render, so consumers (and, for the theme, the whole Ant Design `ConfigProvider` subtree) no longer re-render on unrelated parent renders. + +## [0.0.23] - 2026-06-19 + +### Changed — Centralized frontend error handling + +- Replaced the 13 duplicated `err.response?.data?.error || err.message` extraction sites (across ClusterNodes, TeamDetail, TeamCreate, Login, ResourceConfig) with the shared `getApiErrorMessage` helper. 
The backend error envelope shape now lives in one place, so future changes touch one file instead of thirteen. + +## [0.0.22] - 2026-06-19 + +### Security — Refuse insecure defaults at startup + +- When `AUTH_ENABLED=true`, the server now refuses to start if `JWT_SECRET` is empty or still the built-in public default, or if `ADMIN_PASSWORD` is empty or `admin`. This prevents a production deployment from silently running with a forgeable token-signing key or the well-known default password. Auth-disabled and local development are unaffected; the Helm chart already injects randomly generated, persisted secrets. Added table-driven config validation tests. + +## [0.0.21] - 2026-06-19 + +### Added — Release test/lint gate + +- The release workflow now runs a **Test & Lint Gate** before anything is built or published: `go vet`, `gofmt` check, `go build`, `go test -race` (with coverage in the job summary), plus web `npm ci` / lint / `vitest run` / build. `prepare` (and the whole publish chain) `needs` this gate, so broken code can no longer be tagged into a public release. + +### Fixed — Reproducible web build + +- Declared `tslib` as an explicit dependency: `echarts-for-react` imports it but doesn't declare it, so it was a phantom dependency previously satisfied only by the removed `@ant-design/pro-components`. Clean installs (`npm ci`) now build reliably. +- Synced `package-lock.json` with `package.json` (removed stale `pro-components`, added `tslib`) so `npm ci` works. +- Applied `gofmt` across the api-server (formatting only). + +## [0.0.20] - 2026-06-19 + +### Changed — OpenCost query caching + +- The OpenCost client now wraps allocation queries in a 30s TTL cache that also **coalesces concurrent identical queries** (same window/aggregate/filter), so a burst of dashboard/billing requests hits OpenCost once instead of once per caller. Errors are not cached (next caller retries). Self-contained implementation — no new dependency; covered by race-tested unit tests. 
+ +## [0.0.19] - 2026-06-19 + +### Changed — Backend performance + +- `GET /teams` no longer issues one (discarded) OpenCost usage query per team; per-team usage is fetched on demand by the detail/dashboard endpoints. This removes an O(teams) OpenCost call storm from the team list. +- Billing/report cost computation now resolves the resource price table **once per operation** (`loadPrices` + `costFromPrices`) instead of reading the resource-config ConfigMap for every allocation row, cutting ConfigMap reads from O(allocations) to O(1) in `ProcessBilling`, `GetTeamBill`, and `GetProjectBill`. + +## [0.0.18] - 2026-06-19 + +### Fixed — Daily-consumption (burn-rate) estimate + +- `CalculateDailyConsumption` now divides total in-window deductions by the **actual span of deduction activity** (capped at 7 days, floored at 0.5 day) instead of a fixed 7-day denominator, which previously underestimated the burn rate and overestimated the time-to-overdue. +- Fetches up to 400 history records (was 100) so a full week of hourly deductions isn't truncated and undercounted. Recharges and out-of-window records are correctly excluded. Added unit tests for span, floor, and exclusion behavior. + +## [0.0.17] - 2026-06-19 + +### Fixed — Billing interval correctness & restart safety + +- Billing now gates on a persisted `lastBilledAt` timestamp (stored in the billing ConfigMap): a cycle runs only after approximately the configured interval has actually elapsed. This stops two failure modes — the hourly scheduler tick over-billing when `interval > 1h`, and a process restart re-billing a window that was already charged. +- The first run on a fresh deployment establishes a baseline instead of billing an unknown historical window. +- The timestamp write uses optimistic-concurrency retry; added a round-trip unit test. + +## [0.0.16] - 2026-06-19 + +### Security — Configurable CORS + +- CORS is now configurable via `CORS_ALLOWED_ORIGINS` (comma-separated allowlist).
When set, only listed origins are echoed back (with `Vary: Origin` and `Access-Control-Allow-Credentials`); other origins get no `Access-Control-Allow-Origin` and are blocked by the browser. Default (unset) preserves the previous `*` behavior, so existing deployments are unaffected until they opt in to tightening. + +## [0.0.15] - 2026-06-19 + +### Security — Login hardening + +- **Per-IP login rate limiting**: after 5 failed attempts within 5 minutes an IP is locked out for 15 minutes (HTTP 429 + `Retry-After`), stopping unthrottled brute-force of the admin password. +- **Constant-time credential comparison** (`crypto/subtle.ConstantTimeCompare`) for both username and password, removing the early-exit timing side channel; both comparisons always run so username validity isn't leaked. +- Added unit tests for the limiter (block threshold, success reset, window reset). + +## [0.0.14] - 2026-06-19 + +### Fixed — Helm secret persistence + +- The auth `Secret` now reuses the existing JWT signing key and admin password on `helm upgrade` via `lookup`, instead of regenerating them with `randAlphaNum` on every render. Previously each upgrade rotated the JWT key (invalidating all sessions) and silently changed the admin password. Fresh installs still auto-generate; explicit `auth.admin.password` / `auth.jwt.secret` and `existingSecret` continue to take precedence. + +## [0.0.13] - 2026-06-19 + +### Added — Scheduler leader election + +- **Lease-based leader election** (`internal/leader`) guards the singleton billing/auto-recharge/alert scheduler so it runs on exactly one api-server replica at a time. This is the root fix for the duplicate-billing risk; `apiServer.replicaCount` is restored to `2` for HA. +- The scheduler is now **re-startable** (clean `Start`/`Stop` on leadership changes), with new tests covering restart and stop-before-start safety. +- Toggle via `LEADER_ELECTION_ENABLED` (default on); disable for single-replica / local dev. 
+- Added `coordination.k8s.io/leases` (get/create/update) to the api-server RBAC. + +## [0.0.12] - 2026-06-19 + +### Fixed — Billing correctness & concurrency + +- **Atomic balance updates**: `Recharge`, `Deduct`, auto-recharge, and overdue-marking now perform their ConfigMap read-modify-write under `retry.RetryOnConflict`, eliminating silent lost updates / balance corruption under concurrent operations. +- **`Deduct` returns the post-write balance**, so `ProcessBilling` no longer makes a racy second read to decide suspension. +- **Overdue marker preserved across deductions** — a deduction no longer wipes `OverdueAt`, so the grace-period clock is measured from when the balance first went negative (teams now actually suspend after the grace window instead of never). +- **Scheduler hardening**: per-task `panic` recovery (one failing task can no longer crash the api-server) plus startup jitter to avoid multi-replica stampede. +- **Stopgap against double-billing**: `apiServer.replicaCount` defaults to `1` until scheduler leader election lands (the scheduler runs in the api-server; 2 replicas billed tenants twice). + +### Changed — Performance + +- **K8s client** QPS/Burst raised to 50/100 (from the 5/10 default) so dashboard/billing list bursts are not client-side throttled. +- **Web UI** route-level code splitting + Vite `manualChunks`: echarts (~1 MB) and per-page bundles now load on demand instead of shipping with every Login/Dashboard session. + +### Added + +- First backend unit tests (`calculateCost`, `Recharge`/`Deduct`, grace-period logic, concurrent-recharge race test) with a fake-clientset optimistic-concurrency harness. +- Top-level React `ErrorBoundary` and a shared `getApiErrorMessage` utility. +- `docs/optimization-roadmap.md` — prioritized continuous-optimization roadmap from a full-codebase audit. 
+ +### Changed — Website & docs + +- Replaced emoji icons with inline Tabler-style SVG icons; added an interactive vector `ProductShowcase` (Dashboard / Cluster / Reports / Billing). +- Fixed the displayed UI version (was hardcoded `v3.0.0`; now injected from `package.json`). +- Corrected install docs: OCI path `charts/bison`, `/healthz` health check, OpenCost namespace/value keys, `replicaCount`, object names `bison-api`/`bison-web`. +- New 1200×630 social/OG card and SEO metadata; reduced-motion + offscreen-pause for the particle background. +- Removed the unused `@ant-design/pro-components` dependency and stray `console.log`s. + +> Note: versions 0.0.2–0.0.11 were release-automation version bumps without dedicated changelog entries. + ## [0.0.1] - 2025-12-27 ### 🎉 Initial Release diff --git a/Makefile b/Makefile index 9453481..62ef8b6 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # 基于 Capsule + OpenCost 架构 # ==================== 配置 ==================== -REGISTRY ?= docker.io +REGISTRY ?= ghcr.io/supermarioyl REPO ?= bison VERSION ?= latest HELM_RELEASE ?= bison @@ -304,9 +304,8 @@ deploy: ## 部署 Bison helm upgrade --install $(HELM_RELEASE) ./deploy/charts/bison \ --namespace $(NAMESPACE) \ --create-namespace \ - --set apiServer.image.repository=$(REGISTRY)/$(REPO)/api-server \ + --set global.imageRegistry=$(REGISTRY) \ --set apiServer.image.tag=$(VERSION) \ - --set webUI.image.repository=$(REGISTRY)/$(REPO)/web-ui \ --set webUI.image.tag=$(VERSION) .PHONY: deploy-with-auth @@ -316,9 +315,8 @@ deploy-with-auth: ## 部署 Bison (启用认证) --create-namespace \ --set auth.enabled=true \ --set auth.admin.password=$$(openssl rand -base64 12) \ - --set apiServer.image.repository=$(REGISTRY)/$(REPO)/api-server \ + --set global.imageRegistry=$(REGISTRY) \ --set apiServer.image.tag=$(VERSION) \ - --set webUI.image.repository=$(REGISTRY)/$(REPO)/web-ui \ --set webUI.image.tag=$(VERSION) .PHONY: undeploy diff --git a/README.md b/README.md index 17866cd..48d660a 100644 --- 
a/README.md +++ b/README.md @@ -237,29 +237,27 @@ helm install opencost opencost/opencost -n opencost --create-namespace \ ### 2. Deploy Bison -#### Option A: Using Helm Repository (Recommended) +#### Option A: Using GHCR (Recommended - OCI Format) ```bash -# Add Bison Helm repository -helm repo add bison https://supermarioyl.github.io/Bison/charts/ -helm repo update - -# Install Bison -helm install bison bison/bison \ +# Install directly from GitHub Container Registry +helm install bison oci://ghcr.io/supermarioyl/charts/bison \ --namespace bison-system \ --create-namespace \ --set auth.enabled=true \ - --version 0.0.1 + --version 0.0.12 ``` +> **Note:** Requires Helm >= 3.8.0 for OCI support + #### Option B: From GitHub Release ```bash # Download Helm chart from GitHub Release -wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.1/bison-0.0.1.tgz +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.12/bison-0.0.12.tgz # Install from downloaded chart -helm install bison bison-0.0.1.tgz \ +helm install bison bison-0.0.12.tgz \ --namespace bison-system \ --create-namespace \ --set auth.enabled=true @@ -288,7 +286,7 @@ kubectl get secret bison-auth -n bison-system -o jsonpath='{.data.password}' | b kubectl port-forward svc/bison-api 8080:8080 -n bison-system # Access API -curl http://localhost:8080/api/v1/health +curl http://localhost:8080/healthz ``` --- @@ -732,26 +730,53 @@ docker pull ghcr.io/supermarioyl/bison/web-ui:latest - `linux/amd64` - `linux/arm64` -### Helm Repository +### Helm Installation Methods + +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the OCI format, which is the modern standard for Helm 3.8+. 
+ +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +#### Method 1: Install from GHCR (Recommended) ```bash -# Add repository -helm repo add bison https://supermarioyl.github.io/Bison/charts/ +# Install specific version directly from GHCR +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version 0.0.12 -# Search available versions -helm search repo bison +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.12 +helm install my-bison bison-0.0.12.tgz -# View chart information -helm show chart bison/bison -helm show values bison/bison +# Customize installation +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison \ + --version 0.0.12 \ + --namespace bison-system \ + --create-namespace \ + --set dependencies.opencost.apiUrl=http://opencost.opencost.svc.cluster.local:9003 \ + --set dependencies.opencost.enabled=true \ + --set auth.enabled=true +``` + +#### Method 2: Install from GitHub Releases -# Install specific version -helm install my-bison bison/bison --version 0.0.1 +```bash +# Download chart from GitHub Releases +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.12/bison-0.0.12.tgz -# Upgrade to latest -helm upgrade my-bison bison/bison +# Install from downloaded file +helm install my-bison bison-0.0.12.tgz \ + --namespace bison-system \ + --create-namespace ``` +**Why GHCR OCI Format?** +- ✅ No need for separate Helm repository maintenance +- ✅ Unified image and chart management in GHCR +- ✅ Faster installation (direct pull from registry) +- ✅ Better version control and immutability +- ✅ Standard practice for Helm 3.8+ + ## Development ```bash @@ -783,16 +808,16 @@ Bison uses automated GitHub Actions for releases: ``` 2. 
**GitHub Actions automatically**: - - Builds multi-platform Docker images + - Builds multi-platform Docker images (amd64, arm64) - Pushes images to GitHub Container Registry - Packages Helm chart - - Creates GitHub Release - - Updates Helm repository on GitHub Pages + - Publishes chart to GHCR (OCI format) + - Creates GitHub Release with chart attachment 3. **Verify release**: - Check [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) - - Pull new images: `docker pull ghcr.io/supermarioyl/bison/api-server:3.1.0` - - Update Helm repo: `helm repo update && helm search repo bison` + - Pull new images: `docker pull ghcr.io/supermarioyl/bison/api-server:0.0.12` + - Install chart: `helm install test oci://ghcr.io/supermarioyl/charts/bison --version 0.0.12` ## Project Structure diff --git a/api-server/.dockerignore b/api-server/.dockerignore new file mode 100644 index 0000000..7106c2e --- /dev/null +++ b/api-server/.dockerignore @@ -0,0 +1,11 @@ +# Keep the Docker build context small and reproducible. 
+.git +.gitignore +.dockerignore +Dockerfile +dist +bin +*.out +coverage.out +.idea +.vscode diff --git a/api-server/cmd/main.go b/api-server/cmd/main.go index 307caf1..8c8be80 100644 --- a/api-server/cmd/main.go +++ b/api-server/cmd/main.go @@ -14,6 +14,7 @@ import ( "github.com/bison/api-server/internal/config" "github.com/bison/api-server/internal/handler" "github.com/bison/api-server/internal/k8s" + "github.com/bison/api-server/internal/leader" "github.com/bison/api-server/internal/middleware" "github.com/bison/api-server/internal/opencost" "github.com/bison/api-server/internal/scheduler" @@ -71,6 +72,9 @@ func main() { reportSvc := service.NewReportService(opencostClient, tenantSvc, projectSvc, billingSvc) nodeSvc := service.NewNodeService(k8sClient) workloadSvc := service.NewWorkloadService(k8sClient) + initScriptSvc := service.NewInitScriptService(k8sClient) + onboardingSvc := service.NewOnboardingService(k8sClient, nodeSvc, initScriptSvc) + configTransferSvc := service.NewConfigTransferService(billingSvc, alertSvc, resourceConfigSvc, initScriptSvc) // Initialize scheduler sched := scheduler.NewScheduler(billingSvc, balanceSvc, alertSvc) @@ -106,6 +110,8 @@ func main() { statusHandler := handler.NewStatusHandler(statusSvc) nodeHandler := handler.NewNodeHandler(nodeSvc) workloadHandler := handler.NewWorkloadHandler(workloadSvc, projectSvc) + onboardingHandler := handler.NewOnboardingHandler(onboardingSvc, initScriptSvc) + configTransferHandler := handler.NewConfigTransferHandler(configTransferSvc) // Setup Gin router if cfg.Mode == "release" { @@ -115,7 +121,7 @@ func main() { router := gin.New() router.Use(middleware.Recovery()) router.Use(middleware.Logger()) - router.Use(corsMiddleware()) + router.Use(corsMiddleware(cfg.CORSAllowedOrigins)) // Health check endpoints router.GET("/healthz", func(c *gin.Context) { @@ -135,7 +141,9 @@ func main() { // Feature flags (public) api.GET("/features", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{ - 
"costEnabled": costSvc.IsEnabled(), + "costEnabled": costSvc.IsEnabled(), + "capsuleEnabled": cfg.CapsuleEnabled, + "prometheusEnabled": cfg.PrometheusURL != "", }) }) @@ -235,6 +243,12 @@ func main() { protected.POST("/nodes/:name/assign", nodeHandler.AssignNodeToTeam) protected.POST("/nodes/:name/release", nodeHandler.ReleaseNode) + // Node onboarding + protected.POST("/nodes/onboard", onboardingHandler.StartOnboarding) + protected.GET("/nodes/onboard", onboardingHandler.ListOnboardingJobs) + protected.GET("/nodes/onboard/:jobId", onboardingHandler.GetOnboardingJob) + protected.DELETE("/nodes/onboard/:jobId", onboardingHandler.CancelOnboardingJob) + // System settings protected.GET("/settings", settingsHandler.GetSettings) protected.GET("/settings/billing", billingHandler.GetBillingConfig) @@ -243,6 +257,25 @@ func main() { protected.PUT("/settings/alerts", alertHandler.UpdateAlertConfig) protected.POST("/settings/alerts/test", alertHandler.TestChannel) + // Control plane settings + protected.GET("/settings/control-plane", onboardingHandler.GetControlPlaneConfig) + protected.PUT("/settings/control-plane", onboardingHandler.UpdateControlPlaneConfig) + protected.POST("/settings/control-plane/test", onboardingHandler.TestControlPlaneConnection) + + // Init scripts settings + protected.GET("/settings/init-scripts", onboardingHandler.ListInitScripts) + protected.POST("/settings/init-scripts", onboardingHandler.CreateInitScript) + protected.GET("/settings/init-scripts/:id", onboardingHandler.GetInitScript) + protected.PUT("/settings/init-scripts/:id", onboardingHandler.UpdateInitScript) + protected.DELETE("/settings/init-scripts/:id", onboardingHandler.DeleteInitScript) + protected.PUT("/settings/init-scripts/:id/toggle", onboardingHandler.ToggleInitScript) + protected.PUT("/settings/init-scripts/reorder", onboardingHandler.ReorderInitScripts) + + // Configuration import/export + protected.GET("/settings/export", configTransferHandler.ExportConfig) + 
protected.POST("/settings/import/preview", configTransferHandler.PreviewImport) + protected.POST("/settings/import/apply", configTransferHandler.ApplyImport) + // Node metrics (from Prometheus) protected.GET("/metrics/node/:name", settingsHandler.GetNodeMetrics) @@ -268,9 +301,22 @@ func main() { IdleTimeout: 60 * time.Second, } - // Start scheduler + // Start scheduler. When leader election is enabled the singleton scheduler + // runs on exactly one replica at a time (guards against duplicate billing); + // otherwise it runs directly (single-replica / local dev). ctx, cancel := context.WithCancel(context.Background()) - sched.Start(ctx) + if cfg.LeaderElectionEnabled { + go leader.Run(ctx, k8sClient.Clientset(), service.BisonNamespace, + func(leaderCtx context.Context) { + sched.Start(leaderCtx) + <-leaderCtx.Done() + sched.Stop() + }, + func() { sched.Stop() }, + ) + } else { + sched.Start(ctx) + } // Start server in goroutine go func() { @@ -302,9 +348,31 @@ func main() { logger.Info("Server stopped gracefully") } -func corsMiddleware() gin.HandlerFunc { +func corsMiddleware(allowedOrigins []string) gin.HandlerFunc { + // Build an O(1) lookup; "*" or an empty list means allow any origin. + allowAny := len(allowedOrigins) == 0 + allowSet := make(map[string]struct{}, len(allowedOrigins)) + for _, o := range allowedOrigins { + if o == "*" { + allowAny = true + } + allowSet[o] = struct{}{} + } + return func(c *gin.Context) { - c.Header("Access-Control-Allow-Origin", "*") + origin := c.GetHeader("Origin") + + if allowAny { + c.Header("Access-Control-Allow-Origin", "*") + } else if _, ok := allowSet[origin]; ok && origin != "" { + // Echo the specific allowed origin and allow credentialed requests. + c.Header("Access-Control-Allow-Origin", origin) + c.Header("Access-Control-Allow-Credentials", "true") + c.Header("Vary", "Origin") + } + // If the origin is not allowed, no Allow-Origin header is set and the + // browser blocks the cross-origin response. 
+ c.Header("Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS") c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization") diff --git a/api-server/go.mod b/api-server/go.mod index 5e13945..a7e009b 100644 --- a/api-server/go.mod +++ b/api-server/go.mod @@ -6,6 +6,7 @@ require ( github.com/gin-gonic/gin v1.9.1 github.com/golang-jwt/jwt/v5 v5.2.0 go.uber.org/zap v1.27.0 + golang.org/x/crypto v0.18.0 k8s.io/api v0.29.1 k8s.io/apimachinery v0.29.1 k8s.io/client-go v0.29.1 @@ -16,6 +17,7 @@ require ( github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/evanphx/json-patch v4.12.0+incompatible // indirect github.com/gabriel-vasile/mimetype v1.4.2 // indirect github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-logr/logr v1.4.1 // indirect @@ -45,12 +47,12 @@ require ( github.com/onsi/ginkgo/v2 v2.15.0 // indirect github.com/onsi/gomega v1.31.1 // indirect github.com/pelletier/go-toml/v2 v2.0.8 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.11 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/arch v0.3.0 // indirect - golang.org/x/crypto v0.18.0 // indirect golang.org/x/net v0.20.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect golang.org/x/sys v0.16.0 // indirect diff --git a/api-server/go.sum b/api-server/go.sum index 11d916a..c59e0ea 100644 --- a/api-server/go.sum +++ b/api-server/go.sum @@ -10,6 +10,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod 
h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= +github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= @@ -97,6 +99,8 @@ github.com/onsi/gomega v1.31.1 h1:KYppCUK+bUgAZwHOu7EXVBKyQA6ILvOESHkn/tgoqvo= github.com/onsi/gomega v1.31.1/go.mod h1:y40C95dwAD1Nz36SsEnxvfFe8FFfNxzI5eJ0EYGyAy0= github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ= github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= diff --git a/api-server/internal/config/config.go b/api-server/internal/config/config.go index 437324d..266320d 100644 --- a/api-server/internal/config/config.go +++ b/api-server/internal/config/config.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "strconv" + "strings" ) // Config holds the API server configuration @@ -21,19 +22,40 @@ type Config struct { // External services OpenCostURL string PrometheusURL string + + // CORSAllowedOrigins restricts cross-origin requests. Empty means allow all + // origins ("*"); set a comma-separated allowlist to tighten in production. 
+ CORSAllowedOrigins []string + + // Feature toggles + CapsuleEnabled bool + + // LeaderElectionEnabled gates the singleton scheduler behind a Kubernetes + // lease so it runs on exactly one replica. Disable for single-replica or + // out-of-cluster development. + LeaderElectionEnabled bool } +// Built-in development defaults that MUST NOT be used in production once auth is +// enabled — startup refuses to proceed if they are left unchanged. +const ( + defaultAdminPassword = "admin" + defaultJWTSecret = "bison-secret-key-change-in-production" +) + // Load reads configuration from environment variables func Load() (*Config, error) { cfg := &Config{ - Port: 8080, - Mode: "release", - AuthEnabled: false, - AdminUsername: "admin", - AdminPassword: "admin", - JWTSecret: "bison-secret-key-change-in-production", - OpenCostURL: "", - PrometheusURL: "", + Port: 8080, + Mode: "release", + AuthEnabled: false, + AdminUsername: "admin", + AdminPassword: defaultAdminPassword, + JWTSecret: defaultJWTSecret, + OpenCostURL: "", + PrometheusURL: "", + CapsuleEnabled: true, + LeaderElectionEnabled: true, } if port := os.Getenv("PORT"); port != "" { @@ -70,5 +92,42 @@ func Load() (*Config, error) { cfg.PrometheusURL = prometheusURL } + // Feature toggles + if capsuleEnabled := os.Getenv("CAPSULE_ENABLED"); capsuleEnabled == "false" { + cfg.CapsuleEnabled = false + } + if le := os.Getenv("LEADER_ELECTION_ENABLED"); le == "false" { + cfg.LeaderElectionEnabled = false + } + + // CORS allowlist (comma-separated origins). Empty -> allow all. 
+ if origins := os.Getenv("CORS_ALLOWED_ORIGINS"); origins != "" { + for _, o := range strings.Split(origins, ",") { + if o = strings.TrimSpace(o); o != "" { + cfg.CORSAllowedOrigins = append(cfg.CORSAllowedOrigins, o) + } + } + } + + if err := cfg.validate(); err != nil { + return nil, err + } + return cfg, nil } + +// validate refuses to start with insecure defaults once authentication is enabled, +// so a production deployment cannot accidentally run with the public default JWT +// signing key or the well-known "admin" password. +func (c *Config) validate() error { + if !c.AuthEnabled { + return nil + } + if c.JWTSecret == "" || c.JWTSecret == defaultJWTSecret { + return fmt.Errorf("refusing to start: JWT_SECRET must be set to a non-default value when AUTH_ENABLED=true") + } + if c.AdminPassword == "" || c.AdminPassword == defaultAdminPassword { + return fmt.Errorf("refusing to start: ADMIN_PASSWORD must be set to a non-default value when AUTH_ENABLED=true") + } + return nil +} diff --git a/api-server/internal/config/config_test.go b/api-server/internal/config/config_test.go new file mode 100644 index 0000000..f55d0bf --- /dev/null +++ b/api-server/internal/config/config_test.go @@ -0,0 +1,43 @@ +package config + +import "testing" + +func TestValidate(t *testing.T) { + cases := []struct { + name string + cfg Config + wantErr bool + }{ + { + name: "auth disabled allows defaults", + cfg: Config{AuthEnabled: false, JWTSecret: defaultJWTSecret, AdminPassword: defaultAdminPassword}, + }, + { + name: "auth enabled rejects default jwt secret", + cfg: Config{AuthEnabled: true, JWTSecret: defaultJWTSecret, AdminPassword: "strong-pass"}, + wantErr: true, + }, + { + name: "auth enabled rejects empty jwt secret", + cfg: Config{AuthEnabled: true, JWTSecret: "", AdminPassword: "strong-pass"}, + wantErr: true, + }, + { + name: "auth enabled rejects default password", + cfg: Config{AuthEnabled: true, JWTSecret: "a-real-secret", AdminPassword: defaultAdminPassword}, + wantErr: 
true, + }, + { + name: "auth enabled accepts strong values", + cfg: Config{AuthEnabled: true, JWTSecret: "a-real-secret", AdminPassword: "strong-pass"}, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := tc.cfg.validate() + if (err != nil) != tc.wantErr { + t.Fatalf("validate() err=%v, wantErr=%v", err, tc.wantErr) + } + }) + } +} diff --git a/api-server/internal/handler/auth.go b/api-server/internal/handler/auth.go index 1705883..3841bd9 100644 --- a/api-server/internal/handler/auth.go +++ b/api-server/internal/handler/auth.go @@ -1,8 +1,11 @@ package handler import ( + "crypto/subtle" "net/http" + "strconv" "strings" + "sync" "time" "github.com/gin-gonic/gin" @@ -11,12 +14,79 @@ import ( "github.com/bison/api-server/pkg/logger" ) +// Login brute-force protection: after maxLoginFails failed attempts from one IP +// within loginWindow, that IP is locked out for loginBlock. +const ( + maxLoginFails = 5 + loginWindow = 5 * time.Minute + loginBlock = 15 * time.Minute +) + +type failRecord struct { + count int + resetAt time.Time + blockedUntil time.Time +} + +// loginLimiter is a small in-memory per-IP failed-login limiter. +type loginLimiter struct { + mu sync.Mutex + fails map[string]*failRecord +} + +func newLoginLimiter() *loginLimiter { + return &loginLimiter{fails: make(map[string]*failRecord)} +} + +// allowed reports whether the IP may attempt a login now; if blocked it returns +// the number of seconds to wait. +func (l *loginLimiter) allowed(ip string, now time.Time) (bool, int) { + l.mu.Lock() + defer l.mu.Unlock() + rec := l.fails[ip] + if rec != nil && now.Before(rec.blockedUntil) { + return false, int(rec.blockedUntil.Sub(now).Seconds()) + 1 + } + return true, 0 +} + +// recordFailure increments the failure counter for an IP and blocks it once the +// threshold within the window is reached. 
+func (l *loginLimiter) recordFailure(ip string, now time.Time) { + l.mu.Lock() + defer l.mu.Unlock() + rec := l.fails[ip] + if rec == nil || now.After(rec.resetAt) { + rec = &failRecord{resetAt: now.Add(loginWindow)} + l.fails[ip] = rec + } + rec.count++ + if rec.count >= maxLoginFails { + rec.blockedUntil = now.Add(loginBlock) + } + // Opportunistic prune to bound memory. + if len(l.fails) > 1024 { + for k, v := range l.fails { + if now.After(v.resetAt) && now.After(v.blockedUntil) { + delete(l.fails, k) + } + } + } +} + +func (l *loginLimiter) recordSuccess(ip string) { + l.mu.Lock() + defer l.mu.Unlock() + delete(l.fails, ip) +} + // AuthHandler handles authentication type AuthHandler struct { username string password string jwtSecret []byte enabled bool + limiter *loginLimiter } // NewAuthHandler creates a new AuthHandler @@ -26,6 +96,7 @@ func NewAuthHandler(username, password, jwtSecret string, enabled bool) *AuthHan password: password, jwtSecret: []byte(jwtSecret), enabled: enabled, + limiter: newLoginLimiter(), } } @@ -44,6 +115,17 @@ type LoginResponse struct { // Login handles user login func (h *AuthHandler) Login(c *gin.Context) { + ip := c.ClientIP() + now := time.Now() + + // Reject brute-force attempts before doing any credential work. 
+ if ok, retryAfter := h.limiter.allowed(ip, now); !ok { + logger.Warn("Login blocked: too many failed attempts", "ip", ip, "retryAfterSec", retryAfter) + c.Header("Retry-After", strconv.Itoa(retryAfter)) + c.JSON(http.StatusTooManyRequests, gin.H{"error": "登录尝试过于频繁,请稍后再试", "code": "TOO_MANY_ATTEMPTS"}) + return + } + var req LoginRequest if err := c.ShouldBindJSON(&req); err != nil { logger.Warn("Login failed: invalid request", "error", err) @@ -51,12 +133,17 @@ func (h *AuthHandler) Login(c *gin.Context) { return } - // Validate credentials - if req.Username != h.username || req.Password != h.password { - logger.Warn("Login failed: invalid credentials", "username", req.Username) + // Validate credentials using constant-time comparison to avoid leaking timing + // information. Both comparisons always run so username validity is not revealed. + userOK := subtle.ConstantTimeCompare([]byte(req.Username), []byte(h.username)) == 1 + passOK := subtle.ConstantTimeCompare([]byte(req.Password), []byte(h.password)) == 1 + if !userOK || !passOK { + h.limiter.recordFailure(ip, now) + logger.Warn("Login failed: invalid credentials", "username", req.Username, "ip", ip) c.JSON(http.StatusUnauthorized, gin.H{"error": "用户名或密码错误", "code": "INVALID_CREDENTIALS"}) return } + h.limiter.recordSuccess(ip) // Generate JWT token expiresAt := time.Now().Add(24 * time.Hour) diff --git a/api-server/internal/handler/auth_test.go b/api-server/internal/handler/auth_test.go new file mode 100644 index 0000000..3550864 --- /dev/null +++ b/api-server/internal/handler/auth_test.go @@ -0,0 +1,66 @@ +package handler + +import ( + "testing" + "time" +) + +func TestLoginLimiterBlocksAfterMaxFails(t *testing.T) { + l := newLoginLimiter() + now := time.Now() + ip := "1.2.3.4" + + for i := 0; i < maxLoginFails-1; i++ { + l.recordFailure(ip, now) + if ok, _ := l.allowed(ip, now); !ok { + t.Fatalf("blocked too early after %d fails", i+1) + } + } + // The maxLoginFails-th failure triggers the block. 
+ l.recordFailure(ip, now) + ok, retry := l.allowed(ip, now) + if ok { + t.Fatal("expected block after maxLoginFails failures") + } + if retry <= 0 { + t.Fatalf("expected positive Retry-After, got %d", retry) + } + + // Block clears after loginBlock elapses. + if ok, _ := l.allowed(ip, now.Add(loginBlock+time.Second)); !ok { + t.Fatal("expected unblock after loginBlock elapsed") + } +} + +func TestLoginLimiterSuccessResets(t *testing.T) { + l := newLoginLimiter() + now := time.Now() + ip := "5.6.7.8" + + for i := 0; i < maxLoginFails; i++ { + l.recordFailure(ip, now) + } + if ok, _ := l.allowed(ip, now); ok { + t.Fatal("expected block") + } + // recordSuccess deletes the IP's failure record, which also lifts the active block. + l.recordSuccess(ip) + if ok, _ := l.allowed(ip, now); !ok { + t.Fatal("expected allowed after recordSuccess") + } +} + +func TestLoginLimiterWindowResets(t *testing.T) { + l := newLoginLimiter() + now := time.Now() + ip := "9.9.9.9" + + // A few failures, then let the window expire before reaching the threshold. 
+ l.recordFailure(ip, now) + l.recordFailure(ip, now) + later := now.Add(loginWindow + time.Second) + l.recordFailure(ip, later) // counter resets, so this is failure #1 in a new window + if ok, _ := l.allowed(ip, later); !ok { + t.Fatal("expected allowed; window should have reset the counter") + } +} diff --git a/api-server/internal/handler/config_transfer.go b/api-server/internal/handler/config_transfer.go new file mode 100644 index 0000000..c101a72 --- /dev/null +++ b/api-server/internal/handler/config_transfer.go @@ -0,0 +1,116 @@ +package handler + +import ( + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + "github.com/gin-gonic/gin" + + "github.com/bison/api-server/internal/service" + "github.com/bison/api-server/pkg/logger" +) + +// ConfigTransferHandler handles configuration import/export requests +type ConfigTransferHandler struct { + configTransferSvc *service.ConfigTransferService +} + +// NewConfigTransferHandler creates a new ConfigTransferHandler +func NewConfigTransferHandler(svc *service.ConfigTransferService) *ConfigTransferHandler { + return &ConfigTransferHandler{ + configTransferSvc: svc, + } +} + +// ExportConfig exports configuration as a JSON file download +func (h *ConfigTransferHandler) ExportConfig(c *gin.Context) { + sectionsParam := c.DefaultQuery("sections", strings.Join(service.AllSections, ",")) + includeSensitive := c.DefaultQuery("includeSensitive", "false") == "true" + + sections := strings.Split(sectionsParam, ",") + for i := range sections { + sections[i] = strings.TrimSpace(sections[i]) + } + + operator := "admin" + if username, exists := c.Get("username"); exists { + operator = username.(string) + } + + config, err := h.configTransferSvc.Export(c.Request.Context(), sections, includeSensitive, operator) + if err != nil { + logger.Error("Failed to export config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + data, err := json.MarshalIndent(config, "", " ") + if 
err != nil { + logger.Error("Failed to marshal export config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "序列化配置失败"}) + return + } + + filename := fmt.Sprintf("bison-config-%s.json", time.Now().Format("20060102-150405")) + c.Header("Content-Type", "application/json") + c.Header("Content-Disposition", fmt.Sprintf("attachment; filename=%s", filename)) + c.Data(http.StatusOK, "application/json", data) +} + +// PreviewImport validates and previews an import configuration +func (h *ConfigTransferHandler) PreviewImport(c *gin.Context) { + var config service.ExportConfig + if err := c.ShouldBindJSON(&config); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "无效的 JSON 格式: " + err.Error()}) + return + } + + if config.Version == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "缺少 version 字段"}) + return + } + if config.Sections == nil || len(config.Sections) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "缺少 sections 字段"}) + return + } + + result, err := h.configTransferSvc.Preview(c.Request.Context(), &config) + if err != nil { + logger.Error("Failed to preview import", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, result) +} + +// ApplyImport applies the imported configuration +func (h *ConfigTransferHandler) ApplyImport(c *gin.Context) { + var req service.ImportRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "无效的请求格式: " + err.Error()}) + return + } + + if len(req.Sections) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "请选择至少一个配置模块"}) + return + } + + if req.Config.Version == "" || req.Config.Sections == nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "无效的配置数据"}) + return + } + + result, err := h.configTransferSvc.Apply(c.Request.Context(), &req) + if err != nil { + logger.Error("Failed to apply import", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": 
err.Error()}) + return + } + + c.JSON(http.StatusOK, result) +} diff --git a/api-server/internal/handler/onboarding.go b/api-server/internal/handler/onboarding.go new file mode 100644 index 0000000..d33b051 --- /dev/null +++ b/api-server/internal/handler/onboarding.go @@ -0,0 +1,276 @@ +package handler + +import ( + "net/http" + + "github.com/gin-gonic/gin" + + "github.com/bison/api-server/internal/service" + "github.com/bison/api-server/pkg/logger" +) + +// OnboardingHandler handles node onboarding requests +type OnboardingHandler struct { + onboardingSvc *service.OnboardingService + initScriptSvc *service.InitScriptService +} + +// NewOnboardingHandler creates a new OnboardingHandler +func NewOnboardingHandler(onboardingSvc *service.OnboardingService, initScriptSvc *service.InitScriptService) *OnboardingHandler { + return &OnboardingHandler{ + onboardingSvc: onboardingSvc, + initScriptSvc: initScriptSvc, + } +} + +// StartOnboarding starts a new node onboarding job +// POST /api/v1/nodes/onboard +func (h *OnboardingHandler) StartOnboarding(c *gin.Context) { + var req service.OnboardingRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + job, err := h.onboardingSvc.StartOnboarding(c.Request.Context(), &req) + if err != nil { + logger.Error("Failed to start onboarding", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusAccepted, job) +} + +// GetOnboardingJob returns a specific onboarding job +// GET /api/v1/nodes/onboard/:jobId +func (h *OnboardingHandler) GetOnboardingJob(c *gin.Context) { + jobID := c.Param("jobId") + + job, err := h.onboardingSvc.GetJob(c.Request.Context(), jobID) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, job) +} + +// ListOnboardingJobs returns all onboarding jobs +// GET /api/v1/nodes/onboard +func (h *OnboardingHandler) 
ListOnboardingJobs(c *gin.Context) { + jobs, err := h.onboardingSvc.ListJobs(c.Request.Context()) + if err != nil { + logger.Error("Failed to list onboarding jobs", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"items": jobs}) +} + +// CancelOnboardingJob cancels a running onboarding job +// DELETE /api/v1/nodes/onboard/:jobId +func (h *OnboardingHandler) CancelOnboardingJob(c *gin.Context) { + jobID := c.Param("jobId") + + err := h.onboardingSvc.CancelJob(c.Request.Context(), jobID) + if err != nil { + logger.Error("Failed to cancel onboarding job", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Job cancelled"}) +} + +// GetControlPlaneConfig returns the control plane configuration +// GET /api/v1/settings/control-plane +func (h *OnboardingHandler) GetControlPlaneConfig(c *gin.Context) { + config, err := h.initScriptSvc.GetControlPlaneConfig(c.Request.Context()) + if err != nil { + logger.Error("Failed to get control plane config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + // Mask sensitive data + response := gin.H{ + "host": config.Host, + "sshPort": config.SSHPort, + "sshUser": config.SSHUser, + "authMethod": config.AuthMethod, + "hasPassword": config.Password != "", + "hasPrivateKey": config.PrivateKey != "", + } + + c.JSON(http.StatusOK, response) +} + +// UpdateControlPlaneConfig updates the control plane configuration +// PUT /api/v1/settings/control-plane +func (h *OnboardingHandler) UpdateControlPlaneConfig(c *gin.Context) { + var config service.ControlPlaneConfig + if err := c.ShouldBindJSON(&config); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + // Get existing config to preserve credentials if not provided + existing, _ := 
h.initScriptSvc.GetControlPlaneConfig(c.Request.Context()) + if existing != nil { + if config.Password == "" && existing.Password != "" { + config.Password = existing.Password + } + if config.PrivateKey == "" && existing.PrivateKey != "" { + config.PrivateKey = existing.PrivateKey + } + } + + err := h.initScriptSvc.SaveControlPlaneConfig(c.Request.Context(), &config) + if err != nil { + logger.Error("Failed to save control plane config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Control plane configuration saved"}) +} + +// TestControlPlaneConnection tests the control plane SSH connection +// POST /api/v1/settings/control-plane/test +func (h *OnboardingHandler) TestControlPlaneConnection(c *gin.Context) { + err := h.onboardingSvc.TestControlPlaneConnection(c.Request.Context()) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Connection successful"}) +} + +// ListInitScripts returns all init script groups +// GET /api/v1/settings/init-scripts +func (h *OnboardingHandler) ListInitScripts(c *gin.Context) { + groups, err := h.initScriptSvc.GetAllScriptGroups(c.Request.Context()) + if err != nil { + logger.Error("Failed to list init scripts", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"items": groups}) +} + +// GetInitScript returns a specific init script group +// GET /api/v1/settings/init-scripts/:id +func (h *OnboardingHandler) GetInitScript(c *gin.Context) { + id := c.Param("id") + + group, err := h.initScriptSvc.GetScriptGroup(c.Request.Context(), id) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, group) +} + +// CreateInitScript creates a new init script group +// POST /api/v1/settings/init-scripts +func (h *OnboardingHandler) 
CreateInitScript(c *gin.Context) { + var group service.ScriptGroup + if err := c.ShouldBindJSON(&group); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := h.initScriptSvc.CreateScriptGroup(c.Request.Context(), &group) + if err != nil { + logger.Error("Failed to create init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusCreated, group) +} + +// UpdateInitScript updates an init script group +// PUT /api/v1/settings/init-scripts/:id +func (h *OnboardingHandler) UpdateInitScript(c *gin.Context) { + id := c.Param("id") + + var group service.ScriptGroup + if err := c.ShouldBindJSON(&group); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := h.initScriptSvc.UpdateScriptGroup(c.Request.Context(), id, &group) + if err != nil { + logger.Error("Failed to update init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, group) +} + +// DeleteInitScript deletes an init script group +// DELETE /api/v1/settings/init-scripts/:id +func (h *OnboardingHandler) DeleteInitScript(c *gin.Context) { + id := c.Param("id") + + err := h.initScriptSvc.DeleteScriptGroup(c.Request.Context(), id) + if err != nil { + logger.Error("Failed to delete init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Script group deleted"}) +} + +// ToggleInitScript enables or disables an init script group +// PUT /api/v1/settings/init-scripts/:id/toggle +func (h *OnboardingHandler) ToggleInitScript(c *gin.Context) { + id := c.Param("id") + + var req struct { + Enabled bool `json:"enabled"` + } + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := 
h.initScriptSvc.ToggleScriptGroup(c.Request.Context(), id, req.Enabled) + if err != nil { + logger.Error("Failed to toggle init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Script group toggled"}) +} + +// ReorderInitScripts updates the order of init script groups +// PUT /api/v1/settings/init-scripts/reorder +func (h *OnboardingHandler) ReorderInitScripts(c *gin.Context) { + var req struct { + IDs []string `json:"ids"` + } + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := h.initScriptSvc.ReorderScriptGroups(c.Request.Context(), req.IDs) + if err != nil { + logger.Error("Failed to reorder init scripts", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Script groups reordered"}) +} diff --git a/api-server/internal/handler/settings.go b/api-server/internal/handler/settings.go index ccdf5cb..7419710 100644 --- a/api-server/internal/handler/settings.go +++ b/api-server/internal/handler/settings.go @@ -32,8 +32,17 @@ func (h *SettingsHandler) GetSettings(c *gin.Context) { func (h *SettingsHandler) GetNodeMetrics(c *gin.Context) { nodeName := c.Param("name") hours, _ := strconv.Atoi(c.DefaultQuery("hours", "24")) + hasGpu := c.DefaultQuery("hasGpu", "false") == "true" + hasNpu := c.DefaultQuery("hasNpu", "false") == "true" + + req := service.NodeMetricsRequest{ + NodeName: nodeName, + Hours: hours, + HasGpu: hasGpu, + HasNpu: hasNpu, + } - metrics, err := h.settingsSvc.GetNodeMetrics(c.Request.Context(), nodeName, hours) + metrics, err := h.settingsSvc.GetNodeMetrics(c.Request.Context(), req) if err != nil { logger.Error("Failed to get node metrics", "node", nodeName, "error", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) diff --git 
a/api-server/internal/handler/team.go b/api-server/internal/handler/team.go index e24a457..8ca3594 100644 --- a/api-server/internal/handler/team.go +++ b/api-server/internal/handler/team.go @@ -34,17 +34,9 @@ func (h *TeamHandler) ListTeams(c *gin.Context) { return } - // Enrich with usage data if cost service is enabled - if h.costSvc.IsEnabled() { - window := c.DefaultQuery("window", "7d") - for _, team := range teams { - usage, _ := h.costSvc.GetTeamUsageByName(c.Request.Context(), team.Name, window) - if usage != nil { - // Add usage info (could extend Team struct or return separately) - _ = usage - } - } - } + // Per-team usage is fetched on demand by the team detail / dashboard endpoints, + // not here: the previous enrichment loop issued one OpenCost query per team and + // then discarded the result, scaling cost linearly with team count for nothing. c.JSON(http.StatusOK, gin.H{"items": teams}) } diff --git a/api-server/internal/k8s/client.go b/api-server/internal/k8s/client.go index 59fb7c8..07925fa 100644 --- a/api-server/internal/k8s/client.go +++ b/api-server/internal/k8s/client.go @@ -24,10 +24,24 @@ import ( // Client wraps Kubernetes client operations type Client struct { - clientset *kubernetes.Clientset + clientset kubernetes.Interface dynamicClient dynamic.Interface } +// NewClientWithInterfaces builds a Client from pre-constructed clients. +// It is primarily used to inject fake clients in unit tests. +func NewClientWithInterfaces(clientset kubernetes.Interface, dynamicClient dynamic.Interface) *Client { + return &Client{ + clientset: clientset, + dynamicClient: dynamicClient, + } +} + +// Clientset exposes the underlying typed client (used e.g. for leader election). 
+func (c *Client) Clientset() kubernetes.Interface { + return c.clientset +} + // NewClient creates a new Kubernetes client func NewClient() (*Client, error) { var config *rest.Config @@ -52,6 +66,16 @@ func NewClient() (*Client, error) { logger.Info("Using in-cluster config") } + // Raise client-side rate limits well above the client-go default (5 QPS / 10 burst) + // so dashboard and billing list bursts are not serialized behind the throttler. + // Only override when the loaded config has not set explicit limits. + if config.QPS == 0 { + config.QPS = 50 + } + if config.Burst == 0 { + config.Burst = 100 + } + clientset, err := kubernetes.NewForConfig(config) if err != nil { logger.Error("Failed to create clientset", "error", err) diff --git a/api-server/internal/leader/leader.go b/api-server/internal/leader/leader.go new file mode 100644 index 0000000..35b9e21 --- /dev/null +++ b/api-server/internal/leader/leader.go @@ -0,0 +1,98 @@ +// Package leader provides Kubernetes lease-based leader election so that +// singleton background work (the billing/auto-recharge/alert scheduler) runs on +// exactly one api-server replica at a time, even when scaled horizontally. +package leader + +import ( + "context" + "os" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" + + "github.com/bison/api-server/pkg/logger" +) + +// LeaseName is the coordination.k8s.io Lease used to elect the scheduler leader. +const LeaseName = "bison-scheduler" + +// identity returns a per-process identity. In Kubernetes the pod name is the +// hostname, which is unique per replica; POD_NAME overrides it when set. 
+func identity() string { + if v := os.Getenv("POD_NAME"); v != "" { + return v + } + if h, err := os.Hostname(); err == nil && h != "" { + return h + } + return "bison-api" +} + +func namespace(def string) string { + if v := os.Getenv("POD_NAMESPACE"); v != "" { + return v + } + return def +} + +// Run blocks running leader election until ctx is cancelled. +// +// onStarted is invoked (in its own goroutine) with a context that is cancelled +// when leadership is lost or ctx is cancelled; it should start the leader-only +// work and return promptly when its context is done. onStopped is invoked when +// leadership is lost. The scheduler must be re-startable, since leadership can be +// re-acquired after a transient loss. +func Run(ctx context.Context, clientset kubernetes.Interface, ns string, onStarted func(context.Context), onStopped func()) { + id := identity() + leaseNS := namespace(ns) + + lock := &resourcelock.LeaseLock{ + LeaseMeta: metav1.ObjectMeta{Name: LeaseName, Namespace: leaseNS}, + Client: clientset.CoordinationV1(), + LockConfig: resourcelock.ResourceLockConfig{Identity: id}, + } + + logger.Info("Starting leader election", "identity", id, "namespace", leaseNS, "lease", LeaseName) + + config := leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: 15 * time.Second, + RenewDeadline: 10 * time.Second, + RetryPeriod: 2 * time.Second, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(leaderCtx context.Context) { + logger.Info("Acquired scheduler leadership", "identity", id) + onStarted(leaderCtx) + }, + OnStoppedLeading: func() { + logger.Warn("Lost scheduler leadership", "identity", id) + onStopped() + }, + OnNewLeader: func(current string) { + if current != id { + logger.Info("Observed scheduler leader", "leader", current) + } + }, + }, + } + + // RunOrDie returns when ctx is cancelled or leadership is lost. Loop so a + // transient loss leads to re-election rather than permanently idle. 
+ for { + select { + case <-ctx.Done(): + return + default: + } + leaderelection.RunOrDie(ctx, config) + select { + case <-ctx.Done(): + return + case <-time.After(2 * time.Second): + } + } +} diff --git a/api-server/internal/opencost/cache.go b/api-server/internal/opencost/cache.go new file mode 100644 index 0000000..9bd6e8d --- /dev/null +++ b/api-server/internal/opencost/cache.go @@ -0,0 +1,58 @@ +package opencost + +import ( + "sync" + "time" +) + +type allocCacheEntry struct { + ready chan struct{} // closed when val/err are populated + val []Allocation + err error + expiry time.Time // guarded by allocCache.mu; zero while in-flight +} + +// allocCache is a small TTL cache that also coalesces concurrent identical +// allocation queries, so a burst of dashboard/billing requests for the same +// window/aggregate/filter hits OpenCost once instead of once per caller. +type allocCache struct { + ttl time.Duration + mu sync.Mutex + entries map[string]*allocCacheEntry +} + +func newAllocCache(ttl time.Duration) *allocCache { + return &allocCache{ttl: ttl, entries: make(map[string]*allocCacheEntry)} +} + +// do returns a cached result if fresh, joins an in-flight fetch for the same key, +// or runs fetch once and caches the (successful) result for ttl. Errors are not +// cached so the next caller retries. +func (c *allocCache) do(key string, fetch func() ([]Allocation, error)) ([]Allocation, error) { + c.mu.Lock() + if e := c.entries[key]; e != nil && (e.expiry.IsZero() || time.Now().Before(e.expiry)) { + // In-flight (zero expiry) or still-fresh cached result: reuse it. + c.mu.Unlock() + <-e.ready + return e.val, e.err + } + e := &allocCacheEntry{ready: make(chan struct{})} + c.entries[key] = e + c.mu.Unlock() + + e.val, e.err = fetch() + + c.mu.Lock() + if e.err != nil { + // Do not cache failures; drop so the next caller retries. 
+ if c.entries[key] == e { + delete(c.entries, key) + } + } else { + e.expiry = time.Now().Add(c.ttl) + } + c.mu.Unlock() + + close(e.ready) + return e.val, e.err +} diff --git a/api-server/internal/opencost/cache_test.go b/api-server/internal/opencost/cache_test.go new file mode 100644 index 0000000..d0fac3f --- /dev/null +++ b/api-server/internal/opencost/cache_test.go @@ -0,0 +1,78 @@ +package opencost + +import ( + "errors" + "sync" + "sync/atomic" + "testing" + "time" +) + +func TestAllocCacheCoalescesConcurrentCalls(t *testing.T) { + c := newAllocCache(time.Minute) + var calls int32 + + fetch := func() ([]Allocation, error) { + atomic.AddInt32(&calls, 1) + time.Sleep(40 * time.Millisecond) // hold the in-flight window open + return []Allocation{{Name: "ns"}}, nil + } + + const n = 12 + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func() { + defer wg.Done() + res, err := c.do("k", fetch) + if err != nil || len(res) != 1 { + t.Errorf("unexpected result: %v %v", res, err) + } + }() + } + wg.Wait() + + if got := atomic.LoadInt32(&calls); got != 1 { + t.Fatalf("expected 1 underlying fetch (coalesced), got %d", got) + } +} + +func TestAllocCacheTTL(t *testing.T) { + c := newAllocCache(40 * time.Millisecond) + var calls int32 + fetch := func() ([]Allocation, error) { + atomic.AddInt32(&calls, 1) + return nil, nil + } + + _, _ = c.do("k", fetch) + _, _ = c.do("k", fetch) // within TTL -> cached + if got := atomic.LoadInt32(&calls); got != 1 { + t.Fatalf("expected 1 fetch within TTL, got %d", got) + } + + time.Sleep(60 * time.Millisecond) // let it expire + _, _ = c.do("k", fetch) + if got := atomic.LoadInt32(&calls); got != 2 { + t.Fatalf("expected re-fetch after TTL, got %d", got) + } +} + +func TestAllocCacheDoesNotCacheErrors(t *testing.T) { + c := newAllocCache(time.Minute) + var calls int32 + fetch := func() ([]Allocation, error) { + atomic.AddInt32(&calls, 1) + return nil, errors.New("boom") + } + + if _, err := c.do("k", fetch); err == nil 
{ + t.Fatal("expected error") + } + if _, err := c.do("k", fetch); err == nil { + t.Fatal("expected error on retry") + } + if got := atomic.LoadInt32(&calls); got != 2 { + t.Fatalf("errors must not be cached; expected 2 fetches, got %d", got) + } +} diff --git a/api-server/internal/opencost/client.go b/api-server/internal/opencost/client.go index 25698f7..5a057e7 100644 --- a/api-server/internal/opencost/client.go +++ b/api-server/internal/opencost/client.go @@ -16,6 +16,7 @@ import ( type Client struct { baseURL string httpClient *http.Client + cache *allocCache } // NewClient creates a new OpenCost client @@ -25,6 +26,9 @@ func NewClient(baseURL string) *Client { httpClient: &http.Client{ Timeout: 30 * time.Second, }, + // Short TTL: dashboard/billing repeatedly query the same windows; 30s keeps + // data near-real-time while collapsing duplicate concurrent queries. + cache: newAllocCache(30 * time.Second), } } @@ -109,12 +113,20 @@ func (c *Client) GetAllocationForNamespace(ctx context.Context, window, namespac return c.getAllocation(ctx, window, "namespace", fmt.Sprintf("namespace:\"%s\"", namespace)) } -// getAllocation is the internal method to query allocations +// getAllocation queries allocations through a short-TTL coalescing cache so that +// concurrent dashboard/billing requests for the same window hit OpenCost once. func (c *Client) getAllocation(ctx context.Context, window, aggregate, filter string) ([]Allocation, error) { if !c.IsEnabled() { return nil, fmt.Errorf("opencost not configured") } + key := window + "|" + aggregate + "|" + filter + return c.cache.do(key, func() ([]Allocation, error) { + return c.fetchAllocation(ctx, window, aggregate, filter) + }) +} +// fetchAllocation performs the actual OpenCost HTTP query (uncached). 
+func (c *Client) fetchAllocation(ctx context.Context, window, aggregate, filter string) ([]Allocation, error) { // Build URL params := url.Values{} params.Set("window", window) diff --git a/api-server/internal/scheduler/scheduler.go b/api-server/internal/scheduler/scheduler.go index f7ab8d4..7996386 100644 --- a/api-server/internal/scheduler/scheduler.go +++ b/api-server/internal/scheduler/scheduler.go @@ -2,6 +2,7 @@ package scheduler import ( "context" + "math/rand" "sync" "time" @@ -18,8 +19,10 @@ type Scheduler struct { executions []service.TaskExecution executionsMu sync.RWMutex - stopCh chan struct{} - wg sync.WaitGroup + mu sync.Mutex + started bool + stopCh chan struct{} + wg sync.WaitGroup } // NewScheduler creates a new Scheduler @@ -33,31 +36,41 @@ func NewScheduler( balanceSvc: balanceSvc, alertSvc: alertSvc, executions: make([]service.TaskExecution, 0), - stopCh: make(chan struct{}), } } -// Start starts all scheduled tasks +// Start starts all scheduled tasks. It is idempotent and re-startable: calling +// Start after a Stop (e.g. when leadership is re-acquired) spins up a fresh set +// of tasks against a new stop channel. func (s *Scheduler) Start(ctx context.Context) { + s.mu.Lock() + defer s.mu.Unlock() + if s.started { + return + } + s.started = true + s.stopCh = make(chan struct{}) logger.Info("Starting scheduler") - // Start billing task (every hour) - s.wg.Add(1) + s.wg.Add(3) go s.runBillingTask(ctx) - - // Start auto-recharge task (every hour) - s.wg.Add(1) go s.runAutoRechargeTask(ctx) - - // Start alert check task (every 15 minutes) - s.wg.Add(1) go s.runAlertTask(ctx) } -// Stop stops all scheduled tasks +// Stop stops all scheduled tasks and waits for them to exit. Safe to call when +// not started. 
func (s *Scheduler) Stop() { - logger.Info("Stopping scheduler") + s.mu.Lock() + if !s.started { + s.mu.Unlock() + return + } + s.started = false close(s.stopCh) + s.mu.Unlock() + + logger.Info("Stopping scheduler") s.wg.Wait() } @@ -87,18 +100,50 @@ func (s *Scheduler) GetExecutions(limit int) []service.TaskExecution { return result } +// safeExecute runs a task body with panic recovery so a single failing task can +// never take down the whole api-server process. +func (s *Scheduler) safeExecute(name string, fn func()) { + defer func() { + if r := recover(); r != nil { + logger.Error("Scheduled task panicked and was recovered", "task", name, "panic", r) + } + }() + fn() +} + +// sleepWithJitter waits a random duration in [0, max) to desynchronize task +// firing across replicas, returning false if the scheduler is stopped meanwhile. +func (s *Scheduler) sleepWithJitter(stopCh <-chan struct{}, max time.Duration) bool { + if max <= 0 { + return true + } + timer := time.NewTimer(time.Duration(rand.Int63n(int64(max)))) + defer timer.Stop() + select { + case <-stopCh: + return false + case <-timer.C: + return true + } +} + func (s *Scheduler) runBillingTask(ctx context.Context) { defer s.wg.Done() + stopCh := s.stopCh + + if !s.sleepWithJitter(stopCh, 60*time.Second) { + return + } ticker := time.NewTicker(1 * time.Hour) defer ticker.Stop() for { select { - case <-s.stopCh: + case <-stopCh: return case <-ticker.C: - s.executeBillingTask(ctx) + s.safeExecute("billing", func() { s.executeBillingTask(ctx) }) } } } @@ -129,16 +174,21 @@ func (s *Scheduler) executeBillingTask(ctx context.Context) { func (s *Scheduler) runAutoRechargeTask(ctx context.Context) { defer s.wg.Done() + stopCh := s.stopCh + + if !s.sleepWithJitter(stopCh, 60*time.Second) { + return + } ticker := time.NewTicker(1 * time.Hour) defer ticker.Stop() for { select { - case <-s.stopCh: + case <-stopCh: return case <-ticker.C: - s.executeAutoRechargeTask(ctx) + s.safeExecute("auto_recharge", func() { 
s.executeAutoRechargeTask(ctx) }) } } } @@ -169,16 +219,21 @@ func (s *Scheduler) executeAutoRechargeTask(ctx context.Context) { func (s *Scheduler) runAlertTask(ctx context.Context) { defer s.wg.Done() + stopCh := s.stopCh + + if !s.sleepWithJitter(stopCh, 30*time.Second) { + return + } ticker := time.NewTicker(15 * time.Minute) defer ticker.Stop() for { select { - case <-s.stopCh: + case <-stopCh: return case <-ticker.C: - s.executeAlertTask(ctx) + s.safeExecute("alert_check", func() { s.executeAlertTask(ctx) }) } } } diff --git a/api-server/internal/scheduler/scheduler_test.go b/api-server/internal/scheduler/scheduler_test.go new file mode 100644 index 0000000..b5f2c44 --- /dev/null +++ b/api-server/internal/scheduler/scheduler_test.go @@ -0,0 +1,45 @@ +package scheduler + +import ( + "context" + "os" + "testing" + "time" + + "github.com/bison/api-server/pkg/logger" +) + +func TestMain(m *testing.M) { + logger.Init(false) + os.Exit(m.Run()) +} + +// TestSchedulerRestartable verifies the scheduler can be stopped and started +// again (required for leader-election re-acquisition) and that Start/Stop are +// idempotent and do not deadlock. 
+func TestSchedulerRestartable(t *testing.T) { + s := NewScheduler(nil, nil, nil) + ctx := context.Background() + + done := make(chan struct{}) + go func() { + s.Start(ctx) + s.Start(ctx) // idempotent: second Start is a no-op + s.Stop() + s.Stop() // idempotent: second Stop is a no-op + s.Start(ctx) // re-startable after Stop + s.Stop() + close(done) + }() + + select { + case <-done: + case <-time.After(5 * time.Second): + t.Fatal("Start/Stop deadlocked") + } +} + +func TestStopBeforeStartIsSafe(t *testing.T) { + s := NewScheduler(nil, nil, nil) + s.Stop() // must not panic or block when never started +} diff --git a/api-server/internal/service/balance_service.go b/api-server/internal/service/balance_service.go index c07b463..5aecd1b 100644 --- a/api-server/internal/service/balance_service.go +++ b/api-server/internal/service/balance_service.go @@ -10,6 +10,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" "github.com/bison/api-server/internal/k8s" "github.com/bison/api-server/pkg/logger" @@ -120,7 +121,9 @@ func (s *BalanceService) GetAllBalances(ctx context.Context) ([]*Balance, error) return balances, nil } -// Recharge adds balance to a team +// Recharge adds balance to a team. The read-modify-write is performed under +// optimistic-concurrency retry so concurrent recharges/deductions on the shared +// balances ConfigMap cannot silently lose updates. 
func (s *BalanceService) Recharge(ctx context.Context, teamName string, amount float64, operator, remark string) error { logger.Info("Recharging team", "team", teamName, "amount", amount, "operator", operator) @@ -128,18 +131,11 @@ func (s *BalanceService) Recharge(ctx context.Context, teamName string, amount f return fmt.Errorf("recharge amount must be positive") } - // Get current balance - balance, err := s.GetBalance(ctx, teamName) + newAmount, err := s.applyBalanceDelta(ctx, teamName, amount) if err != nil { return err } - // Update balance - newAmount := balance.Amount + amount - if err := s.updateBalance(ctx, teamName, newAmount); err != nil { - return err - } - // Record history record := &RechargeRecord{ ID: fmt.Sprintf("%d", time.Now().UnixNano()), @@ -154,24 +150,19 @@ func (s *BalanceService) Recharge(ctx context.Context, teamName string, amount f return s.addRechargeRecord(ctx, teamName, record) } -// Deduct deducts balance from a team -func (s *BalanceService) Deduct(ctx context.Context, teamName string, amount float64, reason string) error { +// Deduct deducts balance from a team (negative balances are allowed) and returns +// the balance AFTER the write, so callers do not need a second racy read to decide +// suspension. The update is conflict-retried for concurrency safety. 
+func (s *BalanceService) Deduct(ctx context.Context, teamName string, amount float64, reason string) (float64, error) { logger.Info("Deducting from team", "team", teamName, "amount", amount, "reason", reason) if amount <= 0 { - return fmt.Errorf("deduction amount must be positive") + return 0, fmt.Errorf("deduction amount must be positive") } - // Get current balance - balance, err := s.GetBalance(ctx, teamName) + newAmount, err := s.applyBalanceDelta(ctx, teamName, -amount) if err != nil { - return err - } - - // Update balance (allow negative balance) - newAmount := balance.Amount - amount - if err := s.updateBalance(ctx, teamName, newAmount); err != nil { - return err + return 0, err } // Record history @@ -185,7 +176,10 @@ func (s *BalanceService) Deduct(ctx context.Context, teamName string, amount flo Balance: newAmount, } - return s.addRechargeRecord(ctx, teamName, record) + if err := s.addRechargeRecord(ctx, teamName, record); err != nil { + return newAmount, err + } + return newAmount, nil } // GetRechargeHistory returns recharge/deduction history for a team @@ -297,18 +291,16 @@ func (s *BalanceService) ProcessAutoRecharge(ctx context.Context) error { continue } - logger.Info("Executing auto-recharge", "team", teamName, "amount", config.Amount) - - // Get current balance - balance, err := s.GetBalance(ctx, teamName) - if err != nil { - logger.Error("Failed to get balance for auto-recharge", "team", teamName, "error", err) + if config.Amount <= 0 { + logger.Warn("Skipping auto-recharge with non-positive amount", "team", teamName, "amount", config.Amount) continue } - // Update balance - newAmount := balance.Amount + config.Amount - if err := s.updateBalance(ctx, teamName, newAmount); err != nil { + logger.Info("Executing auto-recharge", "team", teamName, "amount", config.Amount) + + // Atomically add the recharge amount to the current balance. 
+ newAmount, err := s.applyBalanceDelta(ctx, teamName, config.Amount) + if err != nil { logger.Error("Failed to update balance for auto-recharge", "team", teamName, "error", err) continue } @@ -372,64 +364,78 @@ func (s *BalanceService) GetTotalBalance(ctx context.Context) (float64, error) { // Helper methods -func (s *BalanceService) updateBalance(ctx context.Context, teamName string, amount float64) error { - balance := &Balance{ - TeamName: teamName, - Amount: amount, - LastUpdated: time.Now(), - } - - data, err := json.Marshal(balance) - if err != nil { - return fmt.Errorf("failed to marshal balance: %w", err) - } - - cm, err := s.getOrCreateConfigMap(ctx, BalancesConfigMap) - if err != nil { - return err - } +// mutateConfigMap performs an optimistic-concurrency read-modify-write on a Bison +// ConfigMap. It re-reads the ConfigMap and re-applies mutate on every resourceVersion +// conflict, so concurrent writers (recharge, billing deduction, auto-recharge, +// overdue marking) cannot silently lose each other's updates. +func (s *BalanceService) mutateConfigMap(ctx context.Context, name string, mutate func(cm *corev1.ConfigMap) error) error { + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + cm, err := s.getOrCreateConfigMap(ctx, name) + if err != nil { + return err + } + if cm.Data == nil { + cm.Data = make(map[string]string) + } + if err := mutate(cm); err != nil { + return err + } + // Use the raw client so a conflict error is returned unwrapped for RetryOnConflict. + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) + }) +} - if cm.Data == nil { - cm.Data = make(map[string]string) - } - cm.Data[teamName] = string(data) +// applyBalanceDelta atomically adds delta (negative to deduct) to a team's balance, +// preserving other persisted fields such as OverdueAt, and returns the new amount. 
+func (s *BalanceService) applyBalanceDelta(ctx context.Context, teamName string, delta float64) (float64, error) { + var newAmount float64 + err := s.mutateConfigMap(ctx, BalancesConfigMap, func(cm *corev1.ConfigMap) error { + balance := Balance{TeamName: teamName} + if existing, ok := cm.Data[teamName]; ok { + if err := json.Unmarshal([]byte(existing), &balance); err != nil { + return fmt.Errorf("failed to parse balance: %w", err) + } + } + balance.TeamName = teamName + balance.Amount += delta + balance.LastUpdated = time.Now() - return s.updateConfigMap(ctx, cm) + data, err := json.Marshal(&balance) + if err != nil { + return fmt.Errorf("failed to marshal balance: %w", err) + } + cm.Data[teamName] = string(data) + newAmount = balance.Amount + return nil + }) + return newAmount, err } func (s *BalanceService) addRechargeRecord(ctx context.Context, teamName string, record *RechargeRecord) error { - cm, err := s.getOrCreateConfigMap(ctx, RechargeHistoryConfigMap) - if err != nil { - return err - } - - var records []*RechargeRecord - if data, ok := cm.Data[teamName]; ok { - if err := json.Unmarshal([]byte(data), &records); err != nil { - logger.Warn("Failed to unmarshal existing history, starting fresh", "team", teamName) - records = []*RechargeRecord{} + return s.mutateConfigMap(ctx, RechargeHistoryConfigMap, func(cm *corev1.ConfigMap) error { + var records []*RechargeRecord + if data, ok := cm.Data[teamName]; ok { + if err := json.Unmarshal([]byte(data), &records); err != nil { + logger.Warn("Failed to unmarshal existing history, starting fresh", "team", teamName) + records = []*RechargeRecord{} + } } - } - - // Add new record - records = append(records, record) - - // Keep only last 1000 records - if len(records) > 1000 { - records = records[len(records)-1000:] - } - data, err := json.Marshal(records) - if err != nil { - return fmt.Errorf("failed to marshal history: %w", err) - } + // Add new record + records = append(records, record) - if cm.Data == nil { - 
cm.Data = make(map[string]string) - } - cm.Data[teamName] = string(data) + // Keep only last 1000 records + if len(records) > 1000 { + records = records[len(records)-1000:] + } - return s.updateConfigMap(ctx, cm) + data, err := json.Marshal(records) + if err != nil { + return fmt.Errorf("failed to marshal history: %w", err) + } + cm.Data[teamName] = string(data) + return nil + }) } func (s *BalanceService) getOrCreateConfigMap(ctx context.Context, name string) (*corev1.ConfigMap, error) { @@ -496,68 +502,82 @@ func (s *BalanceService) calculateNextExecution(config *AutoRechargeConfig) time } } -// CalculateDailyConsumption calculates the average daily consumption for a team based on recent history +// CalculateDailyConsumption estimates the average daily spend for a team from its +// recent deduction history. +// +// The denominator is the actual span of deduction activity within the last 7 days +// (capped at 7 days, floored to avoid wildly overestimating from a single very +// recent deduction) — NOT a fixed 7 days, which previously underestimated the burn +// rate whenever the real activity window was shorter. We also fetch enough records +// to cover a full week of hourly billing (168+), since the old 100-record cap +// truncated the 7-day window and undercounted total deductions. 
func (s *BalanceService) CalculateDailyConsumption(ctx context.Context, teamName string) (float64, error) { - records, err := s.GetRechargeHistory(ctx, teamName, 100) // Get last 100 records + const ( + lookbackDays = 7 + maxSpanDays = 7.0 + minSpanDays = 0.5 + fetchRecords = 400 + ) + + records, err := s.GetRechargeHistory(ctx, teamName, fetchRecords) if err != nil { return 0, err } - // Calculate total deductions in last 7 days now := time.Now() - sevenDaysAgo := now.AddDate(0, 0, -7) + windowStart := now.AddDate(0, 0, -lookbackDays) var totalDeductions float64 - var daysWithData float64 = 7 // Default to 7 days + var oldestDeduction time.Time + hasDeduction := false for _, record := range records { - if record.Type == "deduction" && record.Timestamp.After(sevenDaysAgo) { - totalDeductions += -record.Amount // Amount is negative for deductions + if record.Type != "deduction" || !record.Timestamp.After(windowStart) { + continue } - } - - // If we have less than 7 days of data, calculate based on actual time span - if len(records) > 0 { - oldestRecord := records[len(records)-1] - if oldestRecord.Timestamp.After(sevenDaysAgo) { - actualDays := now.Sub(oldestRecord.Timestamp).Hours() / 24 - if actualDays > 0 { - daysWithData = actualDays - } + totalDeductions += -record.Amount // Amount is negative for deductions + if !hasDeduction || record.Timestamp.Before(oldestDeduction) { + oldestDeduction = record.Timestamp + hasDeduction = true } } - if daysWithData == 0 { + if !hasDeduction || totalDeductions <= 0 { return 0, nil } - return totalDeductions / daysWithData, nil -} - -// SetOverdueAt records when a team first went into negative balance -func (s *BalanceService) SetOverdueAt(ctx context.Context, teamName string, overdueAt *time.Time) error { - balance, err := s.GetBalance(ctx, teamName) - if err != nil { - return err + spanDays := now.Sub(oldestDeduction).Hours() / 24 + if spanDays > maxSpanDays { + spanDays = maxSpanDays } - - balance.OverdueAt = overdueAt - 
data, err := json.Marshal(balance) - if err != nil { - return fmt.Errorf("failed to marshal balance: %w", err) + if spanDays < minSpanDays { + spanDays = minSpanDays } - cm, err := s.getOrCreateConfigMap(ctx, BalancesConfigMap) - if err != nil { - return err - } + return totalDeductions / spanDays, nil +} - if cm.Data == nil { - cm.Data = make(map[string]string) - } - cm.Data[teamName] = string(data) +// SetOverdueAt records (or clears) when a team first went into negative balance. +// The update is conflict-retried and preserves the current amount, so it cannot +// clobber a concurrent deduction/recharge. +func (s *BalanceService) SetOverdueAt(ctx context.Context, teamName string, overdueAt *time.Time) error { + return s.mutateConfigMap(ctx, BalancesConfigMap, func(cm *corev1.ConfigMap) error { + balance := Balance{TeamName: teamName} + if existing, ok := cm.Data[teamName]; ok { + if err := json.Unmarshal([]byte(existing), &balance); err != nil { + return fmt.Errorf("failed to parse balance: %w", err) + } + } + balance.TeamName = teamName + balance.OverdueAt = overdueAt - return s.updateConfigMap(ctx, cm) + data, err := json.Marshal(&balance) + if err != nil { + return fmt.Errorf("failed to marshal balance: %w", err) + } + cm.Data[teamName] = string(data) + return nil + }) } // GetBalanceWithEstimate returns the balance with consumption and estimated overdue time calculated diff --git a/api-server/internal/service/balance_service_test.go b/api-server/internal/service/balance_service_test.go new file mode 100644 index 0000000..5ded60b --- /dev/null +++ b/api-server/internal/service/balance_service_test.go @@ -0,0 +1,265 @@ +package service + +import ( + "context" + "fmt" + "strconv" + "sync" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + k8sfake "k8s.io/client-go/kubernetes/fake" + 
k8stesting "k8s.io/client-go/testing" + + "github.com/bison/api-server/internal/k8s" +) + +func newTestBalanceService() *BalanceService { + cs := k8sfake.NewSimpleClientset() + client := k8s.NewClientWithInterfaces(cs, nil) + return NewBalanceService(client) +} + +// installOptimisticConcurrency makes the fake clientset enforce resourceVersion on +// ConfigMap update so it behaves like etcd. Without this the default fake silently +// accepts stale writes and the RetryOnConflict path is never exercised. The tracker +// is the single source of truth for the current resourceVersion. +func installOptimisticConcurrency(cs *k8sfake.Clientset) { + var mu sync.Mutex + + cs.PrependReactor("update", "configmaps", func(action k8stesting.Action) (bool, runtime.Object, error) { + cm := action.(k8stesting.UpdateAction).GetObject().(*corev1.ConfigMap).DeepCopy() + mu.Lock() + defer mu.Unlock() + + existing, err := cs.Tracker().Get(action.GetResource(), action.GetNamespace(), cm.Name) + if err != nil { + return true, nil, err + } + existingCM := existing.(*corev1.ConfigMap) + if cm.ResourceVersion != existingCM.ResourceVersion { + return true, nil, apierrors.NewConflict( + schema.GroupResource{Resource: "configmaps"}, cm.Name, + fmt.Errorf("resourceVersion conflict")) + } + + rv, _ := strconv.Atoi(existingCM.ResourceVersion) + cm.ResourceVersion = strconv.Itoa(rv + 1) + if err := cs.Tracker().Update(action.GetResource(), cm, action.GetNamespace()); err != nil { + return true, nil, err + } + return true, cm, nil + }) +} + +func seedConfigMap(name string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: BisonNamespace, ResourceVersion: "1"}, + Data: map[string]string{}, + } +} + +func TestRechargeThenDeduct(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + + if err := svc.Recharge(ctx, "team-a", 100, "admin", "init"); err != nil { + t.Fatalf("Recharge: %v", err) + } + + bal, err := 
svc.GetBalance(ctx, "team-a") + if err != nil { + t.Fatalf("GetBalance: %v", err) + } + if bal.Amount != 100 { + t.Fatalf("after recharge: got %v, want 100", bal.Amount) + } + + newBal, err := svc.Deduct(ctx, "team-a", 30, "usage") + if err != nil { + t.Fatalf("Deduct: %v", err) + } + if newBal != 70 { + t.Fatalf("Deduct returned %v, want 70", newBal) + } + + bal, _ = svc.GetBalance(ctx, "team-a") + if bal.Amount != 70 { + t.Fatalf("stored balance %v, want 70", bal.Amount) + } + + recs, err := svc.GetRechargeHistory(ctx, "team-a", 0) + if err != nil { + t.Fatalf("GetRechargeHistory: %v", err) + } + if len(recs) != 2 { + t.Fatalf("history records = %d, want 2", len(recs)) + } +} + +func TestDeductAllowsNegativeBalance(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + + if err := svc.Recharge(ctx, "team-a", 10, "admin", ""); err != nil { + t.Fatal(err) + } + newBal, err := svc.Deduct(ctx, "team-a", 25, "usage") + if err != nil { + t.Fatalf("Deduct: %v", err) + } + if newBal != -15 { + t.Fatalf("Deduct returned %v, want -15", newBal) + } +} + +func TestRechargeRejectsNonPositive(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + + if err := svc.Recharge(ctx, "team-a", 0, "admin", ""); err == nil { + t.Fatal("Recharge(0) should error") + } + if err := svc.Recharge(ctx, "team-a", -5, "admin", ""); err == nil { + t.Fatal("Recharge(-5) should error") + } + if _, err := svc.Deduct(ctx, "team-a", 0, "usage"); err == nil { + t.Fatal("Deduct(0) should error") + } +} + +// TestDeductPreservesOverdueAt guards the regression where a deduction overwrote +// the whole Balance object and silently wiped OverdueAt, which would reset the +// grace-period clock on every billing cycle and prevent suspension. 
+func TestDeductPreservesOverdueAt(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + + if err := svc.Recharge(ctx, "team-a", 10, "admin", ""); err != nil { + t.Fatal(err) + } + overdue := time.Now().Add(-2 * time.Hour).Truncate(time.Second) + if err := svc.SetOverdueAt(ctx, "team-a", &overdue); err != nil { + t.Fatal(err) + } + + if _, err := svc.Deduct(ctx, "team-a", 50, "usage"); err != nil { + t.Fatal(err) + } + + bal, _ := svc.GetBalance(ctx, "team-a") + if bal.OverdueAt == nil { + t.Fatal("OverdueAt was wiped by Deduct") + } + if !bal.OverdueAt.Equal(overdue) { + t.Fatalf("OverdueAt = %v, want %v", bal.OverdueAt, overdue) + } + if bal.Amount != -40 { + t.Fatalf("amount = %v, want -40", bal.Amount) + } +} + +func addDeduction(t *testing.T, svc *BalanceService, team string, amount float64, ts time.Time) { + t.Helper() + if err := svc.addRechargeRecord(context.Background(), team, &RechargeRecord{ + ID: ts.Format(time.RFC3339Nano), + Timestamp: ts, + Type: "deduction", + Amount: -amount, + Operator: "system", + }); err != nil { + t.Fatalf("addRechargeRecord: %v", err) + } +} + +func TestCalculateDailyConsumptionUsesDeductionSpan(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + now := time.Now() + + // Two deductions spanning ~4 days, total 200 -> ~50/day. + addDeduction(t, svc, "team-a", 100, now.Add(-4*24*time.Hour)) + addDeduction(t, svc, "team-a", 100, now) + + rate, err := svc.CalculateDailyConsumption(ctx, "team-a") + if err != nil { + t.Fatal(err) + } + if rate < 45 || rate > 55 { + t.Fatalf("expected ~50/day, got %.2f", rate) + } +} + +func TestCalculateDailyConsumptionFloorsSpan(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + + // Single very recent deduction: span floored to 0.5 day -> 10 / 0.5 = 20. 
+ addDeduction(t, svc, "team-a", 10, time.Now()) + rate, err := svc.CalculateDailyConsumption(ctx, "team-a") + if err != nil { + t.Fatal(err) + } + if rate < 18 || rate > 22 { + t.Fatalf("expected ~20/day (floored span), got %.2f", rate) + } +} + +func TestCalculateDailyConsumptionIgnoresOldAndNonDeductions(t *testing.T) { + svc := newTestBalanceService() + ctx := context.Background() + now := time.Now() + + addDeduction(t, svc, "team-a", 1000, now.Add(-30*24*time.Hour)) // outside 7d window + // a recharge inside the window must not count as consumption + if err := svc.addRechargeRecord(ctx, "team-a", &RechargeRecord{ + ID: "r1", Timestamp: now, Type: "recharge", Amount: 500, + }); err != nil { + t.Fatal(err) + } + rate, err := svc.CalculateDailyConsumption(ctx, "team-a") + if err != nil { + t.Fatal(err) + } + if rate != 0 { + t.Fatalf("expected 0 (no in-window deductions), got %.2f", rate) + } +} + +// TestConcurrentRecharge exercises the optimistic-concurrency retry path against a +// fake that enforces resourceVersion. Each Recharge re-reads, recomputes and +// re-writes under retry.RetryOnConflict, so the final balance must equal the sum of +// all operations with no lost updates. Contention is kept within DefaultRetry's +// budget (5 attempts) by using a small number of writers. 
+func TestConcurrentRecharge(t *testing.T) { + cs := k8sfake.NewSimpleClientset(seedConfigMap(BalancesConfigMap), seedConfigMap(RechargeHistoryConfigMap)) + installOptimisticConcurrency(cs) + svc := NewBalanceService(k8s.NewClientWithInterfaces(cs, nil)) + ctx := context.Background() + + const n = 4 + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func() { + defer wg.Done() + if err := svc.Recharge(ctx, "team-a", 5, "admin", "concurrent"); err != nil { + t.Errorf("Recharge: %v", err) + } + }() + } + wg.Wait() + + bal, err := svc.GetBalance(ctx, "team-a") + if err != nil { + t.Fatal(err) + } + if bal.Amount != float64(n*5) { + t.Fatalf("concurrent recharge total = %v, want %v (lost update)", bal.Amount, n*5) + } +} diff --git a/api-server/internal/service/billing_service.go b/api-server/internal/service/billing_service.go index 57924af..d9a480f 100644 --- a/api-server/internal/service/billing_service.go +++ b/api-server/internal/service/billing_service.go @@ -7,9 +7,10 @@ import ( "strconv" "time" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" "github.com/bison/api-server/internal/k8s" "github.com/bison/api-server/internal/opencost" @@ -18,6 +19,10 @@ import ( const ( BillingConfigMap = "bison-billing-config" + // lastBilledKey stores (in the billing ConfigMap) the RFC3339 timestamp of the + // last successful billing run, so billing is not duplicated when the ticker + // fires more often than the configured interval or after a process restart. + lastBilledKey = "lastBilledAt" ) // BillingConfig represents the billing configuration @@ -150,6 +155,31 @@ func (s *BillingService) ProcessBilling(ctx context.Context) error { return nil } + // Enforce the configured billing interval regardless of how often the + // scheduler ticks, and survive restarts, by gating on a persisted timestamp. 
+ interval := config.Interval + if interval <= 0 { + interval = 1 + } + minGap := time.Duration(interval) * time.Hour + now := time.Now() + lastBilled, _ := s.getLastBilled(ctx) + if lastBilled.IsZero() { + // First run on a fresh deployment: establish a baseline instead of billing + // an unknown historical window. + if err := s.setLastBilled(ctx, now); err != nil { + logger.Warn("Failed to initialize billing baseline", "error", err) + } + logger.Info("Billing baseline initialized; skipping first cycle") + return nil + } + // Tolerate scheduler jitter: require ~95% of the interval to have elapsed. + if now.Sub(lastBilled) < time.Duration(float64(minGap)*0.95) { + logger.Debug("Skipping billing: interval not yet elapsed", + "sinceLastBilled", now.Sub(lastBilled).String(), "interval", minGap.String()) + return nil + } + // Get usage from OpenCost if s.opencostClient == nil || !s.opencostClient.IsEnabled() { logger.Warn("OpenCost not available, skipping billing") @@ -180,7 +210,8 @@ func (s *BillingService) ProcessBilling(ctx context.Context) error { } } - // Aggregate costs by team + // Aggregate costs by team. Prices are read once for the whole run. + prices := s.loadPrices(ctx) teamCosts := make(map[string]float64) for _, alloc := range allocations { teamName, ok := nsToTeam[alloc.Name] @@ -188,8 +219,7 @@ func (s *BillingService) ProcessBilling(ctx context.Context) error { continue } - // Calculate cost based on pricing config - cost := s.calculateCost(ctx, config, &alloc) + cost := costFromPrices(config, prices, &alloc) teamCosts[teamName] += cost } @@ -200,46 +230,114 @@ func (s *BillingService) ProcessBilling(ctx context.Context) error { } reason := fmt.Sprintf("Usage billing for %s", window) - if err := s.balanceSvc.Deduct(ctx, teamName, cost, reason); err != nil { + // Deduct returns the authoritative post-write balance, so the suspension + // decision below is no longer based on a racy second read. 
+ newBalance, err := s.balanceSvc.Deduct(ctx, teamName, cost, reason) + if err != nil { logger.Error("Failed to deduct balance", "team", teamName, "cost", cost, "error", err) continue } - // Check if team is now in debt - balance, _ := s.balanceSvc.GetBalance(ctx, teamName) - if balance != nil && balance.Amount < 0 { - logger.Warn("Team is in debt", "team", teamName, "balance", balance.Amount) + if newBalance < 0 { + logger.Warn("Team is in debt", "team", teamName, "balance", newBalance) - // Record when balance first went negative - if balance.OverdueAt == nil { + // Determine the overdue start time, preserving any existing marker so the + // grace period is measured from when the balance first went negative. + cur, err := s.balanceSvc.GetBalance(ctx, teamName) + if err != nil { + logger.Error("Failed to read balance for overdue check", "team", teamName, "error", err) + continue + } + overdueAt := cur.OverdueAt + if overdueAt == nil { now := time.Now() - if err := s.balanceSvc.SetOverdueAt(ctx, teamName, &now); err != nil { + overdueAt = &now + if err := s.balanceSvc.SetOverdueAt(ctx, teamName, overdueAt); err != nil { logger.Error("Failed to set overdue time", "team", teamName, "error", err) } - balance.OverdueAt = &now } // Check if grace period has passed - if s.isGracePeriodExpired(config, balance.OverdueAt) { - logger.Warn("Grace period expired, suspending team", "team", teamName, "overdueAt", balance.OverdueAt) + if s.isGracePeriodExpired(config, overdueAt) { + logger.Warn("Grace period expired, suspending team", "team", teamName, "overdueAt", overdueAt) if err := s.SuspendTeam(ctx, teamName); err != nil { logger.Error("Failed to suspend team", "team", teamName, "error", err) } } else { - remaining := s.balanceSvc.CalculateGraceRemaining(balance.OverdueAt, config.GracePeriodValue, config.GracePeriodUnit) + remaining := s.balanceSvc.CalculateGraceRemaining(overdueAt, config.GracePeriodValue, config.GracePeriodUnit) logger.Info("Team in grace period", "team", 
teamName, "remaining", remaining) } - } else if balance != nil && balance.Amount >= 0 && balance.OverdueAt != nil { - // Balance is positive again, clear overdue time - if err := s.balanceSvc.SetOverdueAt(ctx, teamName, nil); err != nil { - logger.Error("Failed to clear overdue time", "team", teamName, "error", err) + } else { + // Balance is non-negative again, clear any overdue marker. + if cur, err := s.balanceSvc.GetBalance(ctx, teamName); err == nil && cur.OverdueAt != nil { + if err := s.balanceSvc.SetOverdueAt(ctx, teamName, nil); err != nil { + logger.Error("Failed to clear overdue time", "team", teamName, "error", err) + } } } } + // Record successful billing time so the next cycle bills the correct window. + if err := s.setLastBilled(ctx, now); err != nil { + logger.Error("Failed to update last-billed timestamp", "error", err) + } + return nil } +// getLastBilled returns the timestamp of the last successful billing run, or the +// zero time if none has been recorded yet. +func (s *BillingService) getLastBilled(ctx context.Context) (time.Time, error) { + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, BillingConfigMap) + if err != nil { + if errors.IsNotFound(err) { + return time.Time{}, nil + } + return time.Time{}, err + } + v, ok := cm.Data[lastBilledKey] + if !ok || v == "" { + return time.Time{}, nil + } + t, err := time.Parse(time.RFC3339, v) + if err != nil { + logger.Warn("Invalid lastBilledAt timestamp, treating as unset", "value", v) + return time.Time{}, nil + } + return t, nil +} + +// setLastBilled persists the last successful billing time, using optimistic +// concurrency so it cannot clobber a concurrent config update. 
+func (s *BillingService) setLastBilled(ctx context.Context, t time.Time) error { + value := t.UTC().Format(time.RFC3339) + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, BillingConfigMap) + if err != nil { + if errors.IsNotFound(err) { + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: BillingConfigMap, + Namespace: BisonNamespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "bison", + "app.kubernetes.io/component": "billing", + }, + }, + Data: map[string]string{lastBilledKey: value}, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return err + } + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data[lastBilledKey] = value + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) + }) +} + // isGracePeriodExpired checks if the grace period has expired for a team func (s *BillingService) isGracePeriodExpired(config *BillingConfig, overdueAt *time.Time) bool { if overdueAt == nil { @@ -274,6 +372,7 @@ func (s *BillingService) GetTeamBill(ctx context.Context, teamName, window strin resourceCosts := make(map[string]float64) config, _ := s.GetConfig(ctx) + prices := s.loadPrices(ctx) if s.opencostClient != nil && s.opencostClient.IsEnabled() { for _, project := range projects { @@ -289,7 +388,7 @@ func (s *BillingService) GetTeamBill(ctx context.Context, teamName, window strin totalUsage.GPUHours += alloc.GPUHours totalUsage.Minutes += alloc.Minutes - cost := s.calculateCost(ctx, config, &alloc) + cost := costFromPrices(config, prices, &alloc) totalCost += cost resourceCosts["cpu"] += alloc.CPUCost @@ -323,6 +422,7 @@ func (s *BillingService) GetProjectBill(ctx context.Context, projectName, window resourceCosts := make(map[string]float64) config, _ := s.GetConfig(ctx) + prices := s.loadPrices(ctx) if s.opencostClient != nil && s.opencostClient.IsEnabled() { allocations, err := 
s.opencostClient.GetAllocationForNamespace(ctx, window, projectName) @@ -336,7 +436,7 @@ func (s *BillingService) GetProjectBill(ctx context.Context, projectName, window usage.GPUHours += alloc.GPUHours usage.Minutes += alloc.Minutes - cost := s.calculateCost(ctx, config, &alloc) + cost := costFromPrices(config, prices, &alloc) totalCost += cost resourceCosts["cpu"] += alloc.CPUCost @@ -452,62 +552,72 @@ func (s *BillingService) getDefaultConfig() *BillingConfig { } } -func (s *BillingService) calculateCost(ctx context.Context, config *BillingConfig, alloc *opencost.Allocation) float64 { - if config == nil || !config.Enabled { - return alloc.TotalCost - } - - var cost float64 +// resourcePrices holds the per-unit prices used for cost computation, resolved +// once per billing/report operation instead of per allocation row. +type resourcePrices struct { + cpu float64 + memory float64 + accelerator float64 +} - // Get resource configs for pricing +// loadPrices reads the enabled resource configs once and builds a price table. +func (s *BillingService) loadPrices(ctx context.Context) resourcePrices { resourceConfigs, _ := s.resourceConfigSvc.GetEnabledResourceConfigs(ctx) - - // Build price map and find accelerator price - cpuPrice := float64(0) - memoryPrice := float64(0) - acceleratorPrice := float64(0) - + var p resourcePrices for _, rc := range resourceConfigs { if rc.Price <= 0 { continue } switch rc.Name { case "cpu": - cpuPrice = rc.Price + p.cpu = rc.Price case "memory": - memoryPrice = rc.Price + p.memory = rc.Price default: - // For accelerators (any non-cpu/memory resource), use the first one with price - if rc.Category == CategoryAccelerator && acceleratorPrice == 0 { - acceleratorPrice = rc.Price + // For accelerators (any non-cpu/memory resource), use the first priced one. 
+ if rc.Category == CategoryAccelerator && p.accelerator == 0 { + p.accelerator = rc.Price } } } + return p +} - // CPU cost - if cpuPrice > 0 { - cost += alloc.CPUCoreHours * cpuPrice +// costFromPrices computes the cost of a single allocation from a precomputed price table. +func costFromPrices(config *BillingConfig, p resourcePrices, alloc *opencost.Allocation) float64 { + if config == nil || !config.Enabled { + return alloc.TotalCost + } + + var cost float64 + if p.cpu > 0 { + cost += alloc.CPUCoreHours * p.cpu } else { cost += alloc.CPUCost } - - // Memory cost - if memoryPrice > 0 { - cost += alloc.RAMGBHours * memoryPrice + if p.memory > 0 { + cost += alloc.RAMGBHours * p.memory } else { cost += alloc.RAMCost } - - // GPU/Accelerator cost (OpenCost reports all accelerators as GPUHours) - if acceleratorPrice > 0 { - cost += alloc.GPUHours * acceleratorPrice + // OpenCost reports all accelerators as GPUHours. + if p.accelerator > 0 { + cost += alloc.GPUHours * p.accelerator } else { cost += alloc.GPUCost } - return cost } +// calculateCost computes the cost of a single allocation, loading prices each call. +// In loops prefer loadPrices + costFromPrices to avoid repeated ConfigMap reads. 
+func (s *BillingService) calculateCost(ctx context.Context, config *BillingConfig, alloc *opencost.Allocation) float64 { + if config == nil || !config.Enabled { + return alloc.TotalCost + } + return costFromPrices(config, s.loadPrices(ctx), alloc) +} + func (s *BillingService) scaleDownNamespace(ctx context.Context, namespace string) error { // Scale down deployments deployments, err := s.k8sClient.ListDeployments(ctx, namespace) @@ -639,6 +749,3 @@ func (s *BillingService) scaleUpNamespace(ctx context.Context, namespace string) return nil } - -// Unused import fix -var _ = appsv1.Deployment{} diff --git a/api-server/internal/service/billing_service_test.go b/api-server/internal/service/billing_service_test.go new file mode 100644 index 0000000..e6f6f59 --- /dev/null +++ b/api-server/internal/service/billing_service_test.go @@ -0,0 +1,143 @@ +package service + +import ( + "context" + "encoding/json" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sfake "k8s.io/client-go/kubernetes/fake" + + "github.com/bison/api-server/internal/k8s" + "github.com/bison/api-server/internal/opencost" +) + +func newTestBillingService(resourceConfigs []ResourceDefinition) *BillingService { + data, _ := json.Marshal(resourceConfigs) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: ResourceConfigName, + Namespace: ResourceConfigNamespace, + }, + Data: map[string]string{ResourceConfigDataKey: string(data)}, + } + cs := k8sfake.NewSimpleClientset(cm) + client := k8s.NewClientWithInterfaces(cs, nil) + rcSvc := NewResourceConfigService(client) + balSvc := NewBalanceService(client) + return NewBillingService(client, nil, balSvc, nil, nil, rcSvc) +} + +func TestCalculateCostWithConfiguredPrices(t *testing.T) { + svc := newTestBillingService([]ResourceDefinition{ + {Name: "cpu", Enabled: true, Price: 0.1, Category: CategoryCompute}, + {Name: "memory", Enabled: true, Price: 0.05, Category: CategoryMemory}, + {Name: 
"nvidia.com/gpu", Enabled: true, Price: 8, Category: CategoryAccelerator}, + }) + + config := &BillingConfig{Enabled: true} + alloc := &opencost.Allocation{ + CPUCoreHours: 10, + RAMGBHours: 20, + GPUHours: 2, + // Fallback costs that must be ignored when a configured price exists. + CPUCost: 999, + RAMCost: 999, + GPUCost: 999, + } + + got := svc.calculateCost(context.Background(), config, alloc) + want := 10*0.1 + 20*0.05 + 2*8.0 // 1 + 1 + 16 = 18 + if got != want { + t.Fatalf("calculateCost = %v, want %v", got, want) + } +} + +func TestCalculateCostFallsBackToAllocationCost(t *testing.T) { + // No enabled/priced resources configured -> use OpenCost's own cost numbers. + svc := newTestBillingService([]ResourceDefinition{}) + + config := &BillingConfig{Enabled: true} + alloc := &opencost.Allocation{ + CPUCost: 1.5, + RAMCost: 2.5, + GPUCost: 6.0, + } + + got := svc.calculateCost(context.Background(), config, alloc) + want := 1.5 + 2.5 + 6.0 + if got != want { + t.Fatalf("calculateCost fallback = %v, want %v", got, want) + } +} + +func TestCalculateCostDisabledReturnsTotalCost(t *testing.T) { + svc := newTestBillingService([]ResourceDefinition{}) + + alloc := &opencost.Allocation{TotalCost: 42} + if got := svc.calculateCost(context.Background(), nil, alloc); got != 42 { + t.Fatalf("calculateCost(nil config) = %v, want 42", got) + } + if got := svc.calculateCost(context.Background(), &BillingConfig{Enabled: false}, alloc); got != 42 { + t.Fatalf("calculateCost(disabled) = %v, want 42", got) + } +} + +func TestLastBilledRoundTrip(t *testing.T) { + svc := newTestBillingService([]ResourceDefinition{}) + ctx := context.Background() + + // No timestamp yet -> zero time. 
+ if ts, err := svc.getLastBilled(ctx); err != nil || !ts.IsZero() { + t.Fatalf("expected zero time initially, got %v err=%v", ts, err) + } + + now := time.Now().UTC().Truncate(time.Second) + if err := svc.setLastBilled(ctx, now); err != nil { + t.Fatalf("setLastBilled: %v", err) + } + got, err := svc.getLastBilled(ctx) + if err != nil { + t.Fatalf("getLastBilled: %v", err) + } + if !got.Equal(now) { + t.Fatalf("round-trip mismatch: got %v want %v", got, now) + } + + // Overwrite works. + later := now.Add(2 * time.Hour) + if err := svc.setLastBilled(ctx, later); err != nil { + t.Fatalf("setLastBilled (overwrite): %v", err) + } + if got, _ := svc.getLastBilled(ctx); !got.Equal(later) { + t.Fatalf("overwrite mismatch: got %v want %v", got, later) + } +} + +func TestIsGracePeriodExpired(t *testing.T) { + s := &BillingService{} + recent := time.Now().Add(-1 * time.Hour) + old := time.Now().Add(-72 * time.Hour) + + cases := []struct { + name string + cfg *BillingConfig + overdueAt *time.Time + want bool + }{ + {"nil overdue", &BillingConfig{GracePeriodValue: 1, GracePeriodUnit: "days"}, nil, false}, + {"within hours", &BillingConfig{GracePeriodValue: 24, GracePeriodUnit: "hours"}, &recent, false}, + {"expired hours", &BillingConfig{GracePeriodValue: 1, GracePeriodUnit: "hours"}, &recent, true}, + {"within days", &BillingConfig{GracePeriodValue: 7, GracePeriodUnit: "days"}, &old, false}, + {"expired days", &BillingConfig{GracePeriodValue: 1, GracePeriodUnit: "days"}, &old, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := s.isGracePeriodExpired(tc.cfg, tc.overdueAt); got != tc.want { + t.Fatalf("isGracePeriodExpired = %v, want %v", got, tc.want) + } + }) + } +} diff --git a/api-server/internal/service/config_transfer_service.go b/api-server/internal/service/config_transfer_service.go new file mode 100644 index 0000000..5366c49 --- /dev/null +++ b/api-server/internal/service/config_transfer_service.go @@ -0,0 +1,664 @@ +package 
service + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/bison/api-server/pkg/logger" +) + +const ( + ExportVersion = "1.0" + RedactedValue = "***REDACTED***" + SectionBilling = "billing" + SectionAlerts = "alerts" + SectionResources = "resources" + SectionCP = "controlPlane" + SectionScripts = "initScripts" +) + +var AllSections = []string{SectionBilling, SectionAlerts, SectionResources, SectionCP, SectionScripts} + +// ExportConfig represents the full export file structure +type ExportConfig struct { + Version string `json:"version"` + ExportedAt time.Time `json:"exportedAt"` + ExportedBy string `json:"exportedBy"` + Sections map[string]json.RawMessage `json:"sections"` +} + +// SectionPreview holds diff info for one config section +type SectionPreview struct { + Present bool `json:"present"` + Valid bool `json:"valid"` + HasSensitiveData bool `json:"hasSensitiveData"` + Changes map[string]*FieldChange `json:"changes,omitempty"` + Summary *ResourceSummary `json:"summary,omitempty"` + Warnings []string `json:"warnings,omitempty"` + Errors []string `json:"errors,omitempty"` +} + +// FieldChange represents a single field change +type FieldChange struct { + Current interface{} `json:"current"` + Imported interface{} `json:"imported"` +} + +// ResourceSummary for array-based configs +type ResourceSummary struct { + Added []string `json:"added,omitempty"` + Modified []string `json:"modified,omitempty"` + Removed []string `json:"removed,omitempty"` + Unchanged []string `json:"unchanged,omitempty"` +} + +// ImportPreviewResult holds the preview/diff analysis +type ImportPreviewResult struct { + Valid bool `json:"valid"` + Version string `json:"version"` + ExportedAt string `json:"exportedAt,omitempty"` + Sections map[string]*SectionPreview `json:"sections"` + Errors []string `json:"errors"` + Warnings []string `json:"warnings"` +} + +// ImportRequest holds the import apply request +type ImportRequest struct { + Config ExportConfig 
`json:"config"` + Sections []string `json:"sections"` + PreserveSensitive bool `json:"preserveSensitive"` +} + +// ImportResult holds the import apply result +type ImportResult struct { + Message string `json:"message"` + Applied []string `json:"applied"` + Skipped []string `json:"skipped"` + Warnings []string `json:"warnings"` +} + +// ConfigTransferService handles configuration export and import +type ConfigTransferService struct { + billingSvc *BillingService + alertSvc *AlertService + resourceConfigSvc *ResourceConfigService + initScriptSvc *InitScriptService +} + +// NewConfigTransferService creates a new ConfigTransferService +func NewConfigTransferService( + billingSvc *BillingService, + alertSvc *AlertService, + resourceConfigSvc *ResourceConfigService, + initScriptSvc *InitScriptService, +) *ConfigTransferService { + return &ConfigTransferService{ + billingSvc: billingSvc, + alertSvc: alertSvc, + resourceConfigSvc: resourceConfigSvc, + initScriptSvc: initScriptSvc, + } +} + +// Export exports selected configuration sections +func (s *ConfigTransferService) Export(ctx context.Context, sections []string, includeSensitive bool, operator string) (*ExportConfig, error) { + logger.Info("Exporting configuration", "sections", sections, "includeSensitive", includeSensitive, "operator", operator) + + sectionSet := make(map[string]bool) + for _, sec := range sections { + sectionSet[sec] = true + } + + result := &ExportConfig{ + Version: ExportVersion, + ExportedAt: time.Now(), + ExportedBy: operator, + Sections: make(map[string]json.RawMessage), + } + + if sectionSet[SectionBilling] { + config, err := s.billingSvc.GetConfig(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export billing config: %w", err) + } + data, _ := json.Marshal(config) + result.Sections[SectionBilling] = data + } + + if sectionSet[SectionAlerts] { + config, err := s.alertSvc.GetConfig(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export alert config: %w", err) + } + if 
!includeSensitive { + s.redactAlertChannels(config) + } + data, _ := json.Marshal(config) + result.Sections[SectionAlerts] = data + } + + if sectionSet[SectionResources] { + configs, err := s.resourceConfigSvc.GetResourceConfigs(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export resource configs: %w", err) + } + data, _ := json.Marshal(configs) + result.Sections[SectionResources] = data + } + + if sectionSet[SectionCP] { + config, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export control plane config: %w", err) + } + if !includeSensitive { + if config.Password != "" { + config.Password = RedactedValue + } + if config.PrivateKey != "" { + config.PrivateKey = RedactedValue + } + } + data, _ := json.Marshal(config) + result.Sections[SectionCP] = data + } + + if sectionSet[SectionScripts] { + groups, err := s.initScriptSvc.GetAllScriptGroups(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export init scripts: %w", err) + } + data, _ := json.Marshal(groups) + result.Sections[SectionScripts] = data + } + + return result, nil +} + +// redactAlertChannels masks sensitive webhook URLs in alert channels +func (s *ConfigTransferService) redactAlertChannels(config *AlertConfig) { + sensitiveKeys := map[string]bool{ + "url": true, + "webhook": true, + "smtp": true, + } + for i := range config.Channels { + for key := range config.Channels[i].Config { + if sensitiveKeys[key] { + val := config.Channels[i].Config[key] + if len(val) > 20 { + config.Channels[i].Config[key] = val[:10] + "***" + val[len(val)-5:] + } else if val != "" { + config.Channels[i].Config[key] = RedactedValue + } + } + } + } +} + +// Preview validates and previews an import configuration +func (s *ConfigTransferService) Preview(ctx context.Context, config *ExportConfig) (*ImportPreviewResult, error) { + logger.Info("Previewing configuration import") + + result := &ImportPreviewResult{ + Valid: true, + Version: config.Version, 
+ Sections: make(map[string]*SectionPreview), + Errors: []string{}, + Warnings: []string{}, + } + + if config.Version != ExportVersion { + result.Valid = false + result.Errors = append(result.Errors, fmt.Sprintf("不支持的版本: %s (期望 %s)", config.Version, ExportVersion)) + return result, nil + } + + if !config.ExportedAt.IsZero() { + result.ExportedAt = config.ExportedAt.Format(time.RFC3339) + } + + for section, raw := range config.Sections { + switch section { + case SectionBilling: + preview := s.previewBilling(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionAlerts: + preview := s.previewAlerts(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionResources: + preview := s.previewResources(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionCP: + preview := s.previewControlPlane(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionScripts: + preview := s.previewInitScripts(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + default: + result.Warnings = append(result.Warnings, fmt.Sprintf("未知的配置模块: %s (将被忽略)", section)) + } + } + + return result, nil +} + +func (s *ConfigTransferService) previewBilling(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported BillingConfig + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "计费配置格式无效: "+err.Error()) + return preview + } + + if imported.Interval <= 0 || imported.Interval > 24 { + preview.Errors = append(preview.Errors, "计费间隔必须在 1-24 小时之间") + preview.Valid = false + } + if imported.Currency == "" { + preview.Errors = append(preview.Errors, "货币代码不能为空") + preview.Valid = false + } + + current, err := 
s.billingSvc.GetConfig(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前计费配置进行对比") + return preview + } + + preview.Changes = make(map[string]*FieldChange) + if current.Enabled != imported.Enabled { + preview.Changes["enabled"] = &FieldChange{Current: current.Enabled, Imported: imported.Enabled} + } + if current.Interval != imported.Interval { + preview.Changes["interval"] = &FieldChange{Current: current.Interval, Imported: imported.Interval} + } + if current.Currency != imported.Currency { + preview.Changes["currency"] = &FieldChange{Current: current.Currency, Imported: imported.Currency} + } + if current.CurrencySymbol != imported.CurrencySymbol { + preview.Changes["currencySymbol"] = &FieldChange{Current: current.CurrencySymbol, Imported: imported.CurrencySymbol} + } + if current.GracePeriodValue != imported.GracePeriodValue { + preview.Changes["gracePeriodValue"] = &FieldChange{Current: current.GracePeriodValue, Imported: imported.GracePeriodValue} + } + if current.GracePeriodUnit != imported.GracePeriodUnit { + preview.Changes["gracePeriodUnit"] = &FieldChange{Current: current.GracePeriodUnit, Imported: imported.GracePeriodUnit} + } + + return preview +} + +func (s *ConfigTransferService) previewAlerts(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported AlertConfig + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "告警配置格式无效: "+err.Error()) + return preview + } + + if imported.BalanceThreshold < 0 { + preview.Errors = append(preview.Errors, "告警阈值不能为负数") + preview.Valid = false + } + + for _, ch := range imported.Channels { + if ch.ID == "" || ch.Type == "" || ch.Name == "" { + preview.Errors = append(preview.Errors, fmt.Sprintf("告警通道 '%s' 缺少必填字段 (id/type/name)", ch.Name)) + preview.Valid = false + } + for _, val := range ch.Config { + if val == RedactedValue { + 
preview.HasSensitiveData = true + preview.Warnings = append(preview.Warnings, "告警通道包含已脱敏的敏感数据,导入时将保留当前值") + break + } + } + } + + current, err := s.alertSvc.GetConfig(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前告警配置进行对比") + return preview + } + + preview.Changes = make(map[string]*FieldChange) + if current.BalanceThreshold != imported.BalanceThreshold { + preview.Changes["balanceThreshold"] = &FieldChange{Current: current.BalanceThreshold, Imported: imported.BalanceThreshold} + } + if len(current.Channels) != len(imported.Channels) { + preview.Changes["channels"] = &FieldChange{ + Current: fmt.Sprintf("%d 个通道", len(current.Channels)), + Imported: fmt.Sprintf("%d 个通道", len(imported.Channels)), + } + } + + return preview +} + +func (s *ConfigTransferService) previewResources(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported []ResourceDefinition + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "资源配置格式无效: "+err.Error()) + return preview + } + + for _, r := range imported { + if r.Name == "" { + preview.Errors = append(preview.Errors, "资源名称不能为空") + preview.Valid = false + } + if r.Divisor <= 0 { + preview.Errors = append(preview.Errors, fmt.Sprintf("资源 '%s' 的 divisor 必须大于 0", r.Name)) + preview.Valid = false + } + } + + current, err := s.resourceConfigSvc.GetResourceConfigs(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前资源配置进行对比") + return preview + } + + currentMap := make(map[string]ResourceDefinition) + for _, r := range current { + currentMap[r.Name] = r + } + importedMap := make(map[string]ResourceDefinition) + for _, r := range imported { + importedMap[r.Name] = r + } + + summary := &ResourceSummary{} + for _, r := range imported { + if _, exists := currentMap[r.Name]; exists { + curR := currentMap[r.Name] + if curR.DisplayName != r.DisplayName || 
curR.Unit != r.Unit || curR.Divisor != r.Divisor || + curR.Category != r.Category || curR.Enabled != r.Enabled || curR.Price != r.Price || + curR.SortOrder != r.SortOrder || curR.ShowInQuota != r.ShowInQuota { + summary.Modified = append(summary.Modified, r.Name) + } else { + summary.Unchanged = append(summary.Unchanged, r.Name) + } + } else { + summary.Added = append(summary.Added, r.Name) + } + } + for _, r := range current { + if _, exists := importedMap[r.Name]; !exists { + summary.Removed = append(summary.Removed, r.Name) + } + } + + if len(summary.Removed) > 0 { + preview.Warnings = append(preview.Warnings, fmt.Sprintf("以下资源将被移除: %v", summary.Removed)) + } + + preview.Summary = summary + return preview +} + +func (s *ConfigTransferService) previewControlPlane(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported ControlPlaneConfig + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "控制面配置格式无效: "+err.Error()) + return preview + } + + if imported.SSHPort < 1 || imported.SSHPort > 65535 { + preview.Errors = append(preview.Errors, "SSH 端口必须在 1-65535 之间") + preview.Valid = false + } + if imported.AuthMethod != "" && imported.AuthMethod != "password" && imported.AuthMethod != "privateKey" { + preview.Errors = append(preview.Errors, "认证方式必须为 password 或 privateKey") + preview.Valid = false + } + + if imported.Password == RedactedValue || imported.PrivateKey == RedactedValue { + preview.HasSensitiveData = true + preview.Warnings = append(preview.Warnings, "敏感数据 (密码/私钥) 已被排除,导入时将保留当前值") + } + + current, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前控制面配置进行对比") + return preview + } + + preview.Changes = make(map[string]*FieldChange) + if current.Host != imported.Host { + preview.Changes["host"] = &FieldChange{Current: current.Host, Imported: 
imported.Host} + } + if current.SSHPort != imported.SSHPort { + preview.Changes["sshPort"] = &FieldChange{Current: current.SSHPort, Imported: imported.SSHPort} + } + if current.SSHUser != imported.SSHUser { + preview.Changes["sshUser"] = &FieldChange{Current: current.SSHUser, Imported: imported.SSHUser} + } + if current.AuthMethod != imported.AuthMethod { + preview.Changes["authMethod"] = &FieldChange{Current: current.AuthMethod, Imported: imported.AuthMethod} + } + + return preview +} + +func (s *ConfigTransferService) previewInitScripts(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported []ScriptGroup + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "初始化脚本配置格式无效: "+err.Error()) + return preview + } + + for _, g := range imported { + if g.ID == "" || g.Name == "" { + preview.Errors = append(preview.Errors, fmt.Sprintf("脚本组 '%s' 缺少必填字段 (id/name)", g.Name)) + preview.Valid = false + } + if g.Phase != PhasePreJoin && g.Phase != PhasePostJoin { + preview.Errors = append(preview.Errors, fmt.Sprintf("脚本组 '%s' 的 phase 必须为 pre-join 或 post-join", g.Name)) + preview.Valid = false + } + } + + current, err := s.initScriptSvc.GetAllScriptGroups(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前初始化脚本进行对比") + return preview + } + + currentMap := make(map[string]ScriptGroup) + for _, g := range current { + currentMap[g.ID] = g + } + + summary := &ResourceSummary{} + for _, g := range imported { + if _, exists := currentMap[g.ID]; exists { + summary.Modified = append(summary.Modified, g.Name) + } else { + summary.Added = append(summary.Added, g.Name) + } + } + importedMap := make(map[string]bool) + for _, g := range imported { + importedMap[g.ID] = true + } + for _, g := range current { + if !importedMap[g.ID] { + summary.Removed = append(summary.Removed, g.Name) + } + } + + builtinOverwrite := 0 + for 
_, g := range imported { + if cur, exists := currentMap[g.ID]; exists && cur.Builtin { + builtinOverwrite++ + } + } + if builtinOverwrite > 0 { + preview.Warnings = append(preview.Warnings, fmt.Sprintf("将覆盖 %d 个内置脚本组", builtinOverwrite)) + } + + preview.Summary = summary + return preview +} + +// Apply applies the imported configuration +func (s *ConfigTransferService) Apply(ctx context.Context, req *ImportRequest) (*ImportResult, error) { + logger.Info("Applying imported configuration", "sections", req.Sections) + + result := &ImportResult{ + Applied: []string{}, + Skipped: []string{}, + Warnings: []string{}, + } + + sectionSet := make(map[string]bool) + for _, sec := range req.Sections { + sectionSet[sec] = true + } + + for _, section := range AllSections { + raw, exists := req.Config.Sections[section] + if !exists || !sectionSet[section] { + if sectionSet[section] { + result.Skipped = append(result.Skipped, section) + } + continue + } + + var err error + switch section { + case SectionBilling: + err = s.applyBilling(ctx, raw) + case SectionAlerts: + err = s.applyAlerts(ctx, raw, req.PreserveSensitive) + case SectionResources: + err = s.applyResources(ctx, raw) + case SectionCP: + err = s.applyControlPlane(ctx, raw, req.PreserveSensitive) + case SectionScripts: + err = s.applyInitScripts(ctx, raw) + } + + if err != nil { + result.Warnings = append(result.Warnings, fmt.Sprintf("%s 导入失败: %s", section, err.Error())) + result.Skipped = append(result.Skipped, section) + } else { + result.Applied = append(result.Applied, section) + } + } + + if len(result.Applied) > 0 { + result.Message = fmt.Sprintf("成功导入 %d 个配置模块", len(result.Applied)) + } else { + result.Message = "未成功导入任何配置模块" + } + + return result, nil +} + +func (s *ConfigTransferService) applyBilling(ctx context.Context, raw json.RawMessage) error { + var config BillingConfig + if err := json.Unmarshal(raw, &config); err != nil { + return fmt.Errorf("解析计费配置失败: %w", err) + } + return s.billingSvc.SetConfig(ctx, 
&config) +} + +func (s *ConfigTransferService) applyAlerts(ctx context.Context, raw json.RawMessage, preserveSensitive bool) error { + var config AlertConfig + if err := json.Unmarshal(raw, &config); err != nil { + return fmt.Errorf("解析告警配置失败: %w", err) + } + + if preserveSensitive { + current, err := s.alertSvc.GetConfig(ctx) + if err == nil { + currentChannelMap := make(map[string]NotifyChannel) + for _, ch := range current.Channels { + currentChannelMap[ch.ID] = ch + } + for i, ch := range config.Channels { + if curCh, exists := currentChannelMap[ch.ID]; exists { + for key, val := range ch.Config { + if val == RedactedValue || (len(val) > 8 && val[len(val)-3:] == "***") { + if curVal, ok := curCh.Config[key]; ok { + config.Channels[i].Config[key] = curVal + } + } + } + } + } + } + } + + return s.alertSvc.SetConfig(ctx, &config) +} + +func (s *ConfigTransferService) applyResources(ctx context.Context, raw json.RawMessage) error { + var configs []ResourceDefinition + if err := json.Unmarshal(raw, &configs); err != nil { + return fmt.Errorf("解析资源配置失败: %w", err) + } + return s.resourceConfigSvc.SaveResourceConfigs(ctx, configs) +} + +func (s *ConfigTransferService) applyControlPlane(ctx context.Context, raw json.RawMessage, preserveSensitive bool) error { + var config ControlPlaneConfig + if err := json.Unmarshal(raw, &config); err != nil { + return fmt.Errorf("解析控制面配置失败: %w", err) + } + + if preserveSensitive { + current, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err == nil { + if config.Password == RedactedValue { + config.Password = current.Password + } + if config.PrivateKey == RedactedValue { + config.PrivateKey = current.PrivateKey + } + } + } + + return s.initScriptSvc.SaveControlPlaneConfig(ctx, &config) +} + +func (s *ConfigTransferService) applyInitScripts(ctx context.Context, raw json.RawMessage) error { + var groups []ScriptGroup + if err := json.Unmarshal(raw, &groups); err != nil { + return fmt.Errorf("解析初始化脚本配置失败: %w", err) + } + return 
s.initScriptSvc.SaveAllScriptGroups(ctx, groups) +} diff --git a/api-server/internal/service/init_script_service.go b/api-server/internal/service/init_script_service.go new file mode 100644 index 0000000..1444b90 --- /dev/null +++ b/api-server/internal/service/init_script_service.go @@ -0,0 +1,924 @@ +package service + +import ( + "context" + "encoding/json" + "fmt" + "sort" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/bison/api-server/internal/k8s" + "github.com/bison/api-server/pkg/logger" +) + +const ( + InitScriptsConfigMap = "bison-init-scripts" + ControlPlaneConfigConfigMap = "bison-control-plane-config" +) + +// ScriptPhase represents when a script should be executed +type ScriptPhase string + +const ( + PhasePreJoin ScriptPhase = "pre-join" + PhasePostJoin ScriptPhase = "post-join" +) + +// Script represents a platform-specific script implementation +type Script struct { + ID string `json:"id"` + OS string `json:"os"` // "ubuntu", "centos", "debian", "*" (wildcard) + Arch string `json:"arch"` // "amd64", "arm64", "*" (wildcard) + Content string `json:"content"` // Shell script content +} + +// ScriptGroup represents a group of scripts for a specific functionality +type ScriptGroup struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + Phase ScriptPhase `json:"phase"` + Enabled bool `json:"enabled"` + Order int `json:"order"` + Builtin bool `json:"builtin"` + Scripts []Script `json:"scripts"` +} + +// InitScriptsConfig holds all script groups +type InitScriptsConfig struct { + Groups []ScriptGroup `json:"groups"` +} + +// NodePlatform represents the detected platform of a node +type NodePlatform struct { + OS string `json:"os"` + Version string `json:"version"` + Arch string `json:"arch"` +} + +// ControlPlaneConfig holds the control plane SSH configuration +type ControlPlaneConfig struct { + Host 
string `json:"host"` + SSHPort int `json:"sshPort"` + SSHUser string `json:"sshUser"` + AuthMethod string `json:"authMethod"` // "password" or "privateKey" + Password string `json:"password,omitempty"` + PrivateKey string `json:"privateKey,omitempty"` +} + +// InitScriptService handles initialization script operations +type InitScriptService struct { + k8sClient *k8s.Client +} + +// NewInitScriptService creates a new InitScriptService +func NewInitScriptService(k8sClient *k8s.Client) *InitScriptService { + return &InitScriptService{ + k8sClient: k8sClient, + } +} + +// GetAllScriptGroups returns all script groups +func (s *InitScriptService) GetAllScriptGroups(ctx context.Context) ([]ScriptGroup, error) { + logger.Debug("Getting all script groups") + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return nil, err + } + + // Sort by order + sort.Slice(config.Groups, func(i, j int) bool { + return config.Groups[i].Order < config.Groups[j].Order + }) + + return config.Groups, nil +} + +// GetScriptGroup returns a specific script group by ID +func (s *InitScriptService) GetScriptGroup(ctx context.Context, id string) (*ScriptGroup, error) { + logger.Debug("Getting script group", "id", id) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return nil, err + } + + for _, group := range config.Groups { + if group.ID == id { + return &group, nil + } + } + + return nil, fmt.Errorf("script group not found: %s", id) +} + +// CreateScriptGroup creates a new script group +func (s *InitScriptService) CreateScriptGroup(ctx context.Context, group *ScriptGroup) error { + logger.Info("Creating script group", "name", group.Name) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + // Generate ID if not provided + if group.ID == "" { + group.ID = fmt.Sprintf("custom-%d", time.Now().UnixNano()) + } + + // Check for duplicate ID + for _, existing := range config.Groups { + if existing.ID == group.ID { + return 
fmt.Errorf("script group with ID %s already exists", group.ID) + } + } + + // Set order to last + if group.Order == 0 { + maxOrder := 0 + for _, g := range config.Groups { + if g.Order > maxOrder { + maxOrder = g.Order + } + } + group.Order = maxOrder + 1 + } + + // Custom scripts are not builtin + group.Builtin = false + + config.Groups = append(config.Groups, *group) + + return s.saveInitScriptsConfig(ctx, config) +} + +// UpdateScriptGroup updates an existing script group +func (s *InitScriptService) UpdateScriptGroup(ctx context.Context, id string, group *ScriptGroup) error { + logger.Info("Updating script group", "id", id) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + found := false + for i, existing := range config.Groups { + if existing.ID == id { + // Preserve builtin status and ID + group.ID = id + group.Builtin = existing.Builtin + config.Groups[i] = *group + found = true + break + } + } + + if !found { + return fmt.Errorf("script group not found: %s", id) + } + + return s.saveInitScriptsConfig(ctx, config) +} + +// DeleteScriptGroup deletes a script group (only custom scripts can be deleted) +func (s *InitScriptService) DeleteScriptGroup(ctx context.Context, id string) error { + logger.Info("Deleting script group", "id", id) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + newGroups := make([]ScriptGroup, 0, len(config.Groups)) + deleted := false + + for _, group := range config.Groups { + if group.ID == id { + if group.Builtin { + return fmt.Errorf("cannot delete builtin script group: %s", id) + } + deleted = true + continue + } + newGroups = append(newGroups, group) + } + + if !deleted { + return fmt.Errorf("script group not found: %s", id) + } + + config.Groups = newGroups + return s.saveInitScriptsConfig(ctx, config) +} + +// ToggleScriptGroup enables or disables a script group +func (s *InitScriptService) ToggleScriptGroup(ctx context.Context, id string, enabled bool) error 
{ + logger.Info("Toggling script group", "id", id, "enabled", enabled) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + found := false + for i, group := range config.Groups { + if group.ID == id { + config.Groups[i].Enabled = enabled + found = true + break + } + } + + if !found { + return fmt.Errorf("script group not found: %s", id) + } + + return s.saveInitScriptsConfig(ctx, config) +} + +// ReorderScriptGroups updates the order of script groups +func (s *InitScriptService) ReorderScriptGroups(ctx context.Context, ids []string) error { + logger.Info("Reordering script groups", "ids", ids) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + // Create a map of current groups + groupMap := make(map[string]*ScriptGroup) + for i := range config.Groups { + groupMap[config.Groups[i].ID] = &config.Groups[i] + } + + // Update orders based on the provided order + for i, id := range ids { + if group, ok := groupMap[id]; ok { + group.Order = i + 1 + } + } + + return s.saveInitScriptsConfig(ctx, config) +} + +// GetMatchingScript returns the best matching script for a given platform +func (s *InitScriptService) GetMatchingScript(group *ScriptGroup, platform NodePlatform) *Script { + if len(group.Scripts) == 0 { + return nil + } + + // Priority: exact match > OS match with wildcard arch > wildcard OS with arch match > all wildcards + var exactMatch, osMatch, archMatch, wildcardMatch *Script + + for i := range group.Scripts { + script := &group.Scripts[i] + osMatches := script.OS == platform.OS || script.OS == "*" + archMatches := script.Arch == platform.Arch || script.Arch == "*" + + if !osMatches || !archMatches { + continue + } + + if script.OS == platform.OS && script.Arch == platform.Arch { + exactMatch = script + break // Best match found + } else if script.OS == platform.OS && script.Arch == "*" { + osMatch = script + } else if script.OS == "*" && script.Arch == platform.Arch { + archMatch = script + } 
else if script.OS == "*" && script.Arch == "*" { + wildcardMatch = script + } + } + + // Return by priority + if exactMatch != nil { + return exactMatch + } + if osMatch != nil { + return osMatch + } + if archMatch != nil { + return archMatch + } + return wildcardMatch +} + +// GetScriptsForPhase returns all enabled scripts for a specific phase, matched to the platform +func (s *InitScriptService) GetScriptsForPhase(ctx context.Context, phase ScriptPhase, platform NodePlatform) ([]struct { + Group ScriptGroup + Script Script +}, error) { + groups, err := s.GetAllScriptGroups(ctx) + if err != nil { + return nil, err + } + + var result []struct { + Group ScriptGroup + Script Script + } + + for _, group := range groups { + if group.Phase != phase || !group.Enabled { + continue + } + + script := s.GetMatchingScript(&group, platform) + if script != nil { + result = append(result, struct { + Group ScriptGroup + Script Script + }{ + Group: group, + Script: *script, + }) + } + } + + return result, nil +} + +// GetControlPlaneConfig returns the control plane SSH configuration +func (s *InitScriptService) GetControlPlaneConfig(ctx context.Context) (*ControlPlaneConfig, error) { + logger.Debug("Getting control plane config") + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, ControlPlaneConfigConfigMap) + if err != nil { + if errors.IsNotFound(err) { + return &ControlPlaneConfig{ + SSHPort: 22, + SSHUser: "root", + }, nil + } + return nil, fmt.Errorf("failed to get control plane config: %w", err) + } + + data, ok := cm.Data["config"] + if !ok { + return &ControlPlaneConfig{ + SSHPort: 22, + SSHUser: "root", + }, nil + } + + var config ControlPlaneConfig + if err := json.Unmarshal([]byte(data), &config); err != nil { + return nil, fmt.Errorf("failed to parse control plane config: %w", err) + } + + return &config, nil +} + +// SaveControlPlaneConfig saves the control plane SSH configuration +func (s *InitScriptService) SaveControlPlaneConfig(ctx context.Context, 
config *ControlPlaneConfig) error { + logger.Info("Saving control plane config", "host", config.Host) + + // Set defaults + if config.SSHPort == 0 { + config.SSHPort = 22 + } + if config.SSHUser == "" { + config.SSHUser = "root" + } + + data, err := json.Marshal(config) + if err != nil { + return fmt.Errorf("failed to marshal control plane config: %w", err) + } + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, ControlPlaneConfigConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Create new ConfigMap + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: ControlPlaneConfigConfigMap, + Namespace: BisonNamespace, + }, + Data: map[string]string{ + "config": string(data), + }, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return fmt.Errorf("failed to get control plane config: %w", err) + } + + // Update existing ConfigMap + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data["config"] = string(data) + + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) +} + +// SaveAllScriptGroups replaces all script groups at once (used by config import) +func (s *InitScriptService) SaveAllScriptGroups(ctx context.Context, groups []ScriptGroup) error { + logger.Info("Saving all script groups", "count", len(groups)) + config := &InitScriptsConfig{Groups: groups} + return s.saveInitScriptsConfig(ctx, config) +} + +// getInitScriptsConfig returns the init scripts configuration, initializing with defaults if not found +func (s *InitScriptService) getInitScriptsConfig(ctx context.Context) (*InitScriptsConfig, error) { + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, InitScriptsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Initialize with default builtin scripts + config := s.getDefaultInitScriptsConfig() + if err := s.saveInitScriptsConfig(ctx, config); err != nil { + return nil, err + } + return config, nil + } + return nil, fmt.Errorf("failed to get init scripts config: 
%w", err) + } + + data, ok := cm.Data["config"] + if !ok { + config := s.getDefaultInitScriptsConfig() + if err := s.saveInitScriptsConfig(ctx, config); err != nil { + return nil, err + } + return config, nil + } + + var config InitScriptsConfig + if err := json.Unmarshal([]byte(data), &config); err != nil { + return nil, fmt.Errorf("failed to parse init scripts config: %w", err) + } + + return &config, nil +} + +// saveInitScriptsConfig saves the init scripts configuration +func (s *InitScriptService) saveInitScriptsConfig(ctx context.Context, config *InitScriptsConfig) error { + data, err := json.Marshal(config) + if err != nil { + return fmt.Errorf("failed to marshal init scripts config: %w", err) + } + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, InitScriptsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Create new ConfigMap + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: InitScriptsConfigMap, + Namespace: BisonNamespace, + }, + Data: map[string]string{ + "config": string(data), + }, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return fmt.Errorf("failed to get init scripts config: %w", err) + } + + // Update existing ConfigMap + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data["config"] = string(data) + + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) +} + +// getDefaultInitScriptsConfig returns the default builtin script groups +func (s *InitScriptService) getDefaultInitScriptsConfig() *InitScriptsConfig { + return &InitScriptsConfig{ + Groups: []ScriptGroup{ + { + ID: "disable-swap", + Name: "禁用 Swap", + Description: "禁用 Swap 分区(Kubernetes 要求)", + Phase: PhasePreJoin, + Enabled: true, + Order: 1, + Builtin: true, + Scripts: []Script{ + { + ID: "disable-swap-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling swap..." 
+swapoff -a || true +sed -i '/swap/d' /etc/fstab || true +echo "Swap disabled successfully" +`, + }, + }, + }, + { + ID: "configure-kernel", + Name: "配置内核参数", + Description: "配置 Kubernetes 所需的内核参数", + Phase: PhasePreJoin, + Enabled: true, + Order: 2, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-kernel-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring kernel parameters..." + +# Load required modules +modprobe br_netfilter || true +modprobe overlay || true + +# Ensure modules load on boot +cat > /etc/modules-load.d/k8s.conf << EOF +br_netfilter +overlay +EOF + +# Configure sysctl +cat > /etc/sysctl.d/k8s.conf << EOF +net.bridge.bridge-nf-call-iptables = 1 +net.bridge.bridge-nf-call-ip6tables = 1 +net.ipv4.ip_forward = 1 +EOF + +sysctl --system +echo "Kernel parameters configured successfully" +`, + }, + }, + }, + { + ID: "disable-firewall", + Name: "禁用防火墙", + Description: "禁用节点防火墙(firewalld/ufw)", + Phase: PhasePreJoin, + Enabled: false, + Order: 3, + Builtin: true, + Scripts: []Script{ + { + ID: "disable-firewall-debian", + OS: "ubuntu", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if command -v ufw &> /dev/null; then + ufw disable || true +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-debian2", + OS: "debian", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if command -v ufw &> /dev/null; then + ufw disable || true +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-rhel", + OS: "centos", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if systemctl is-active --quiet firewalld 2>/dev/null; then + systemctl stop firewalld + systemctl disable firewalld +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-rhel2", + OS: "rhel", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." 
+if systemctl is-active --quiet firewalld 2>/dev/null; then + systemctl stop firewalld + systemctl disable firewalld +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-openeuler", + OS: "openEuler", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if systemctl is-active --quiet firewalld 2>/dev/null; then + systemctl stop firewalld + systemctl disable firewalld +fi +echo "Firewall disabled successfully" +`, + }, + }, + }, + { + ID: "configure-selinux", + Name: "配置 SELinux", + Description: "设置 SELinux 为 Permissive 模式(仅 RHEL/CentOS/openEuler)", + Phase: PhasePreJoin, + Enabled: false, + Order: 4, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-selinux-centos", + OS: "centos", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring SELinux to permissive mode..." +if command -v setenforce &> /dev/null; then + setenforce 0 || true + if [ -f /etc/selinux/config ]; then + sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config + fi +fi +echo "SELinux configured successfully" +`, + }, + { + ID: "configure-selinux-rhel", + OS: "rhel", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring SELinux to permissive mode..." +if command -v setenforce &> /dev/null; then + setenforce 0 || true + if [ -f /etc/selinux/config ]; then + sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config + fi +fi +echo "SELinux configured successfully" +`, + }, + { + ID: "configure-selinux-openeuler", + OS: "openEuler", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring SELinux to permissive mode..." 
+if command -v setenforce &> /dev/null; then + setenforce 0 || true + if [ -f /etc/selinux/config ]; then + sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config + fi +fi +echo "SELinux configured successfully" +`, + }, + }, + }, + { + ID: "configure-timezone", + Name: "配置时区和 NTP", + Description: "设置系统时区并启用 NTP 时间同步", + Phase: PhasePreJoin, + Enabled: false, + Order: 5, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-timezone-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +TIMEZONE="${TIMEZONE:-Asia/Shanghai}" + +echo "Configuring timezone to $TIMEZONE..." +timedatectl set-timezone $TIMEZONE || true + +echo "Enabling and starting NTP service..." +if systemctl list-unit-files | grep -q chronyd; then + systemctl enable chronyd || true + systemctl start chronyd || true +elif systemctl list-unit-files | grep -q ntpd; then + systemctl enable ntpd || true + systemctl start ntpd || true +elif systemctl list-unit-files | grep -q systemd-timesyncd; then + systemctl enable systemd-timesyncd || true + systemctl start systemd-timesyncd || true +fi + +echo "Timezone and NTP configured successfully" +`, + }, + }, + }, + { + ID: "configure-registry", + Name: "配置私有镜像仓库", + Description: "配置 containerd 使用私有镜像仓库(支持 HTTP)", + Phase: PhasePreJoin, + Enabled: false, + Order: 6, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-registry-ubuntu", + OS: "ubuntu", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured 
successfully" +`, + }, + { + ID: "configure-registry-debian", + OS: "debian", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, + }, + { + ID: "configure-registry-centos", + OS: "centos", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, + }, + { + ID: "configure-registry-rhel", + OS: "rhel", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, + }, + { + ID: 
// ReplaceVariables substitutes every "${KEY}" placeholder in content with the
// matching value from vars and returns the resulting script text.
//
// All placeholders are replaced in a single pass over content, so a value that
// itself contains placeholder syntax (for example "${OTHER}") is inserted
// verbatim and is never expanded a second time. This makes the output
// independent of Go's randomized map iteration order, which the previous
// sequential ReplaceAll loop was not. Placeholders that have no entry in vars
// are left untouched, and a nil or empty vars map returns content unchanged.
func ReplaceVariables(content string, vars map[string]string) string {
	if len(vars) == 0 {
		return content
	}

	// strings.NewReplacer takes alternating old/new pairs.
	pairs := make([]string, 0, 2*len(vars))
	for key, value := range vars {
		pairs = append(pairs, "${"+key+"}", value)
	}

	return strings.NewReplacer(pairs...).Replace(content)
}
+func TestMain(m *testing.M) { + logger.Init(false) + os.Exit(m.Run()) +} diff --git a/api-server/internal/service/onboarding_service.go b/api-server/internal/service/onboarding_service.go new file mode 100644 index 0000000..7efc8a1 --- /dev/null +++ b/api-server/internal/service/onboarding_service.go @@ -0,0 +1,760 @@ +package service + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/bison/api-server/internal/k8s" + "github.com/bison/api-server/internal/ssh" + "github.com/bison/api-server/pkg/logger" +) + +// Ensure metav1 is used +var _ = metav1.Now + +const ( + OnboardingJobsConfigMap = "bison-onboarding-jobs" +) + +// OnboardingJobStatus represents the status of an onboarding job +type OnboardingJobStatus string + +const ( + JobStatusPending OnboardingJobStatus = "pending" + JobStatusRunning OnboardingJobStatus = "running" + JobStatusSuccess OnboardingJobStatus = "success" + JobStatusFailed OnboardingJobStatus = "failed" + JobStatusCancelled OnboardingJobStatus = "cancelled" +) + +// SubStepStatus represents the status of a sub-step +type SubStepStatus string + +const ( + SubStepPending SubStepStatus = "pending" + SubStepRunning SubStepStatus = "running" + SubStepSuccess SubStepStatus = "success" + SubStepFailed SubStepStatus = "failed" + SubStepSkipped SubStepStatus = "skipped" +) + +// SubStep represents a sub-step within a main step +type SubStep struct { + Name string `json:"name"` + Status SubStepStatus `json:"status"` + Error string `json:"error,omitempty"` +} + +// OnboardingJob represents a node onboarding job +type OnboardingJob struct { + ID string `json:"id"` + NodeIP string `json:"nodeIP"` + NodeName string `json:"nodeName,omitempty"` + Platform NodePlatform `json:"platform"` + Status OnboardingJobStatus `json:"status"` + CurrentStep int `json:"currentStep"` + TotalSteps int `json:"totalSteps"` + 
StepMessage string `json:"stepMessage"` + SubSteps []SubStep `json:"subSteps,omitempty"` + ErrorMessage string `json:"errorMessage,omitempty"` + CreatedAt time.Time `json:"createdAt"` + UpdatedAt time.Time `json:"updatedAt"` + CompletedAt *time.Time `json:"completedAt,omitempty"` +} + +// OnboardingRequest represents a request to onboard a new node +type OnboardingRequest struct { + NodeIP string `json:"nodeIP" binding:"required"` + SSHPort int `json:"sshPort"` + SSHUsername string `json:"sshUsername" binding:"required"` + AuthMethod string `json:"authMethod" binding:"required,oneof=password privateKey"` + Password string `json:"password"` + PrivateKey string `json:"privateKey"` +} + +// OnboardingService handles node onboarding operations +type OnboardingService struct { + k8sClient *k8s.Client + nodeSvc *NodeService + initScriptSvc *InitScriptService + runningJobs map[string]context.CancelFunc + runningJobsMu sync.RWMutex +} + +// NewOnboardingService creates a new OnboardingService +func NewOnboardingService(k8sClient *k8s.Client, nodeSvc *NodeService, initScriptSvc *InitScriptService) *OnboardingService { + return &OnboardingService{ + k8sClient: k8sClient, + nodeSvc: nodeSvc, + initScriptSvc: initScriptSvc, + runningJobs: make(map[string]context.CancelFunc), + } +} + +// StartOnboarding starts a new node onboarding job +func (s *OnboardingService) StartOnboarding(ctx context.Context, req *OnboardingRequest) (*OnboardingJob, error) { + logger.Info("Starting node onboarding", "nodeIP", req.NodeIP) + + // Set defaults + if req.SSHPort == 0 { + req.SSHPort = 22 + } + + // Validate authentication + if req.AuthMethod == "password" && req.Password == "" { + return nil, fmt.Errorf("password is required for password authentication") + } + if req.AuthMethod == "privateKey" && req.PrivateKey == "" { + return nil, fmt.Errorf("private key is required for private key authentication") + } + + // Check if there's already a running job for this IP + jobs, err := 
s.ListJobs(ctx) + if err != nil { + return nil, err + } + for _, job := range jobs { + if job.NodeIP == req.NodeIP && (job.Status == JobStatusPending || job.Status == JobStatusRunning) { + return nil, fmt.Errorf("there is already a running onboarding job for this IP: %s", job.ID) + } + } + + // Create job + job := &OnboardingJob{ + ID: fmt.Sprintf("job-%d", time.Now().UnixNano()), + NodeIP: req.NodeIP, + Status: JobStatusPending, + CurrentStep: 0, + TotalSteps: 9, + StepMessage: "Job created, waiting to start", + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } + + // Save job + if err := s.saveJob(ctx, job); err != nil { + return nil, err + } + + // Start async execution + jobCtx, cancel := context.WithCancel(context.Background()) + s.runningJobsMu.Lock() + s.runningJobs[job.ID] = cancel + s.runningJobsMu.Unlock() + + go s.executeOnboarding(jobCtx, job, req) + + return job, nil +} + +// GetJob returns a specific job by ID +func (s *OnboardingService) GetJob(ctx context.Context, jobID string) (*OnboardingJob, error) { + jobs, err := s.getJobsMap(ctx) + if err != nil { + return nil, err + } + + jobData, ok := jobs[jobID] + if !ok { + return nil, fmt.Errorf("job not found: %s", jobID) + } + + var job OnboardingJob + if err := json.Unmarshal([]byte(jobData), &job); err != nil { + return nil, fmt.Errorf("failed to parse job data: %w", err) + } + + return &job, nil +} + +// ListJobs returns all onboarding jobs +func (s *OnboardingService) ListJobs(ctx context.Context) ([]*OnboardingJob, error) { + jobs, err := s.getJobsMap(ctx) + if err != nil { + return nil, err + } + + result := make([]*OnboardingJob, 0, len(jobs)) + for _, jobData := range jobs { + var job OnboardingJob + if err := json.Unmarshal([]byte(jobData), &job); err != nil { + continue + } + result = append(result, &job) + } + + return result, nil +} + +// CancelJob cancels a running job +func (s *OnboardingService) CancelJob(ctx context.Context, jobID string) error { + logger.Info("Cancelling onboarding 
job", "jobID", jobID) + + job, err := s.GetJob(ctx, jobID) + if err != nil { + return err + } + + if job.Status != JobStatusPending && job.Status != JobStatusRunning { + return fmt.Errorf("job is not running: %s", job.Status) + } + + // Cancel the job context + s.runningJobsMu.Lock() + if cancel, ok := s.runningJobs[jobID]; ok { + cancel() + delete(s.runningJobs, jobID) + } + s.runningJobsMu.Unlock() + + // Update job status + job.Status = JobStatusCancelled + job.StepMessage = "Job cancelled by user" + job.UpdatedAt = time.Now() + now := time.Now() + job.CompletedAt = &now + + return s.saveJob(ctx, job) +} + +// executeOnboarding executes the onboarding process +func (s *OnboardingService) executeOnboarding(ctx context.Context, job *OnboardingJob, req *OnboardingRequest) { + defer func() { + s.runningJobsMu.Lock() + delete(s.runningJobs, job.ID) + s.runningJobsMu.Unlock() + }() + + // Update job status to running + job.Status = JobStatusRunning + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Create SSH executor for target node + sshConfig := &ssh.Config{ + Host: req.NodeIP, + Port: req.SSHPort, + Username: req.SSHUsername, + AuthMethod: ssh.AuthMethod(req.AuthMethod), + Password: req.Password, + PrivateKey: req.PrivateKey, + Timeout: 30 * time.Second, + } + executor := ssh.NewExecutor(sshConfig) + defer executor.Close() + + // Step 1: Connection test + if err := s.stepConnectionTest(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 2: Platform detection + if err := s.stepPlatformDetection(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 3: Environment check + if err := s.stepEnvironmentCheck(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 4: Pre-join scripts + if err := s.stepPreJoinScripts(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 5: Get join token + joinCommand, err := s.stepGetJoinToken(ctx, job) + if 
err != nil { + s.failJob(job, err) + return + } + + // Step 6: Execute kubeadm join + if err := s.stepKubeadmJoin(ctx, job, executor, joinCommand); err != nil { + s.failJob(job, err) + return + } + + // Step 7: Post-join scripts + if err := s.stepPostJoinScripts(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 8: Wait for node ready + if err := s.stepWaitForNodeReady(ctx, job); err != nil { + s.failJob(job, err) + return + } + + // Step 9: Enable node + if err := s.stepEnableNode(ctx, job); err != nil { + s.failJob(job, err) + return + } + + // Mark job as successful + job.Status = JobStatusSuccess + job.StepMessage = "Node onboarding completed successfully" + job.UpdatedAt = time.Now() + now := time.Now() + job.CompletedAt = &now + s.saveJob(context.Background(), job) + + logger.Info("Node onboarding completed successfully", "nodeIP", job.NodeIP, "nodeName", job.NodeName) +} + +func (s *OnboardingService) stepConnectionTest(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 1 + job.StepMessage = "Testing SSH connection..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + if err := executor.TestConnection(ctx); err != nil { + return fmt.Errorf("SSH connection test failed: %w", err) + } + + return nil +} + +func (s *OnboardingService) stepPlatformDetection(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 2 + job.StepMessage = "Detecting node platform..." 
+ job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + info, err := executor.GetHostInfo(ctx) + if err != nil { + return fmt.Errorf("failed to detect platform: %w", err) + } + + job.Platform = NodePlatform{ + OS: info["os"], + Version: info["version"], + Arch: info["arch"], + } + + if info["hostname"] != "" { + job.NodeName = info["hostname"] + } + + job.StepMessage = fmt.Sprintf("Detected: %s %s (%s)", job.Platform.OS, job.Platform.Version, job.Platform.Arch) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + return nil +} + +func (s *OnboardingService) stepEnvironmentCheck(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 3 + job.StepMessage = "Checking environment..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Check if kubeadm is installed + if !executor.CheckCommand(ctx, "kubeadm") { + return fmt.Errorf("kubeadm is not installed on the target node") + } + + // Check if kubelet is installed + if !executor.CheckCommand(ctx, "kubelet") { + return fmt.Errorf("kubelet is not installed on the target node") + } + + return nil +} + +func (s *OnboardingService) stepPreJoinScripts(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 4 + job.StepMessage = "Executing pre-join scripts..." 
+ job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Get init scripts for pre-join phase + scripts, err := s.initScriptSvc.GetScriptsForPhase(ctx, PhasePreJoin, job.Platform) + if err != nil { + return fmt.Errorf("failed to get pre-join scripts: %w", err) + } + + if len(scripts) == 0 { + job.StepMessage = "No pre-join scripts to execute" + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + return nil + } + + // Initialize sub-steps + job.SubSteps = make([]SubStep, len(scripts)) + for i, script := range scripts { + job.SubSteps[i] = SubStep{ + Name: script.Group.Name, + Status: SubStepPending, + } + } + s.saveJob(context.Background(), job) + + // Get variables for script replacement + cpConfig, _ := s.initScriptSvc.GetControlPlaneConfig(ctx) + controlPlaneIP := "" + if cpConfig != nil { + controlPlaneIP = cpConfig.Host + } + vars := map[string]string{ + "NODE_IP": job.NodeIP, + "NODE_NAME": job.NodeName, + "CONTROL_PLANE_IP": controlPlaneIP, + } + + // Execute scripts + for stepIdx, script := range scripts { + job.SubSteps[stepIdx].Status = SubStepRunning + job.StepMessage = fmt.Sprintf("Executing: %s", script.Group.Name) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Replace variables in script content + content := ReplaceVariables(script.Script.Content, vars) + + // Execute script + result := executor.ExecuteScript(ctx, content) + if result.Error != nil || result.ExitCode != 0 { + job.SubSteps[stepIdx].Status = SubStepFailed + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + job.SubSteps[stepIdx].Error = errMsg + s.saveJob(context.Background(), job) + return fmt.Errorf("pre-join script '%s' failed: %s", script.Group.Name, errMsg) + } + + job.SubSteps[stepIdx].Status = SubStepSuccess + s.saveJob(context.Background(), job) + } + + job.SubSteps = nil // Clear sub-steps after completion + return nil +} + +func (s *OnboardingService) stepGetJoinToken(ctx 
context.Context, job *OnboardingJob) (string, error) { + job.CurrentStep = 5 + job.StepMessage = "Getting join token from control plane..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Get control plane config + cpConfig, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + return "", fmt.Errorf("failed to get control plane config: %w", err) + } + + if cpConfig.Host == "" { + return "", fmt.Errorf("control plane host is not configured") + } + + // Create SSH executor for control plane + cpSSHConfig := &ssh.Config{ + Host: cpConfig.Host, + Port: cpConfig.SSHPort, + Username: cpConfig.SSHUser, + AuthMethod: ssh.AuthMethod(cpConfig.AuthMethod), + Password: cpConfig.Password, + PrivateKey: cpConfig.PrivateKey, + Timeout: 30 * time.Second, + } + cpExecutor := ssh.NewExecutor(cpSSHConfig) + defer cpExecutor.Close() + + if err := cpExecutor.Connect(ctx); err != nil { + return "", fmt.Errorf("failed to connect to control plane: %w", err) + } + + // Generate join command + result := cpExecutor.Execute(ctx, "kubeadm token create --print-join-command") + if result.Error != nil || result.ExitCode != 0 { + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + return "", fmt.Errorf("failed to generate join command: %s", errMsg) + } + + joinCommand := result.Stdout + if joinCommand == "" { + return "", fmt.Errorf("empty join command returned") + } + + return joinCommand, nil +} + +func (s *OnboardingService) stepKubeadmJoin(ctx context.Context, job *OnboardingJob, executor *ssh.Executor, joinCommand string) error { + job.CurrentStep = 6 + job.StepMessage = "Executing kubeadm join..." 
+ job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Execute kubeadm join with a longer timeout + joinCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + + result := executor.Execute(joinCtx, joinCommand) + if result.Error != nil || result.ExitCode != 0 { + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + return fmt.Errorf("kubeadm join failed: %s", errMsg) + } + + return nil +} + +func (s *OnboardingService) stepPostJoinScripts(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 7 + job.StepMessage = "Executing post-join scripts..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Get init scripts for post-join phase + scripts, err := s.initScriptSvc.GetScriptsForPhase(ctx, PhasePostJoin, job.Platform) + if err != nil { + return fmt.Errorf("failed to get post-join scripts: %w", err) + } + + if len(scripts) == 0 { + job.StepMessage = "No post-join scripts to execute" + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + return nil + } + + // Initialize sub-steps + job.SubSteps = make([]SubStep, len(scripts)) + for i, script := range scripts { + job.SubSteps[i] = SubStep{ + Name: script.Group.Name, + Status: SubStepPending, + } + } + s.saveJob(context.Background(), job) + + // Get variables for script replacement + cpConfig, _ := s.initScriptSvc.GetControlPlaneConfig(ctx) + controlPlaneIP := "" + if cpConfig != nil { + controlPlaneIP = cpConfig.Host + } + vars := map[string]string{ + "NODE_IP": job.NodeIP, + "NODE_NAME": job.NodeName, + "CONTROL_PLANE_IP": controlPlaneIP, + } + + // Execute scripts + for stepIdx, script := range scripts { + job.SubSteps[stepIdx].Status = SubStepRunning + job.StepMessage = fmt.Sprintf("Executing: %s", script.Group.Name) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Replace variables in script content + content := 
ReplaceVariables(script.Script.Content, vars) + + // Execute script + result := executor.ExecuteScript(ctx, content) + if result.Error != nil || result.ExitCode != 0 { + job.SubSteps[stepIdx].Status = SubStepFailed + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + job.SubSteps[stepIdx].Error = errMsg + s.saveJob(context.Background(), job) + return fmt.Errorf("post-join script '%s' failed: %s", script.Group.Name, errMsg) + } + + job.SubSteps[stepIdx].Status = SubStepSuccess + s.saveJob(context.Background(), job) + } + + job.SubSteps = nil // Clear sub-steps after completion + return nil +} + +func (s *OnboardingService) stepWaitForNodeReady(ctx context.Context, job *OnboardingJob) error { + job.CurrentStep = 8 + job.StepMessage = "Waiting for node to be ready..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Wait for node to appear and become ready + timeout := time.After(5 * time.Minute) + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timeout: + return fmt.Errorf("timeout waiting for node to be ready") + case <-ticker.C: + // Try to find the node + nodes, err := s.k8sClient.ListNodes(ctx) + if err != nil { + continue + } + + for _, node := range nodes.Items { + // Match by IP or hostname + nodeIP := "" + for _, addr := range node.Status.Addresses { + if addr.Type == corev1.NodeInternalIP { + nodeIP = addr.Address + break + } + } + + if nodeIP == job.NodeIP || node.Name == job.NodeName { + job.NodeName = node.Name + + // Check if node is ready + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + job.StepMessage = fmt.Sprintf("Node %s is ready", node.Name) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + return nil + } + } + } + } + } + } +} + +func (s *OnboardingService) stepEnableNode(ctx context.Context, job 
*OnboardingJob) error { + job.CurrentStep = 9 + job.StepMessage = "Enabling node in Bison..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + if job.NodeName == "" { + return fmt.Errorf("node name is not set") + } + + // Enable node in Bison (add to shared pool) + if err := s.nodeSvc.EnableNode(ctx, job.NodeName); err != nil { + return fmt.Errorf("failed to enable node: %w", err) + } + + return nil +} + +func (s *OnboardingService) failJob(job *OnboardingJob, err error) { + job.Status = JobStatusFailed + job.ErrorMessage = err.Error() + job.UpdatedAt = time.Now() + now := time.Now() + job.CompletedAt = &now + s.saveJob(context.Background(), job) + + logger.Error("Node onboarding failed", "nodeIP", job.NodeIP, "error", err) +} + +func (s *OnboardingService) saveJob(ctx context.Context, job *OnboardingJob) error { + data, err := json.Marshal(job) + if err != nil { + return fmt.Errorf("failed to marshal job: %w", err) + } + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, OnboardingJobsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Create new ConfigMap + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: OnboardingJobsConfigMap, + Namespace: BisonNamespace, + }, + Data: map[string]string{ + job.ID: string(data), + }, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return fmt.Errorf("failed to get jobs config: %w", err) + } + + // Update existing ConfigMap + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data[job.ID] = string(data) + + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) +} + +func (s *OnboardingService) getJobsMap(ctx context.Context) (map[string]string, error) { + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, OnboardingJobsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + return make(map[string]string), nil + } + return nil, fmt.Errorf("failed to get jobs config: %w", err) + } + + if cm.Data == nil { + return 
make(map[string]string), nil + } + + return cm.Data, nil +} + +// TestControlPlaneConnection tests the SSH connection to the control plane +func (s *OnboardingService) TestControlPlaneConnection(ctx context.Context) error { + cpConfig, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + return fmt.Errorf("failed to get control plane config: %w", err) + } + + if cpConfig.Host == "" { + return fmt.Errorf("control plane host is not configured") + } + + sshConfig := &ssh.Config{ + Host: cpConfig.Host, + Port: cpConfig.SSHPort, + Username: cpConfig.SSHUser, + AuthMethod: ssh.AuthMethod(cpConfig.AuthMethod), + Password: cpConfig.Password, + PrivateKey: cpConfig.PrivateKey, + Timeout: 30 * time.Second, + } + + executor := ssh.NewExecutor(sshConfig) + defer executor.Close() + + if err := executor.TestConnection(ctx); err != nil { + return fmt.Errorf("SSH connection test failed: %w", err) + } + + // Also verify kubeadm is available + if !executor.CheckCommand(ctx, "kubeadm") { + return fmt.Errorf("kubeadm is not available on the control plane") + } + + return nil +} diff --git a/api-server/internal/service/settings_service.go b/api-server/internal/service/settings_service.go index 0a561f4..d9c506d 100644 --- a/api-server/internal/service/settings_service.go +++ b/api-server/internal/service/settings_service.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "net/http" + "net/url" "time" ) @@ -48,28 +49,66 @@ type PrometheusMetric struct { Value float64 `json:"value"` } +// LabeledMetricSeries represents a Prometheus metric series with labels +type LabeledMetricSeries struct { + Labels map[string]string `json:"labels"` + Metrics []PrometheusMetric `json:"metrics"` +} + // NodeMetrics represents metrics for a node type NodeMetrics struct { CPUUsage []PrometheusMetric `json:"cpuUsage"` MemoryUsage []PrometheusMetric `json:"memoryUsage"` + // Network IO + NetworkReceive []PrometheusMetric `json:"networkReceive,omitempty"` + NetworkTransmit []PrometheusMetric 
`json:"networkTransmit,omitempty"` + // RDMA IO + RdmaReceive []PrometheusMetric `json:"rdmaReceive,omitempty"` + RdmaTransmit []PrometheusMetric `json:"rdmaTransmit,omitempty"` + // GPU (NVIDIA DCGM) + GpuUtilization []PrometheusMetric `json:"gpuUtilization,omitempty"` + GpuMemoryUtil []PrometheusMetric `json:"gpuMemoryUtil,omitempty"` + GpuPerDevice []LabeledMetricSeries `json:"gpuPerDevice,omitempty"` + // NPU (Huawei Ascend) + NpuUtilization []PrometheusMetric `json:"npuUtilization,omitempty"` + NpuMemoryUtil []PrometheusMetric `json:"npuMemoryUtil,omitempty"` + NpuTemperature []PrometheusMetric `json:"npuTemperature,omitempty"` } -// QueryPrometheus queries Prometheus API -func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]PrometheusMetric, error) { +// NodeMetricsRequest holds parameters for querying node metrics +type NodeMetricsRequest struct { + NodeName string + Hours int + HasGpu bool + HasNpu bool +} + +// prometheusResponse is the JSON structure returned by Prometheus query_range API +type prometheusResponse struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []struct { + Metric map[string]string `json:"metric"` + Values [][]interface{} `json:"values"` + } `json:"result"` + } `json:"data"` +} + +// queryPrometheusRaw executes a Prometheus range query and returns the raw response +func (s *SettingsService) queryPrometheusRaw(ctx context.Context, query string, start, end time.Time, step time.Duration) (*prometheusResponse, error) { if s.prometheusURL == "" { return nil, fmt.Errorf("prometheus URL not configured") } - // Build query URL - url := fmt.Sprintf("%s/api/v1/query_range?query=%s&start=%d&end=%d&step=%d", - s.prometheusURL, - query, - start.Unix(), - end.Unix(), - int(step.Seconds()), - ) + params := url.Values{} + params.Set("query", query) + params.Set("start", fmt.Sprintf("%d", start.Unix())) + params.Set("end", 
fmt.Sprintf("%d", end.Unix())) + params.Set("step", fmt.Sprintf("%d", int(step.Seconds()))) + fullURL := fmt.Sprintf("%s/api/v1/query_range?%s", s.prometheusURL, params.Encode()) - req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + req, err := http.NewRequestWithContext(ctx, "GET", fullURL, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } @@ -86,17 +125,7 @@ func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, sta return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body)) } - var result struct { - Status string `json:"status"` - Data struct { - ResultType string `json:"resultType"` - Result []struct { - Metric map[string]string `json:"metric"` - Values [][]interface{} `json:"values"` - } `json:"result"` - } `json:"data"` - } - + var result prometheusResponse if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { return nil, fmt.Errorf("failed to decode response: %w", err) } @@ -105,51 +134,127 @@ func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, sta return nil, fmt.Errorf("prometheus query failed") } + return &result, nil +} + +// parseMetricValues extracts PrometheusMetric slice from raw Prometheus values +func parseMetricValues(values [][]interface{}) []PrometheusMetric { var metrics []PrometheusMetric - if len(result.Data.Result) > 0 { - for _, v := range result.Data.Result[0].Values { - if len(v) >= 2 { - ts, _ := v[0].(float64) - val := 0.0 - switch vv := v[1].(type) { - case string: - fmt.Sscanf(vv, "%f", &val) - case float64: - val = vv - } - metrics = append(metrics, PrometheusMetric{ - Timestamp: ts, - Value: val, - }) + for _, v := range values { + if len(v) >= 2 { + ts, _ := v[0].(float64) + val := 0.0 + switch vv := v[1].(type) { + case string: + fmt.Sscanf(vv, "%f", &val) + case float64: + val = vv } + metrics = append(metrics, PrometheusMetric{ + Timestamp: ts, + Value: val, + }) } } + return metrics +} + +// 
QueryPrometheus queries Prometheus API and returns the first result series +func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]PrometheusMetric, error) { + result, err := s.queryPrometheusRaw(ctx, query, start, end, step) + if err != nil { + return nil, err + } - return metrics, nil + if len(result.Data.Result) > 0 { + return parseMetricValues(result.Data.Result[0].Values), nil + } + + return nil, nil +} + +// QueryPrometheusMultiSeries queries Prometheus API and returns all result series with labels +func (s *SettingsService) QueryPrometheusMultiSeries(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]LabeledMetricSeries, error) { + result, err := s.queryPrometheusRaw(ctx, query, start, end, step) + if err != nil { + return nil, err + } + + var series []LabeledMetricSeries + for _, r := range result.Data.Result { + series = append(series, LabeledMetricSeries{ + Labels: r.Metric, + Metrics: parseMetricValues(r.Values), + }) + } + + return series, nil } // GetNodeMetrics returns metrics for a specific node -func (s *SettingsService) GetNodeMetrics(ctx context.Context, nodeName string, hours int) (*NodeMetrics, error) { +func (s *SettingsService) GetNodeMetrics(ctx context.Context, req NodeMetricsRequest) (*NodeMetrics, error) { end := time.Now() - start := end.Add(-time.Duration(hours) * time.Hour) + start := end.Add(-time.Duration(req.Hours) * time.Hour) step := time.Minute * 5 + node := req.NodeName - // Query CPU usage - cpuQuery := fmt.Sprintf(`100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle", instance=~"%s.*"}[5m])) * 100)`, nodeName) - cpuMetrics, err := s.QueryPrometheus(ctx, cpuQuery, start, end, step) - if err != nil { - cpuMetrics = nil // Non-fatal, continue + result := &NodeMetrics{} + + // --- Always query: CPU, Memory, Network, RDMA --- + + // CPU usage (%) + cpuQuery := fmt.Sprintf(`100 - (avg by(instance) 
(rate(node_cpu_seconds_total{mode="idle", instance=~"%s.*"}[5m])) * 100)`, node) + result.CPUUsage, _ = s.QueryPrometheus(ctx, cpuQuery, start, end, step) + + // Memory usage (%) + memQuery := fmt.Sprintf(`(1 - (node_memory_MemAvailable_bytes{instance=~"%s.*"} / node_memory_MemTotal_bytes{instance=~"%s.*"})) * 100`, node, node) + result.MemoryUsage, _ = s.QueryPrometheus(ctx, memQuery, start, end, step) + + // Network receive (bytes/sec, excluding virtual interfaces) + netRecvQuery := fmt.Sprintf(`sum(rate(node_network_receive_bytes_total{instance=~"%s.*",device!~"lo|docker.*|veth.*|br.*|cni.*|flannel.*|cali.*|tunl.*|kube.*|virbr.*"}[5m]))`, node) + result.NetworkReceive, _ = s.QueryPrometheus(ctx, netRecvQuery, start, end, step) + + // Network transmit (bytes/sec) + netTransQuery := fmt.Sprintf(`sum(rate(node_network_transmit_bytes_total{instance=~"%s.*",device!~"lo|docker.*|veth.*|br.*|cni.*|flannel.*|cali.*|tunl.*|kube.*|virbr.*"}[5m]))`, node) + result.NetworkTransmit, _ = s.QueryPrometheus(ctx, netTransQuery, start, end, step) + + // RDMA receive (bytes/sec, InfiniBand via node_exporter) + rdmaRecvQuery := fmt.Sprintf(`sum(rate(node_infiniband_port_data_received_bytes_total{instance=~"%s.*"}[5m]))`, node) + result.RdmaReceive, _ = s.QueryPrometheus(ctx, rdmaRecvQuery, start, end, step) + + // RDMA transmit (bytes/sec) + rdmaTransQuery := fmt.Sprintf(`sum(rate(node_infiniband_port_data_transmitted_bytes_total{instance=~"%s.*"}[5m]))`, node) + result.RdmaTransmit, _ = s.QueryPrometheus(ctx, rdmaTransQuery, start, end, step) + + // --- Conditional: GPU (DCGM) --- + if req.HasGpu { + // Average GPU SM utilization (%) + gpuUtilQuery := fmt.Sprintf(`avg(DCGM_FI_DEV_GPU_UTIL{Hostname="%s"} or DCGM_FI_DEV_GPU_UTIL{instance=~"%s.*"})`, node, node) + result.GpuUtilization, _ = s.QueryPrometheus(ctx, gpuUtilQuery, start, end, step) + + // Average GPU memory utilization (%) + gpuMemQuery := fmt.Sprintf(`avg(DCGM_FI_DEV_MEM_COPY_UTIL{Hostname="%s"} or 
DCGM_FI_DEV_MEM_COPY_UTIL{instance=~"%s.*"})`, node, node) + result.GpuMemoryUtil, _ = s.QueryPrometheus(ctx, gpuMemQuery, start, end, step) + + // Per-GPU SM utilization (multi-series) + gpuPerDeviceQuery := fmt.Sprintf(`DCGM_FI_DEV_GPU_UTIL{Hostname="%s"} or DCGM_FI_DEV_GPU_UTIL{instance=~"%s.*"}`, node, node) + result.GpuPerDevice, _ = s.QueryPrometheusMultiSeries(ctx, gpuPerDeviceQuery, start, end, step) } - // Query memory usage - memQuery := fmt.Sprintf(`(1 - (node_memory_MemAvailable_bytes{instance=~"%s.*"} / node_memory_MemTotal_bytes{instance=~"%s.*"})) * 100`, nodeName, nodeName) - memMetrics, err := s.QueryPrometheus(ctx, memQuery, start, end, step) - if err != nil { - memMetrics = nil // Non-fatal, continue + // --- Conditional: NPU (Huawei Ascend) --- + if req.HasNpu { + // NPU utilization (%) + npuUtilQuery := fmt.Sprintf(`avg(npu_chip_info_utilization{id=~"%s.*"})`, node) + result.NpuUtilization, _ = s.QueryPrometheus(ctx, npuUtilQuery, start, end, step) + + // NPU HBM usage (%) + npuMemQuery := fmt.Sprintf(`avg(npu_chip_info_hbm_usage{id=~"%s.*"})`, node) + result.NpuMemoryUtil, _ = s.QueryPrometheus(ctx, npuMemQuery, start, end, step) + + // NPU temperature (°C) + npuTempQuery := fmt.Sprintf(`avg(npu_chip_info_temperature{id=~"%s.*"})`, node) + result.NpuTemperature, _ = s.QueryPrometheus(ctx, npuTempQuery, start, end, step) } - return &NodeMetrics{ - CPUUsage: cpuMetrics, - MemoryUsage: memMetrics, - }, nil + return result, nil } diff --git a/api-server/internal/ssh/executor.go b/api-server/internal/ssh/executor.go new file mode 100644 index 0000000..e1feb9e --- /dev/null +++ b/api-server/internal/ssh/executor.go @@ -0,0 +1,370 @@ +package ssh + +import ( + "bytes" + "context" + "fmt" + "io" + "net" + "strings" + "sync" + "time" + + "golang.org/x/crypto/ssh" +) + +// AuthMethod represents the SSH authentication method +type AuthMethod string + +const ( + AuthMethodPassword AuthMethod = "password" + AuthMethodPrivateKey AuthMethod = "privateKey" +) 
+ +// Config holds SSH connection configuration +type Config struct { + Host string + Port int + Username string + AuthMethod AuthMethod + Password string + PrivateKey string // PEM encoded private key content + Timeout time.Duration +} + +// CommandResult holds the result of a remote command execution +type CommandResult struct { + Stdout string + Stderr string + ExitCode int + Error error +} + +// Executor handles SSH connections and remote command execution +type Executor struct { + config *Config + client *ssh.Client + mu sync.Mutex +} + +// NewExecutor creates a new SSH executor with the given configuration +func NewExecutor(config *Config) *Executor { + if config.Port == 0 { + config.Port = 22 + } + if config.Timeout == 0 { + config.Timeout = 30 * time.Second + } + return &Executor{ + config: config, + } +} + +// Connect establishes an SSH connection to the remote host +func (e *Executor) Connect(ctx context.Context) error { + e.mu.Lock() + defer e.mu.Unlock() + + if e.client != nil { + return nil // Already connected + } + + var authMethods []ssh.AuthMethod + + switch e.config.AuthMethod { + case AuthMethodPassword: + if e.config.Password == "" { + return fmt.Errorf("password is required for password authentication") + } + authMethods = append(authMethods, ssh.Password(e.config.Password)) + + case AuthMethodPrivateKey: + if e.config.PrivateKey == "" { + return fmt.Errorf("private key is required for private key authentication") + } + signer, err := ssh.ParsePrivateKey([]byte(e.config.PrivateKey)) + if err != nil { + return fmt.Errorf("failed to parse private key: %w", err) + } + authMethods = append(authMethods, ssh.PublicKeys(signer)) + + default: + return fmt.Errorf("unsupported authentication method: %s", e.config.AuthMethod) + } + + sshConfig := &ssh.ClientConfig{ + User: e.config.Username, + Auth: authMethods, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), // TODO: Consider using known_hosts in production + Timeout: e.config.Timeout, + } + + addr := 
fmt.Sprintf("%s:%d", e.config.Host, e.config.Port) + + // Use context for connection timeout + var client *ssh.Client + var err error + + done := make(chan struct{}) + go func() { + client, err = ssh.Dial("tcp", addr, sshConfig) + close(done) + }() + + select { + case <-ctx.Done(): + return ctx.Err() + case <-done: + if err != nil { + return fmt.Errorf("failed to connect to %s: %w", addr, err) + } + } + + e.client = client + return nil +} + +// Execute runs a command on the remote host and returns the result +func (e *Executor) Execute(ctx context.Context, command string) *CommandResult { + e.mu.Lock() + if e.client == nil { + e.mu.Unlock() + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("not connected"), + } + } + client := e.client + e.mu.Unlock() + + session, err := client.NewSession() + if err != nil { + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("failed to create session: %w", err), + } + } + defer session.Close() + + var stdout, stderr bytes.Buffer + session.Stdout = &stdout + session.Stderr = &stderr + + // Run command with context cancellation support + done := make(chan error, 1) + go func() { + done <- session.Run(command) + }() + + select { + case <-ctx.Done(): + // Try to close the session to stop the command + session.Close() + return &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: -1, + Error: ctx.Err(), + } + case err := <-done: + result := &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: 0, + } + + if err != nil { + if exitErr, ok := err.(*ssh.ExitError); ok { + result.ExitCode = exitErr.ExitStatus() + } else { + result.ExitCode = -1 + result.Error = err + } + } + + return result + } +} + +// ExecuteWithTimeout runs a command with a specific timeout +func (e *Executor) ExecuteWithTimeout(command string, timeout time.Duration) *CommandResult { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + return e.Execute(ctx, 
command) +} + +// ExecuteScript executes a shell script on the remote host +// The script content is passed via stdin to avoid escaping issues +func (e *Executor) ExecuteScript(ctx context.Context, script string) *CommandResult { + e.mu.Lock() + if e.client == nil { + e.mu.Unlock() + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("not connected"), + } + } + client := e.client + e.mu.Unlock() + + session, err := client.NewSession() + if err != nil { + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("failed to create session: %w", err), + } + } + defer session.Close() + + var stdout, stderr bytes.Buffer + session.Stdout = &stdout + session.Stderr = &stderr + + // Pass script via stdin + stdin, err := session.StdinPipe() + if err != nil { + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("failed to create stdin pipe: %w", err), + } + } + + done := make(chan error, 1) + go func() { + done <- session.Run("bash -s") + }() + + // Write script to stdin + go func() { + defer stdin.Close() + io.WriteString(stdin, script) + }() + + select { + case <-ctx.Done(): + session.Close() + return &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: -1, + Error: ctx.Err(), + } + case err := <-done: + result := &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: 0, + } + + if err != nil { + if exitErr, ok := err.(*ssh.ExitError); ok { + result.ExitCode = exitErr.ExitStatus() + } else { + result.ExitCode = -1 + result.Error = err + } + } + + return result + } +} + +// TestConnection tests if the SSH connection can be established +func (e *Executor) TestConnection(ctx context.Context) error { + if err := e.Connect(ctx); err != nil { + return err + } + + // Run a simple command to verify the connection works + result := e.Execute(ctx, "echo ok") + if result.Error != nil { + return result.Error + } + if result.ExitCode != 0 { + return fmt.Errorf("connection test failed: %s", result.Stderr) + } + if 
strings.TrimSpace(result.Stdout) != "ok" { + return fmt.Errorf("unexpected response: %s", result.Stdout) + } + + return nil +} + +// Close closes the SSH connection +func (e *Executor) Close() error { + e.mu.Lock() + defer e.mu.Unlock() + + if e.client != nil { + err := e.client.Close() + e.client = nil + return err + } + return nil +} + +// IsConnected returns true if there is an active SSH connection +func (e *Executor) IsConnected() bool { + e.mu.Lock() + defer e.mu.Unlock() + return e.client != nil +} + +// GetHostInfo retrieves basic host information (OS, architecture, etc.) +func (e *Executor) GetHostInfo(ctx context.Context) (map[string]string, error) { + info := make(map[string]string) + + // Get OS information + osResult := e.Execute(ctx, "cat /etc/os-release 2>/dev/null | grep -E '^(ID|VERSION_ID)=' | cut -d'=' -f2 | tr -d '\"'") + if osResult.Error == nil && osResult.ExitCode == 0 { + lines := strings.Split(strings.TrimSpace(osResult.Stdout), "\n") + if len(lines) >= 1 { + info["os"] = strings.TrimSpace(lines[0]) + } + if len(lines) >= 2 { + info["version"] = strings.TrimSpace(lines[1]) + } + } + + // Get architecture + archResult := e.Execute(ctx, "uname -m") + if archResult.Error == nil && archResult.ExitCode == 0 { + arch := strings.TrimSpace(archResult.Stdout) + // Normalize architecture names + switch arch { + case "x86_64": + arch = "amd64" + case "aarch64": + arch = "arm64" + } + info["arch"] = arch + } + + // Get hostname + hostnameResult := e.Execute(ctx, "hostname") + if hostnameResult.Error == nil && hostnameResult.ExitCode == 0 { + info["hostname"] = strings.TrimSpace(hostnameResult.Stdout) + } + + return info, nil +} + +// CheckCommand checks if a command exists on the remote host +func (e *Executor) CheckCommand(ctx context.Context, command string) bool { + result := e.Execute(ctx, fmt.Sprintf("command -v %s", command)) + return result.Error == nil && result.ExitCode == 0 +} + +// DialFunc returns a function that can be used as a proxy 
dialer +func (e *Executor) DialFunc() func(network, addr string) (net.Conn, error) { + return func(network, addr string) (net.Conn, error) { + e.mu.Lock() + client := e.client + e.mu.Unlock() + + if client == nil { + return nil, fmt.Errorf("not connected") + } + return client.Dial(network, addr) + } +} diff --git a/deploy/charts/bison/Chart.yaml b/deploy/charts/bison/Chart.yaml index 4a36d93..e8401c2 100644 --- a/deploy/charts/bison/Chart.yaml +++ b/deploy/charts/bison/Chart.yaml @@ -2,8 +2,9 @@ apiVersion: v2 name: bison description: Bison - GPU 资源计费平台,基于 Capsule 多租户 + OpenCost 成本追踪 type: application -version: 0.0.1 -appVersion: "0.0.1" +version: 0.0.27 +appVersion: "0.0.27" +kubeVersion: ">=1.22.0-0" keywords: - gpu - billing @@ -14,6 +15,11 @@ keywords: - cost-management maintainers: - name: Bison Team +annotations: + org.opencontainers.image.source: https://github.com/SuperMarioYL/Bison + org.opencontainers.image.description: "Bison Helm Chart - GPU资源计费与多租户管理平台" + org.opencontainers.image.documentation: https://bison.lei6393.com + org.opencontainers.image.usage: "helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version VERSION" # Dependencies are installed separately: # - Capsule: helm install capsule projectcapsule/capsule -n capsule-system --create-namespace # - OpenCost: helm install opencost opencost/opencost -n opencost --create-namespace diff --git a/deploy/charts/bison/README.md b/deploy/charts/bison/README.md new file mode 100644 index 0000000..49c31f2 --- /dev/null +++ b/deploy/charts/bison/README.md @@ -0,0 +1,99 @@ +# Bison Helm Chart + +Kubernetes-based GPU Resource Billing and Scheduling Platform + +## ⚠️ 重要提示 / Important Notice + +**这是一个 Helm Chart,请使用 `helm` 命令安装,而不是 `docker pull`!** + +**This is a Helm Chart. 
Use the `helm` command to install, NOT `docker pull`!** + +## Installation + +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +### Method 1: From GHCR (Recommended) + +Install directly from GitHub Container Registry using the OCI format: + +```bash +# Install specific version +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version 0.0.9 + +# Or pull first, then install +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.9 +helm install my-bison bison-0.0.9.tgz + +# With custom configuration +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison \ + --version 0.0.9 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost.svc:9003 \ + --set auth.enabled=true +``` + +**Why GHCR OCI Format?** +- ✅ No separate Helm repository needed +- ✅ Unified storage with Docker images in GHCR +- ✅ Faster installation +- ✅ Modern Helm 3.8+ standard + +### Method 2: From GitHub Releases + +Download the chart from [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) and install locally: + +```bash +# Download from release page +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.9/bison-0.0.9.tgz + +# Install +helm install my-bison bison-0.0.9.tgz \ + --namespace bison-system \ + --create-namespace +``` + +## Prerequisites + +Before installing Bison, ensure the following dependencies are installed: + +1. **Capsule** - Multi-tenant management + ```bash + helm install capsule projectcapsule/capsule -n capsule-system --create-namespace + ``` + +2. **OpenCost** - Cost tracking + ```bash + helm install opencost opencost/opencost -n opencost --create-namespace + ``` + +3. **Prometheus** - Metrics collection + ```bash + helm install prometheus prometheus-community/kube-prometheus-stack -n monitoring --create-namespace + ``` + +## Configuration + +See [values.yaml](./values.yaml) for all configuration options.
+ +### Basic Configuration + +```bash +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison \ + --set apiServer.replicaCount=2 \ + --set webUI.replicaCount=2 +``` + +## Uninstall + +```bash +helm uninstall my-bison -n bison-system +``` + +## More Information + +- [Project Homepage](https://bison.lei6393.com) +- [Documentation](https://bison.lei6393.com/docs/) +- [GitHub Repository](https://github.com/SuperMarioYL/Bison) diff --git a/deploy/charts/bison/README_CN.md b/deploy/charts/bison/README_CN.md new file mode 100644 index 0000000..7cc5411 --- /dev/null +++ b/deploy/charts/bison/README_CN.md @@ -0,0 +1,86 @@ +# Bison Helm Chart + +基于 Kubernetes 的 GPU 资源计费与调度平台 + +## 安装 + +### 从 GHCR 安装(推荐) + +直接从 GitHub Container Registry 使用 OCI 格式安装: + +```bash +# 安装指定版本 +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version 0.0.2 + +# 或者先拉取,再安装 +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.2 +helm install my-bison bison-0.0.2.tgz +``` + +**要求:** +- Helm >= 3.8.0(支持 OCI) + +### 从 GitHub Releases 安装 + +从 [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) 下载 chart 并本地安装: + +```bash +# 从 release 页面下载 +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz + +# 安装 +helm install my-bison bison-0.0.2.tgz +``` + +## 前置条件 + +安装 Bison 前,请确保已安装以下依赖: + +1. **Capsule** - 多租户管理 + ```bash + helm install capsule projectcapsule/capsule -n capsule-system --create-namespace + ``` + +2. **OpenCost** - 成本追踪 + ```bash + helm install opencost opencost/opencost -n opencost --create-namespace + ``` + +3.
**Prometheus** - 指标收集 + ```bash + helm install prometheus prometheus-community/kube-prometheus-stack -n monitoring --create-namespace + ``` + +## 配置 + +所有配置选项请查看 [values.yaml](./values.yaml)。 + +### 基础配置 + +```bash +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison \ + --set apiServer.replicaCount=2 \ + --set webUI.replicaCount=2 +``` + +### 常用配置项 + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `apiServer.replicaCount` | API 服务器副本数 | `1` | +| `webUI.replicaCount` | Web UI 副本数 | `1` | +| `auth.enabled` | 启用认证 | `false` | +| `opencost.url` | OpenCost API 地址 | `http://opencost.opencost:9003` | + +## 卸载 + +```bash +helm uninstall my-bison +``` + +## 更多信息 + +- [项目主页](https://supermarioyl.github.io/Bison/) +- [文档](https://supermarioyl.github.io/Bison/docs/) +- [GitHub 仓库](https://github.com/SuperMarioYL/Bison) +- [English README](./README.md) diff --git a/deploy/charts/bison/templates/NOTES.txt b/deploy/charts/bison/templates/NOTES.txt index 7ad6d8b..7806fe9 100644 --- a/deploy/charts/bison/templates/NOTES.txt +++ b/deploy/charts/bison/templates/NOTES.txt @@ -9,27 +9,21 @@ Namespace: {{ .Release.Namespace }} 数据存储在 Kubernetes ConfigMaps 中,无需外部数据库 === 访问方式 === -{{- if .Values.apiServer.enabled }} -{{- if .Values.apiServer.ingress.enabled }} - -API Server: http://{{ .Values.apiServer.ingress.host }}/api/v1 -{{- else }} +{{- if and .Values.webUI.enabled .Values.ingress.enabled }} -API Server: - kubectl port-forward svc/{{ include "bison.apiServer.fullname" . }} 8080:{{ .Values.apiServer.service.port }} -n {{ .Release.Namespace }} - 访问: http://localhost:8080/api/v1 -{{- end }} -{{- end }} -{{- if .Values.webUI.enabled }} -{{- if .Values.webUI.ingress.enabled }} - -Web UI: http://{{ .Values.webUI.ingress.host }} -{{- else }} +Bison: http://{{ .Values.ingress.host }} + Web UI nginx 会自动代理 /api 请求到 API Server +{{- else if .Values.webUI.enabled }} Web UI: kubectl port-forward svc/{{ include "bison.webUI.fullname" .
}} 3000:{{ .Values.webUI.service.port }} -n {{ .Release.Namespace }} 访问: http://localhost:3000 {{- end }} +{{- if .Values.apiServer.enabled }} + +API Server (调试): + kubectl port-forward svc/{{ include "bison.apiServer.fullname" . }} 8080:{{ .Values.apiServer.service.port }} -n {{ .Release.Namespace }} + 访问: http://localhost:8080/api/v1 {{- end }} === 认证 === diff --git a/deploy/charts/bison/templates/_helpers.tpl b/deploy/charts/bison/templates/_helpers.tpl index 1247b30..553da2d 100644 --- a/deploy/charts/bison/templates/_helpers.tpl +++ b/deploy/charts/bison/templates/_helpers.tpl @@ -76,14 +76,12 @@ Web UI full name {{- end }} {{/* -Get image registry +Build full image reference: global.imageRegistry/repository:tag +Usage: include "bison.image" (dict "imageConfig" .Values.apiServer.image "global" .Values.global "appVersion" .Chart.AppVersion) */}} -{{- define "bison.imageRegistry" -}} -{{- if .Values.global.imageRegistry }} -{{- printf "%s/" .Values.global.imageRegistry }} -{{- else }} -{{- "" }} -{{- end }} +{{- define "bison.image" -}} +{{- $tag := .imageConfig.tag | default .appVersion -}} +{{- printf "%s/%s:%s" .global.imageRegistry .imageConfig.repository $tag -}} {{- end }} {{/* diff --git a/deploy/charts/bison/templates/api-server/deployment.yaml b/deploy/charts/bison/templates/api-deployment.yaml similarity index 90% rename from deploy/charts/bison/templates/api-server/deployment.yaml rename to deploy/charts/bison/templates/api-deployment.yaml index b141063..ce17f7f 100644 --- a/deploy/charts/bison/templates/api-server/deployment.yaml +++ b/deploy/charts/bison/templates/api-deployment.yaml @@ -8,7 +8,9 @@ metadata: {{- include "bison.labels" . | nindent 4 }} app.kubernetes.io/component: api-server spec: + {{- if not .Values.apiServer.autoscaling.enabled }} replicas: {{ .Values.apiServer.replicaCount }} + {{- end }} selector: matchLabels: {{- include "bison.selectorLabels" . 
| nindent 6 }} @@ -30,7 +32,7 @@ spec: - name: api securityContext: {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ include "bison.imageRegistry" . }}{{ .Values.apiServer.image.repository }}:{{ .Values.apiServer.image.tag }}" + image: "{{ include "bison.image" (dict "imageConfig" .Values.apiServer.image "global" .Values.global "appVersion" .Chart.AppVersion) }}" imagePullPolicy: {{ .Values.apiServer.image.pullPolicy }} ports: - name: http @@ -56,6 +58,9 @@ spec: name: {{ if .Values.auth.jwt.existingSecret }}{{ .Values.auth.jwt.existingSecret }}{{ else }}{{ include "bison.authSecretName" . }}{{ end }} key: jwt-secret {{- end }} + # Capsule integration + - name: CAPSULE_ENABLED + value: {{ .Values.dependencies.capsule.enabled | quote }} # OpenCost integration {{- if .Values.dependencies.opencost.enabled }} - name: OPENCOST_URL diff --git a/deploy/charts/bison/templates/api-server/auth-secret.yaml b/deploy/charts/bison/templates/api-server/auth-secret.yaml deleted file mode 100644 index 3bf3300..0000000 --- a/deploy/charts/bison/templates/api-server/auth-secret.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{{- if and .Values.apiServer.enabled .Values.auth.enabled }} -{{- if not .Values.auth.admin.existingSecret }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "bison.fullname" . }}-auth - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . 
| nindent 4 }} -type: Opaque -data: - {{- if .Values.auth.admin.password }} - password: {{ .Values.auth.admin.password | b64enc | quote }} - {{- else }} - password: {{ randAlphaNum 16 | b64enc | quote }} - {{- end }} - {{- if .Values.auth.jwt.secret }} - jwt-secret: {{ .Values.auth.jwt.secret | b64enc | quote }} - {{- else }} - jwt-secret: {{ randAlphaNum 32 | b64enc | quote }} - {{- end }} -{{- end }} -{{- end }} - diff --git a/deploy/charts/bison/templates/api-server/ingress.yaml b/deploy/charts/bison/templates/api-server/ingress.yaml deleted file mode 100644 index e979851..0000000 --- a/deploy/charts/bison/templates/api-server/ingress.yaml +++ /dev/null @@ -1,40 +0,0 @@ -{{- if and .Values.apiServer.enabled .Values.apiServer.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "bison.apiServer.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . | nindent 4 }} - app.kubernetes.io/component: api-server - {{- with .Values.apiServer.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if .Values.apiServer.ingress.className }} - ingressClassName: {{ .Values.apiServer.ingress.className }} - {{- end }} - {{- if .Values.apiServer.ingress.tls }} - tls: - {{- range .Values.apiServer.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - - host: {{ .Values.apiServer.ingress.host | quote }} - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: {{ include "bison.apiServer.fullname" . 
}} - port: - number: {{ .Values.apiServer.service.port }} -{{- end }} - diff --git a/deploy/charts/bison/templates/hpa.yaml b/deploy/charts/bison/templates/hpa.yaml new file mode 100644 index 0000000..ed9e011 --- /dev/null +++ b/deploy/charts/bison/templates/hpa.yaml @@ -0,0 +1,49 @@ +{{- if and .Values.apiServer.enabled .Values.apiServer.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "bison.apiServer.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: api-server +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "bison.apiServer.fullname" . }} + minReplicas: {{ .Values.apiServer.autoscaling.minReplicas }} + maxReplicas: {{ .Values.apiServer.autoscaling.maxReplicas }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.apiServer.autoscaling.targetCPUUtilizationPercentage }} +{{- end }} +{{- if and .Values.webUI.enabled .Values.webUI.autoscaling.enabled }} +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "bison.webUI.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: web-ui +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "bison.webUI.fullname" . 
}} + minReplicas: {{ .Values.webUI.autoscaling.minReplicas }} + maxReplicas: {{ .Values.webUI.autoscaling.maxReplicas }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.webUI.autoscaling.targetCPUUtilizationPercentage }} +{{- end }} diff --git a/deploy/charts/bison/templates/ingress.yaml b/deploy/charts/bison/templates/ingress.yaml new file mode 100644 index 0000000..9184908 --- /dev/null +++ b/deploy/charts/bison/templates/ingress.yaml @@ -0,0 +1,34 @@ +{{- if and .Values.webUI.enabled .Values.ingress.enabled }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "bison.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls.enabled }} + tls: + - hosts: + - {{ .Values.ingress.host | quote }} + secretName: {{ .Values.ingress.tls.secretName | default (printf "%s-tls" (include "bison.fullname" .)) }} + {{- end }} + rules: + - host: {{ .Values.ingress.host | quote }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ include "bison.webUI.fullname" . }} + port: + number: {{ .Values.webUI.service.port }} +{{- end }} diff --git a/deploy/charts/bison/templates/networkpolicy.yaml b/deploy/charts/bison/templates/networkpolicy.yaml new file mode 100644 index 0000000..2aa8694 --- /dev/null +++ b/deploy/charts/bison/templates/networkpolicy.yaml @@ -0,0 +1,32 @@ +{{- if .Values.networkPolicy.enabled }} +# Restrict ingress to the api-server: only web-ui pods and pods in the release +# namespace may reach it on the API port. api-server egress (K8s API, OpenCost, +# Prometheus) is unaffected since only Ingress is policed here. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "bison.apiServer.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: api-server +spec: + podSelector: + matchLabels: + {{- include "bison.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: api-server + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + {{- include "bison.selectorLabels" . | nindent 14 }} + app.kubernetes.io/component: web-ui + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: {{ .Release.Namespace }} + ports: + - protocol: TCP + port: {{ .Values.apiServer.service.port }} +{{- end }} diff --git a/deploy/charts/bison/templates/pdb.yaml b/deploy/charts/bison/templates/pdb.yaml new file mode 100644 index 0000000..16f1347 --- /dev/null +++ b/deploy/charts/bison/templates/pdb.yaml @@ -0,0 +1,33 @@ +{{- if and .Values.apiServer.enabled .Values.apiServer.podDisruptionBudget.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "bison.apiServer.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: api-server +spec: + minAvailable: {{ .Values.apiServer.podDisruptionBudget.minAvailable }} + selector: + matchLabels: + {{- include "bison.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: api-server +{{- end }} +{{- if and .Values.webUI.enabled .Values.webUI.podDisruptionBudget.enabled }} +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "bison.webUI.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: web-ui +spec: + minAvailable: {{ .Values.webUI.podDisruptionBudget.minAvailable }} + selector: + matchLabels: + {{- include "bison.selectorLabels" . 
| nindent 6 }} + app.kubernetes.io/component: web-ui +{{- end }} diff --git a/deploy/charts/bison/templates/api-server/rbac.yaml b/deploy/charts/bison/templates/rbac.yaml similarity index 91% rename from deploy/charts/bison/templates/api-server/rbac.yaml rename to deploy/charts/bison/templates/rbac.yaml index 4a70b0e..dfe0984 100644 --- a/deploy/charts/bison/templates/api-server/rbac.yaml +++ b/deploy/charts/bison/templates/rbac.yaml @@ -35,6 +35,10 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["get", "list", "watch", "update", "patch"] + # Leader election for the singleton scheduler (coordination.k8s.io Lease) + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "create", "update"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/deploy/charts/bison/templates/secret.yaml b/deploy/charts/bison/templates/secret.yaml new file mode 100644 index 0000000..8ab02a8 --- /dev/null +++ b/deploy/charts/bison/templates/secret.yaml @@ -0,0 +1,35 @@ +{{- if and .Values.apiServer.enabled .Values.auth.enabled }} +{{- if not .Values.auth.admin.existingSecret }} +{{- $secretName := printf "%s-auth" (include "bison.fullname" .) -}} +{{- /* Reuse the already-generated secret on upgrade so the JWT signing key and + admin password stay stable across `helm upgrade`. Otherwise every upgrade + would re-run randAlphaNum, rotating the JWT key (invalidating all sessions) + and silently changing the admin password. */ -}} +{{- $existing := (lookup "v1" "Secret" .Release.Namespace $secretName) -}} +{{- $existingData := dict -}} +{{- if $existing -}}{{- $existingData = $existing.data -}}{{- end -}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ $secretName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . 
| nindent 4 }} +type: Opaque +data: + {{- if .Values.auth.admin.password }} + password: {{ .Values.auth.admin.password | b64enc | quote }} + {{- else if hasKey $existingData "password" }} + password: {{ get $existingData "password" | quote }} + {{- else }} + password: {{ randAlphaNum 16 | b64enc | quote }} + {{- end }} + {{- if .Values.auth.jwt.secret }} + jwt-secret: {{ .Values.auth.jwt.secret | b64enc | quote }} + {{- else if hasKey $existingData "jwt-secret" }} + jwt-secret: {{ get $existingData "jwt-secret" | quote }} + {{- else }} + jwt-secret: {{ randAlphaNum 32 | b64enc | quote }} + {{- end }} +{{- end }} +{{- end }} diff --git a/deploy/charts/bison/templates/api-server/service.yaml b/deploy/charts/bison/templates/service.yaml similarity index 50% rename from deploy/charts/bison/templates/api-server/service.yaml rename to deploy/charts/bison/templates/service.yaml index d0c9e18..ab2bcbc 100644 --- a/deploy/charts/bison/templates/api-server/service.yaml +++ b/deploy/charts/bison/templates/service.yaml @@ -18,4 +18,24 @@ spec: {{- include "bison.selectorLabels" . | nindent 4 }} app.kubernetes.io/component: api-server {{- end }} - +--- +{{- if .Values.webUI.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "bison.webUI.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: web-ui +spec: + type: {{ .Values.webUI.service.type }} + ports: + - port: {{ .Values.webUI.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "bison.selectorLabels" . 
| nindent 4 }} + app.kubernetes.io/component: web-ui +{{- end }} diff --git a/deploy/charts/bison/templates/web-ui/deployment.yaml b/deploy/charts/bison/templates/web-deployment.yaml similarity index 90% rename from deploy/charts/bison/templates/web-ui/deployment.yaml rename to deploy/charts/bison/templates/web-deployment.yaml index a637f65..55829cb 100644 --- a/deploy/charts/bison/templates/web-ui/deployment.yaml +++ b/deploy/charts/bison/templates/web-deployment.yaml @@ -8,7 +8,9 @@ metadata: {{- include "bison.labels" . | nindent 4 }} app.kubernetes.io/component: web-ui spec: + {{- if not .Values.webUI.autoscaling.enabled }} replicas: {{ .Values.webUI.replicaCount }} + {{- end }} selector: matchLabels: {{- include "bison.selectorLabels" . | nindent 6 }} @@ -29,7 +31,7 @@ spec: - name: web securityContext: {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ include "bison.imageRegistry" . }}{{ .Values.webUI.image.repository }}:{{ .Values.webUI.image.tag }}" + image: "{{ include "bison.image" (dict "imageConfig" .Values.webUI.image "global" .Values.global "appVersion" .Chart.AppVersion) }}" imagePullPolicy: {{ .Values.webUI.image.pullPolicy }} ports: - name: http @@ -65,4 +67,3 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - diff --git a/deploy/charts/bison/templates/web-ui/ingress.yaml b/deploy/charts/bison/templates/web-ui/ingress.yaml deleted file mode 100644 index 00c86e6..0000000 --- a/deploy/charts/bison/templates/web-ui/ingress.yaml +++ /dev/null @@ -1,40 +0,0 @@ -{{- if and .Values.webUI.enabled .Values.webUI.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "bison.webUI.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . | nindent 4 }} - app.kubernetes.io/component: web-ui - {{- with .Values.webUI.ingress.annotations }} - annotations: - {{- toYaml . 
| nindent 4 }} - {{- end }} -spec: - {{- if .Values.webUI.ingress.className }} - ingressClassName: {{ .Values.webUI.ingress.className }} - {{- end }} - {{- if .Values.webUI.ingress.tls }} - tls: - {{- range .Values.webUI.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - - host: {{ .Values.webUI.ingress.host | quote }} - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: {{ include "bison.webUI.fullname" . }} - port: - number: {{ .Values.webUI.service.port }} -{{- end }} - diff --git a/deploy/charts/bison/templates/web-ui/service.yaml b/deploy/charts/bison/templates/web-ui/service.yaml deleted file mode 100644 index 905e3b6..0000000 --- a/deploy/charts/bison/templates/web-ui/service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -{{- if .Values.webUI.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "bison.webUI.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . | nindent 4 }} - app.kubernetes.io/component: web-ui -spec: - type: {{ .Values.webUI.service.type }} - ports: - - port: {{ .Values.webUI.service.port }} - targetPort: http - protocol: TCP - name: http - selector: - {{- include "bison.selectorLabels" . 
| nindent 4 }} - app.kubernetes.io/component: web-ui -{{- end }} - diff --git a/deploy/charts/bison/values.schema.json b/deploy/charts/bison/values.schema.json new file mode 100644 index 0000000..80420eb --- /dev/null +++ b/deploy/charts/bison/values.schema.json @@ -0,0 +1,133 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema#", + "title": "Bison Helm values", + "type": "object", + "properties": { + "global": { + "type": "object", + "properties": { + "imageRegistry": { "type": "string" }, + "imagePullSecrets": { "type": "array" } + } + }, + "auth": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "admin": { + "type": "object", + "properties": { + "username": { "type": "string" }, + "password": { "type": "string" }, + "existingSecret": { "type": "string" } + } + }, + "jwt": { + "type": "object", + "properties": { + "secret": { "type": "string" }, + "existingSecret": { "type": "string" } + } + } + } + }, + "dependencies": { + "type": "object", + "properties": { + "capsule": { + "type": "object", + "properties": { "enabled": { "type": "boolean" } } + }, + "opencost": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "apiUrl": { "type": "string" } + } + }, + "prometheus": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "url": { "type": "string" } + } + } + } + }, + "apiServer": { "$ref": "#/$defs/component" }, + "webUI": { "$ref": "#/$defs/component" }, + "ingress": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "className": { "type": "string" }, + "annotations": { "type": "object" }, + "host": { "type": "string" }, + "tls": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "secretName": { "type": "string" } + } + } + } + }, + "serviceAccount": { + "type": "object", + "properties": { + "create": { "type": "boolean" }, + "annotations": { "type": "object" }, + "name": { "type": "string" } + } + }, + 
"podSecurityContext": { "type": "object" }, + "securityContext": { "type": "object" }, + "networkPolicy": { + "type": "object", + "properties": { "enabled": { "type": "boolean" } } + } + }, + "$defs": { + "component": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "replicaCount": { "type": "integer", "minimum": 0 }, + "image": { + "type": "object", + "properties": { + "repository": { "type": "string" }, + "tag": { "type": "string" }, + "pullPolicy": { "type": "string" } + } + }, + "service": { + "type": "object", + "properties": { + "type": { "type": "string" }, + "port": { "type": "integer" } + } + }, + "resources": { "type": "object" }, + "nodeSelector": { "type": "object" }, + "tolerations": { "type": "array" }, + "affinity": { "type": "object" }, + "autoscaling": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "minReplicas": { "type": "integer", "minimum": 1 }, + "maxReplicas": { "type": "integer", "minimum": 1 }, + "targetCPUUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 } + } + }, + "podDisruptionBudget": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" } + } + } + } + } + } +} diff --git a/deploy/charts/bison/values.yaml b/deploy/charts/bison/values.yaml index 1546f16..84f81ca 100644 --- a/deploy/charts/bison/values.yaml +++ b/deploy/charts/bison/values.yaml @@ -2,19 +2,19 @@ # Global configuration global: - imageRegistry: "" + imageRegistry: "ghcr.io/supermarioyl" imagePullSecrets: [] # Authentication configuration auth: - enabled: false # Enable login authentication + enabled: false # Enable login authentication admin: - username: admin # Admin username - password: "" # Admin password (not recommended, use existingSecret) - existingSecret: "" # Secret containing 'password' key + username: admin # Admin username + password: "" # Admin password (not recommended, use existingSecret) + existingSecret: "" # Secret containing 'password' key jwt: - 
secret: "" # JWT signing secret (auto-generated if empty) - existingSecret: "" # Secret containing 'jwt-secret' key + secret: "" # JWT signing secret (auto-generated if empty) + existingSecret: "" # Secret containing 'jwt-secret' key # External dependencies # Note: Capsule and OpenCost must be installed separately before deploying Bison @@ -23,16 +23,16 @@ dependencies: capsule: # Capsule must be installed separately # helm install capsule projectcapsule/capsule -n capsule-system --create-namespace - enabled: true - + enabled: false + # OpenCost configuration opencost: # OpenCost must be installed separately # helm install opencost opencost/opencost -n opencost --create-namespace - enabled: true + enabled: false # OpenCost API URL (internal service URL) - use port 9003 for API, NOT 9090 (UI) apiUrl: "http://opencost.opencost.svc.cluster.local:9003" - + # Prometheus configuration (for node metrics, required by OpenCost) prometheus: enabled: true @@ -42,20 +42,19 @@ dependencies: # API Server configuration apiServer: enabled: true + # The api-server runs the billing/auto-recharge/alert scheduler. Multiple + # replicas are safe: the scheduler is guarded by Kubernetes lease-based leader + # election (LEADER_ELECTION_ENABLED, on by default) so it runs on exactly one + # replica at a time. Set LEADER_ELECTION_ENABLED=false only for single-replica + # or local development. replicaCount: 2 image: - repository: ghcr.io/supermarioyl/bison/api-server - tag: 0.0.1 + repository: bison/api-server + tag: "" # Defaults to Chart.AppVersion if empty pullPolicy: IfNotPresent service: type: ClusterIP port: 8080 - ingress: - enabled: true - className: "" - annotations: {} - host: bison-api.example.com - tls: [] resources: limits: cpu: 1000m @@ -66,24 +65,28 @@ apiServer: nodeSelector: {} tolerations: [] affinity: {} + # Optional autoscaling. When enabled, replicaCount is ignored (HPA owns replicas). 
+ autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 5 + targetCPUUtilizationPercentage: 80 + # Optional PodDisruptionBudget to keep at least one replica during voluntary disruptions. + podDisruptionBudget: + enabled: false + minAvailable: 1 # Web UI configuration webUI: - enabled: false + enabled: true replicaCount: 2 image: - repository: ghcr.io/supermarioyl/bison/web-ui - tag: 0.0.1 + repository: bison/web-ui + tag: "" # Defaults to Chart.AppVersion if empty pullPolicy: IfNotPresent service: type: ClusterIP port: 80 - ingress: - enabled: true - className: "" - annotations: {} - host: bison.example.com - tls: [] resources: limits: cpu: 500m @@ -94,6 +97,30 @@ webUI: nodeSelector: {} tolerations: [] affinity: {} + autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 5 + targetCPUUtilizationPercentage: 80 + podDisruptionBudget: + enabled: false + minAvailable: 1 + +# Optional NetworkPolicy restricting ingress to the api-server (off by default). +# When enabled, the api-server only accepts traffic from web-ui pods and the +# release namespace. Review before enabling to ensure it fits your CNI/topology. +networkPolicy: + enabled: false + +# Ingress (unified entry point, web-ui nginx proxies /api to api-server) +ingress: + enabled: true + className: "" + annotations: {} + host: bison.example.com + tls: + enabled: false + secretName: "" # If empty, auto-generated as -tls # Service account serviceAccount: diff --git a/docs/README_CN.md b/docs/README_CN.md index 154539c..72a8013 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -237,10 +237,38 @@ helm install opencost opencost/opencost -n opencost --create-namespace \ ### 2. 
部署 Bison +#### 方式 A: 使用 GHCR (推荐 - OCI 格式) + +```bash +# 直接从 GitHub Container Registry 安装 +helm install bison oci://ghcr.io/supermarioyl/charts/bison \ + --namespace bison-system \ + --create-namespace \ + --set auth.enabled=true \ + --version 0.0.2 +``` + +> **注意:** 需要 Helm >= 3.8.0 以支持 OCI + +#### 方式 B: 从 GitHub Release + +```bash +# 从 GitHub Release 下载 Helm chart +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz + +# 从下载的 chart 安装 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace \ + --set auth.enabled=true +``` + +#### 方式 C: 从源码 + ```bash # 克隆并部署 -git clone https://github.com/your-org/bison.git -cd bison +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison helm install bison ./deploy/charts/bison \ --namespace bison-system \ diff --git a/docs/optimization-roadmap.md b/docs/optimization-roadmap.md new file mode 100644 index 0000000..ae8c631 --- /dev/null +++ b/docs/optimization-roadmap.md @@ -0,0 +1,226 @@ +# Bison 持续优化路线图 + +> 基于一次覆盖后端、前端、官网、文档、DevOps、测试的综合优化审计整理(多 agent 并行审计 + 对抗式验证,共 89 条发现)。 +> 生成时间:2026-06-19 · 审计基线版本 **0.0.11**(`deploy/charts/bison/Chart.yaml`、`web-ui/package.json`、`website/versions.json` 一致;唯一例外是 UI 页脚仍硬编码 `v3.0.0`,本次迭代已修复)。 + +## 概览 + +本路线图把 89 条已验证发现归并为 **9 个主题**,按「影响 / 工作量」排序为「快赢 → 战略」两档。整体判断: + +- **最危险的是一簇资金正确性 + 并发缺陷**:余额/计费状态存在共享 ConfigMap,read-modify-write 无 ResourceVersion 冲突重试(`api-server/internal/k8s/client.go` 的 `UpdateConfigMap` 为裸 `Update`,全仓无 `RetryOnConflict`),而调度器在 `replicaCount: 2`(`deploy/charts/bison/values.yaml`)下**无 leader election**,每个副本独立跑小时计费/自动充值/告警 —— 客户被按副本数重复扣费、并发充值丢更新。这两条必须最先处理。 +- **一组纯文档错误以极低成本拦住所有新用户**:OCI chart 路径写成 `oci://ghcr.io/supermarioyl/bison/bison`,而 CI 实际推送到 `oci://ghcr.io/supermarioyl/charts/bison`;健康检查文档指向不存在的 `/api/v1/health`(实际 `/healthz`);版本停留 0.0.2/0.0.1;OpenCost 命名空间在 chart 默认(`opencost`)与官网(`opencost-system`)间不一致。 +- **官网正在迭代**:emoji 图标、缺失能力展示、缺失截图、版本错印、homepage i18n、社交卡尺寸、SEO 收尾,已抽成独立的「本次迭代清单」。 +- **测试是
0**:后端 0 个 `*_test.go`,前端仅 `1+1==2`,CI 测试任务空跑报绿,所有金额逻辑无回归安全网。 + +--- + +## 已落地版本(v0.0.12 → v0.0.26,按「每个功能一个小版本」迭代) + +| 版本 | 主题 | 内容 | +|---|---|---| +| 0.0.12 | 资金/前端/官网/文档 | ConfigMap 余额 RMW 加 `RetryOnConflict`;`Deduct` 返回写后余额;前端路由 lazy + echarts 分包 + ErrorBoundary;官网去 emoji 换 SVG + ProductShowcase;安装文档纠错;本路线图 | +| 0.0.13 | 资金/并发 | 调度器 **leader election**(Lease),恢复 `replicaCount: 2`,scheduler 可重启 + 测试 | +| 0.0.14 | 安全 | Helm Secret `lookup` 持久化,升级不再轮换 JWT/密码 | +| 0.0.15 | 安全 | 登录 per-IP 限流 + `crypto/subtle` 常量时间比较 + 测试 | +| 0.0.16 | 安全 | `CORS_ALLOWED_ORIGINS` 可配置 allowlist | +| 0.0.17 | 资金 | `lastBilledAt` 时间戳门控,修计费窗口/Interval 不一致 + 防重启重扣 + 测试 | +| 0.0.18 | 资金 | `CalculateDailyConsumption` 分母改实际跨度,修 ~6x 烧钱速率低估 + 测试 | +| 0.0.19 | 性能 | 删 `ListTeams` 丢弃用量循环;`loadPrices` 每次计费读一次配置 | +| 0.0.20 | 性能 | OpenCost 查询 30s TTL + 并发合并缓存 + race 测试 | +| 0.0.21 | DevOps | release 加 **Test & Lint Gate**(vet/fmt/build/test -race + web);修 `tslib` 幽灵依赖 | +| 0.0.22 | 安全 | auth 开启时启动拒绝默认 JWT/密码 + 测试 | +| 0.0.23 | 前端质量 | 13 处错误提取统一为 `getApiErrorMessage` | +| 0.0.24 | 前端性能 | Auth/Theme Context value `useMemo` | +| 0.0.25 | DevOps | `values.schema.json` 类型校验 + `kubeVersion >=1.22` | +| 0.0.26 | DevOps | 可选 PDB / HPA / NetworkPolicy 模板 | + +> 已覆盖 P0 资金正确性与并发、安全基线、开箱即用/CI 门禁、热点性能、chart 健壮性与可用性等全部近期项与多数中期项。 + +### 仍待办(多需活集群验证或产品决策,建议后续单独排期) + +- **后端规模化**:SharedInformer/lister 缓存热点 List、列表分页、per-request 与调度器超时、报表单次聚合查询、`GetCostTrend` 按桶日期映射、suspend/resume 二次缩容修复。 +- **前端**:NodeDetail echarts option `useMemo`、逐行 N+1 query 分页门、硬编码色→主题 token、`formatCurrency`/`currencySymbol`、dayjs 统一 bootstrap、i18n 层。 +- **安全/供应链**:RBAC 去 `clusterrolebindings` 写、onboarding SSH 入参校验、镜像 Trivy 扫描/SBOM/cosign 签名、基础镜像 digest 固定、Dependabot。 +- **架构/平台化(远期)**:余额持久化模型升级(ConfigMap → per-key patch / CRD)、money 整数最小单位、能力补全或下线(OIDC/Email/Excel-PDF)。 + +--- + +## 优化主题 + +### 主题 1 · 资金正确性与并发安全 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| 调度器在每个副本无 leader election(`scheduler.go` 裸 ticker,`replicaCount:2`) | 
按副本数重复扣费/重复自动充值/重复告警 —— 资金错误 | 立即将 `values.yaml` replicaCount 设 1 止血;用 `client-go` leaderelection Lease 根治,或拆成单副本 Deployment/CronJob | P0 | M(止血 S) | +| 余额 ConfigMap read-modify-write 无冲突重试(`balance_service.go`、`k8s/client.go`) | 并发充值与计费扣费丢更新,余额静默腐蚀,无审计差额 | `Recharge/Deduct/addRechargeRecord/SetOverdueAt` 包入 `retry.RetryOnConflict`,循环内重读重算重写 | P0 | M | +| ProcessBilling 扣费后独立再读余额做停机判断,且丢弃错误(`billing_service.go` `balance, _ := ...GetBalance`) | 基于陈旧/竞态值错误停机:deployment/statefulset 缩 0、删孤儿 pod | 让 `Deduct` 返回写后余额供判断;停止吞错;与上条乐观锁配合 | P0 | M | +| 计费窗口由 `config.Interval` 决定但 ticker 硬编码 1h(`scheduler.go` vs `billing_service.go`) | Interval≠1 时每小时按 Interval 小时扣费,严重超收 | ticker 周期由 Interval 驱动,或固定查 1h 窗口;加校验;记录 last-billed 时间戳防重启重扣 | P1 | S | +| `CalculateDailyConsumption` 分母取最近 100 条任意类型记录的最旧时间(`balance_service.go`) | 烧钱速率被低估约 6x,欠费预计时间高估 | 分母改为窗口内 deduction 记录的实际跨度(≤7 天 + 下限) | P1 | S | +| suspend/resume 依赖 `original-replicas` 注解,二次缩容丢副本数(`billing_service.go`) | 恢复后工作负载回到 0 副本 | suspend 前查 `team.Suspended` 只缩一次;per-object Update 包 RetryOnConflict | P2 | M | +| `GetCostTrend` 按位置索引映射 OpenCost 日桶(`opencost/client.go`) | 成本错位到错误日期或被静默置 0 | 按每桶 `Window.Start` 解析日期建趋势 | P2 | M | + +### 主题 2 · 安全加固 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| 鉴权默认关闭 + admin/admin + 硬编码 JWS 密钥(`config.go`) | 财务+集群控制面开箱即用零鉴权;token 可伪造 | 默认开启鉴权;启动时拒绝内置默认密钥/密码;无密钥时随机生成 | P0 | S | +| `helm upgrade` 每次 `randAlphaNum` 重生成密码与 JWT(`secret.yaml`) | 每次升级踢出所有用户、admin 密码静默改变 | 用 `lookup` 复用已存在 Secret,仅首装生成;可加 `resource-policy: keep` | P0 | S | +| 登录无限流、明文非常量时间比较(`auth.go`) | admin 密码可全速暴力破解 | per-IP 限流 + 退避;`crypto/subtle.ConstantTimeCompare` | P1 | S | +| CORS `Allow-Origin: *` 且允许 Authorization 头(`main.go`) | 任意站点可发起带凭证跨域请求 | 收紧为可配置 allowlist(Bison UI origin) | P2 | S | +| Onboarding/控制面 SSH 配置零校验(`onboarding.go`) | 配合鉴权关闭可在节点/控制面跑攻击者命令 | 校验 host/user 非空、host allowlist/CIDR、端口范围、脚本约束;限管理员可达 | P1 | M | +| API ClusterRole 过权:clusterrolebindings 写、node patch、namespace 增删(`rbac.yaml`) | api-server 
被攻破≈集群管理员提权 | 去掉 clusterrolebindings 写(除非必需),node 写与最宽 verb 用 values 开关收口,拆 ClusterRole + per-ns Role | P2 | M | + +### 主题 3 · 后端性能与 K8s/OpenCost 访问模式 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| client-go 默认 5 QPS 无 informer 缓存(`k8s/client.go`) | 请求在客户端限流后串行,延迟随集群规模线性增长 | 设 `config.QPS=50`/`Burst=100`;为 namespaces/pods/nodes/tenants 引入 SharedInformerFactory + lister | P1 | L | +| OpenCost 查询无缓存,仪表盘多端点重复查同窗口(`opencost/client.go`、`stats.go`) | 并发用户成倍放大 OpenCost 负载,仪表盘慢 | 加 30-60s TTL 缓存(按 window+aggregate+filter)+ singleflight 合并并发 | P1 | M | +| `TenantService.List` 每团队按 namespace 逐个列 pod(`tenant_service.go`) | `/teams`、`/stats/overview`、`/stats/quota-alerts` 每次 O(团队×ns×pod) | 单次集群级 ListPods 后内存分桶;或 `?usage=true` 让用量按需 | P1 | M | +| `ListTeams` 逐团队算 OpenCost 用量后丢弃(`team.go` `_ = usage`) | 纯浪费的 N 次 OpenCost+tenant 扫描 | 删除该循环,或单次 `GetTeamUsage` 合并入响应 | P1 | S | +| 计费 nsToTeam 逐团队 `ListByTeam` 且吞错(`billing_service.go`) | 列举失败的团队被静默不计费 | 单次 `ListTenants` 用 status.namespaces 构图;记录而非吞错 | P1 | S | +| `calculateCost` 每行重读资源配置(`billing_service.go`) | 每个 allocation 一次 ConfigMap 读 | 每次计费操作读一次配置,预建价格表传入 | P1 | S | +| Summary/团队账单每 namespace 一次 OpenCost 查询(`report_service.go`、`billing_service.go`) | T×P 次串行 HTTP,报表生成随项目数变慢 | 一次 `GetAllocationByNamespace(window)` 建 ns→allocation 映射后内存聚合 | P2 | M | +| 列表无分页(`team.go`/`project.go`/`user.go`) | 大部署返回大而慢的载荷且无法分页 | 仿 `audit.go` 加 page/pageSize + total | P2 | M | +| 无 per-request/调度器超时(`main.go`、`k8s/client.go`) | K8s/OpenCost 卡住时 goroutine 与连接泄漏 | 中间件包 `context.WithTimeout`(20-25s);设 `rest.Config.Timeout`;调度每轮加超时 | P2 | M | +| 调度器 goroutine 无 panic 恢复/无首跑/无 jitter(`scheduler.go`) | 一次 panic 拖垮整进程;多副本同刻齐发踩踏 | 每任务 defer/recover + 随机 jitter + 启动后首跑 | P1 | S | + +### 主题 4 · 前端性能与健壮性 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| 无路由级代码分割,echarts 全量进主包(`App.tsx`、`NodeDetail.tsx`、`vite.config.ts`) | 所有会话首屏都下载只 NodeDetail 用的 ~1MB echarts | 路由 `React.lazy`+`Suspense`;`build.rollupOptions.output.manualChunks` 拆 
react/antd/echarts | P1 | M | +| 无 ErrorBoundary,query 读错误未处理 | 渲染异常白屏;读失败页面静默空白/无重试 | 顶层 ErrorBoundary 包裹应用根组件;共享 `isError/error`→Alert/Result+refetch | P1 | M | +| `@ant-design/pro-components` 声明却零引用(2.7MB) | 拖慢安装、有误入包风险 | 从 `package.json` 移除并更新 CLAUDE.md 表述 | P1 | S | +| ProjectList/TeamList 逐行 N+1 query 无分页门(`ProjectList.tsx`、`TeamList.tsx`) | 50 项目=100 次后端调用/页 | 用分页切片驱动 useQueries,或批量后端端点,至少加 `enabled` | P2 | M | +| Auth/Theme context value 每次渲染新对象(`AuthContext.tsx`、`ThemeContext.tsx`、`main.tsx`) | 所有消费者及整套 antd 主题级联重渲染 | provider value 用 useMemo;`main.tsx` theme token useMemo([isDark]) | P2 | S | +| NodeDetail 每次渲染重建 8+ echarts option(`NodeDetail.tsx`,无 useMemo) | 隐藏 tab 也计算时序 map | 各 option useMemo 按 metric 切片;可 `destroyInactiveTabPane` | P2 | M | +| Dashboard 6 路轮询冗余、全局 staleTime 30s 偏激进 | 后台稳定网络负载 | `refetchIntervalInBackground:false`;合并低频概览轮询;分级 staleTime | P3 | S | +| Dashboard 列定义/渲染闭包每次(含轮询)重建(`Dashboard/index.tsx`) | 每次轮询全表重渲染 | 列数组 useMemo、helper useCallback | P3 | S | + +### 主题 5 · 前端质量、可访问性与一致性 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| 错误提取串复制 13 处、无 utils 目录 | 后端错误信封一变需改 13 处 | 加 `src/utils/error.ts` 的 `getApiErrorMessage(err, fallback)` | P2 | S | +| `error:any` 12 处破坏 strict 模式 | 在处理不可信响应的路径上失去类型安全 | 改 `unknown`/`AxiosError<{error?:string}>`;eslint 禁 explicit-any | P2 | S | +| 硬编码十六进制色不随暗色主题(`Dashboard`、`ResourceQuotaInput`、`NodeDetail` echarts) | 暗色下 `#999` 文本/弱阴影/图表轴对比度差 | 用 `theme.useToken()` 或 `var(--*)`;echarts 读 `isDark` 派生轴/文本色 | P2 | M | +| `<a>` 无 href 共 10 处 | 非 tab 可达、无法 Enter/Space、不能新开 | 用 react-router `<Link>` 或 `Typography.Link` | P2 | S | +| Dashboard 硬编码 `$` 而非 `currencySymbol` | CNY/¥ 部署仪表盘显示 $ | 取 billingConfig + 抽 `formatCurrency(value,symbol)` | P2 | S | +| dayjs 插件/locale 每文件重复 bootstrap(3 文件) | 第四个页面易漏配 | 在 `main.tsx`/`lib/dayjs.ts` 统一一次 | P3 | S | +| 残留 `console.log`(`ResourceConfig.tsx`) | 生产控制台泄漏内部数据形状 | 删除;加 eslint `no-console`(留 warn/error) | P3 | S | +| 无 i18n 层,UI 文案全内联中文 | 无法本地化/集中措辞 | 若需多语引
react-i18next;否则至少集中重复标签/toast 前缀 | P3 | L | + +### 主题 6 · 文档与安装链路正确性 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| OCI chart 路径错误 `bison/bison`(应 `charts/bison`) | 主推荐安装命令 not found 全失败 | 全量替换为 `oci://ghcr.io/supermarioyl/charts/bison`;加 CI grep 校验与 release.yml 一致 | P0 | S | +| 健康检查文档 `/api/v1/health` 不存在(实际 `/healthz`/`/readyz`) | 首装验证步骤返回 404,误判安装失败 | 改 `curl .../healthz` | P1 | S | +| 版本停留 0.0.2/镜像 0.0.1(实际 0.0.11) | 装到陈旧版本、示例内部不一致 | 改用 VERSION 变量/最新版;release 工作流模板化注入防漂移 | P1 | S | +| intro.md Option A 用 github.io helm repo(CI 从不发布该 index) | 第一个安装方法即失败 | 重写为 OCI;清理别名命令 | P0 | S | +| values 键名虚构:`opencost.url`/`apiServer.replicas`/`auth.oidc`/`clusterName` | `--set` 静默 no-op:OpenCost 错指、计费无数据 | 改正键名;移除/标 roadmap 的 oidc 与 clusterName;加 helm template 校验 CI | P0 | M | +| OpenCost 命名空间不一致:chart 默认 `opencost` vs 官网 `opencost-system` | 默认 URL 触不到文档安装位置,计费显示 0 | 统一为 chart 默认 `opencost` | P1 | S | +| features.md 夸大未实现能力(OIDC/Email/Excel-PDF/插件) | 卖点接触即失败、生支持工单 | 标 planned 或移除,对齐 README 真实清单 | P1 | M | +| 配置默认虚构(`BILLING_INTERVAL=10m`、`auth.admin.password=admin`) | 运维误以为 10m 节奏与默认密码 | 删/实现该 env;密码默认改空/自动生成;逐列对照 values.yaml | P2 | S | +| CHANGELOG 停在 0.0.1、installation.md 对象名错(`bison-api-server`/`bison-webui`,实际 `bison-api`/`bison-web`)、孤儿 `docs/architecture.html` | 误导与维护腐烂 | 回填 0.0.6-0.0.11;改正对象名;删 html 或入构建 | P2 | S-M | + +### 主题 7 · 官网本次迭代 + +详见下方独立「网站本次迭代清单」章节。 + +### 主题 8 · DevOps / 发布管线 / 部署可用性 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| release 工作流无测试/lint 门(`release.yml`,仅 helm lint) | 破损代码可被打成正式 Release | 加 gating job 跑 `go test -race`/`vet`/`npm lint`/`npm test`(`workflow_call` 复用),build-and-push `needs:` 它 | P0 | S | +| web-ui nginx envsubst 失效 + 只读根 FS 会 CrashLoop(`nginx.conf`、`values.yaml`、`web-deployment.yaml`) | 默认生产配置下 Web UI 无法启动/连后端 | 改 `nginx-unprivileged`/listen 8080 + `pid /tmp` + emptyDir 挂载;`${API_BASE_URL}` 走 templates | P0 | M | +| helm upgrade 轮换密钥(见主题 2,列此联动) | 升级即登出全员 | `lookup` 复用 Secret | P0 | S | +| Dockerfile/CI 删 
package-lock 再 install(`web-ui/Dockerfile`、`build-test.yml`) | 构建不可复现、缓存失效 | 修根因后全用 `npm ci` | P2 | S | +| 镜像无扫描/SBOM/签名(`release.yml` 已声明 id-token 却未接) | 漏洞基础层流向用户、无法验真 | build-push 加 `provenance/sbom`,cosign 无密钥签名,PR 跑非阻断 Trivy | P2 | M | +| 缺 PDB/HPA/NetworkPolicy(replicaCount:2) | 同时驱逐两副本致停机 | 加可选 PDB(minAvailable:1)/HPA/NetworkPolicy,values 开关 | P2 | M | +| 基础镜像未按 digest 固定,alpine:3.19 近 EOL | OS 层不可复现、积累 CVE | 按 `@sha256:` 固定并升 alpine;Renovate/Dependabot 维护;api-server 考虑 distroless | P2 | S | +| GitHub Actions 用 `latest`/浮动 major | CI 行为非确定、供应链风险 | helm 固定具体版本、actions 钉 SHA;启 github-actions Dependabot | P3 | S | +| Chart 无 kubeVersion 与 values.schema.json | 不支持集群晚失败、values 拼写静默忽略 | 加 `kubeVersion:'>=1.22.0-0'` 与 values.schema.json;`helm lint --strict` | P3 | S | +| api-server Dockerfile `COPY . .` 无 .dockerignore | 构建上下文大、禁用 Go 缓存 | 加 .dockerignore;去 `-a -installsuffix cgo`;BuildKit cache mount | P3 | S | + +### 主题 9 · 测试覆盖 + +| 问题 | 影响 | 建议 | 优先级 | 工作量 | +|---|---|---|---|---| +| 后端 0 个 `*_test.go`,所有金额逻辑无测试 | 符号/边界/重构错误可静默错账上线 | 表驱动单测 + fake clientset,优先 `calculateCost`/`Recharge`/`Deduct`/`isGracePeriodExpired`/`CalculateDailyConsumption` | P0 | L(首批 M) | +| CI 测试任务 vacuous 通过报绿 0% 覆盖(`build-test.yml`) | 对评审制造虚假信心 | 加最低覆盖门;检测关键包零测试文件即失败 | P0 | S | +| 并发扣费/充值无 `-race` 测试 | 丢更新竞态不可捕获 | N 并发 Recharge/Deduct 对 fake clientset 断言终值==操作和 | P1 | M | +| 计费 deduct+复读 分支、grace/suspend 单位逻辑无测试 | 错阈值/单位错停付费租户工作负载 | 覆盖正余额/grace 内/过 grace(hours&days)/恢复 四分支 | P1 | M | +| recharge 校验仅在 handler binding,`SetAutoRechargeConfig` 无正值检查 | 负自动充值额每 tick 静默扣费 | 加正值检查 + 边界/NaN 测试 | P1 | S | +| 前端仅占位测试,BillingConfig/api.ts/recharge 0% | money-facing UI 无输入校验/格式/错误保护 | 删 example.test 换真测试 + vitest 覆盖门 | P2 | L | + +--- + +## Top 优先级 + +> 排序原则:先「低工作量、止血资金/安全/可用性」的快赢,再「打基础」的战略项。 + +**快赢档(P0,可立即排期)** + +1. **调度器 leader election(先 replicaCount=1 止血)** —— 消除按副本数重复扣费。止血 S,根治 M。 +2. **ConfigMap 余额包 `RetryOnConflict` + 乐观并发** —— 杜绝余额丢更新/腐蚀,是金额准确性的地基。M。 +3. 
**修正安装文档致命错误**(OCI `charts/bison`、`/healthz`、版本 0.0.11、OpenCost 命名空间/键名)。S。 +4. **鉴权安全基线**(默认开启 + 拒绝默认密钥/密码 + helm `lookup` 持久化 Secret + 登录限流 + CORS 收紧)。S。 +5. **修复 web-ui 部署可用性**(nginx envsubst + 只读根 FS → unprivileged/8080 + emptyDir)。M。 +6. **release 工作流加测试/lint 门 + CI 覆盖门**。S。 +7. **官网本次迭代快赢三连**(修 `v3.0.0` 版本号 → emoji 改 Tabler SVG → 社交卡 1200x630)。S-M。 + +**战略档(P1,打基础)** + +8. **首批后端单元测试**(`calculateCost` 最高价值,叠加 `Recharge`/`Deduct`/计费分支)。M。 +9. **前端 `React.lazy` + Vite `manualChunks` 拆 echarts,并加顶层 ErrorBoundary**。M。 +10. **K8s client QPS/Burst + 热点 List informer 缓存 + OpenCost 短 TTL+singleflight 缓存**。L。 + +--- + +## 网站本次迭代清单 + +> 官网(Docusaurus,`website/`,线上 `bison.lei6393.com`)。第 1 项是第 7 项截图的前置条件。✅ = 本次迭代已落地。 + +1. ✅ **修正版本号(前置项)**:`web-ui/src/layouts/BasicLayout.tsx` 硬编码的 `v3.0.0` 改为从 `package.json` 经 Vite `define`(`__APP_VERSION__`)注入。 +2. ✅ **emoji 图标改 Tabler 内联 SVG**:`HomepageFeatures` 的 🔐💰📊🚀⚡🎯 改为 `Icons/` 内联 SVG(shield-lock / currency-dollar / dashboard / rocket / bolt / shield-check),`stroke=currentColor` 适配暗色。 +3. ✅ **UseCases / hero emoji 替换**:🤖🏢💵 与 ❌/✅/→ 改内联 SVG;hero 按钮去 🚀/⭐。 +4. ✅ **Homepage i18n**:6 张 feature 卡 + showcase 标签包 `<Translate>` 并补 `i18n/zh-Hans/code.json` 中文。 +5. ✅ **补内联 SVG 图表/示意图**:新增 `ProductShowcase` 组件(资源总览/集群节点/报表中心/计费配置 四屏全矢量渲染),呼应「多用 SVG 图栏展示功能」。 +6. **刷新陈旧内容**:在 `features.md` 与 `HomepageFeatures` 增补已发布但缺失的能力——「集群与节点管理」「自动化节点 Onboarding」;校正 features.md 夸大项。 +7. **补缺失截图**:本次以矢量 `ProductShowcase` 替代(无可用集群拍摄真实截图);待有环境后补 Reports/Audit/Cluster/Project/Settings/Login 真实截图。 +8. **修正官网安装文档**:`installation.md`/`intro.md` 的 OCI 路径、OpenCost 命名空间、values 键名、对象名、版本号。 +9. **社交 / OG 卡**:换 1200×630,`docusaurus.config.ts` 补 `twitter:card=summary_large_image` 与 `og:image:width/height`。 +10. **SEO 收尾**:确认 google/baidu 验证标签、robots.txt、sitemap 完整;清理默认脚手架(blog 样例、plushie banner、boilerplate README)。 +11.
**ParticleBackground 可访问性**:加 `prefers-reduced-motion` 短路与离屏暂停 rAF。 + +--- + +## 后续持续优化方向 + +### 近期(本季度,止血与开箱即用) + +- **资金正确性收口**:完成 leader election + ConfigMap 乐观并发 + `Deduct` 返回写后余额 + 计费窗口/Interval 校验 + `CalculateDailyConsumption` 分母修复,让一笔钱在任何并发与重启下都不丢、不重复、不基于陈旧值停机。 +- **安全基线**:鉴权默认开启 + 启动拒绝默认密钥 + helm Secret 持久化 + 登录限流 + CORS allowlist。 +- **开箱即用**:修复 web-ui nginx/只读根 FS CrashLoop;全量修正安装文档(OCI 路径、healthz、版本、values 键名、OpenCost 命名空间)。 +- **CI 门禁**:release 工作流接测试/lint 门 + 最低覆盖门,并补首批后端金额单测(`calculateCost`/`Recharge`/`Deduct`/计费分支 + `-race` 并发测试)。 +- **官网本次迭代**:执行上方 11 项清单。 +- **快赢清理**:删 `@ant-design/pro-components`、去 `ListTeams` 丢弃用量循环、计费 `calculateCost` 配置改读一次。 + +### 中期(规模化与体验) + +- **后端规模化**:K8s client QPS/Burst + SharedInformer/lister 缓存热点 List;OpenCost 短 TTL + singleflight 缓存;计费/Summary 改单次 `GetAllocationByNamespace` 聚合;列表加分页;per-request 与调度器超时;调度器 panic 恢复 + jitter。 +- **前端体验**:路由 lazy + manualChunks;顶层 ErrorBoundary + 共享 query 错误 UI;逐行 N+1 query 分页门/批量端点;Context value 与 Dashboard 列/闭包 memo 化;NodeDetail echarts option useMemo;轮询节流。 +- **前端一致性**:`getApiErrorMessage` 工具 + `error: unknown` + eslint 禁 any;硬编码色改主题 token(含 echarts 暗色);`<a>` 改 `<Link>`;`formatCurrency` + 统一 `currencySymbol`;dayjs 统一 bootstrap;去 console.log。 +- **DevOps 供应链**:镜像扫描(Trivy)/SBOM/cosign 签名;npm ci 复现构建;基础镜像 digest 固定 + alpine 升级;PDB/HPA/NetworkPolicy;Chart kubeVersion + values.schema.json。 +- **文档治理**:CHANGELOG 回填并在 release 强制每 tag 有条目;features.md 与代码对齐;删孤儿 architecture.html;配置默认逐列对照 values.yaml。 +- **测试纵深**:补 `ProcessBilling` 分支、suspend/resume、grace 单位、自动充值正值校验测试;前端 api 拦截器/BillingConfig/recharge 表单测试 + vitest 覆盖门。 + +### 远期(架构与平台化) + +- **持久化模型升级**:评估将余额/计费历史从单一共享 ConfigMap 迁移到 per-team key + patch 语义,或引入轻量嵌入式存储/CRD,从根上消除 read-modify-write 竞态与单对象写放大;money 字段考虑整数最小单位避免 float64 精度问题。 +- **RBAC 最小权限**:拆分 ClusterRole + per-namespace Role,去除 clusterrolebindings 写,node 写按 values 开关。 +- **可观测性与多副本水平扩展**:在 leader election + informer 缓存到位后,让 API 真正多副本水平扩展,配 HPA/PDB/NetworkPolicy 与请求级 metrics/trace。 +-
**i18n 平台化**:若多语言成为目标,引入 react-i18next 与 zh-CN 目录,逐步替换内联文案。 +- **能力补全或下线**:对 features.md 中标为 planned 的 OIDC/SSO、Email/SMTP 告警、Excel/PDF 导出、插件化计费规则,按路线图实现或正式从对外材料移除,保持「文档=能力」。 diff --git a/scripts/make-ghcr-public.md b/scripts/make-ghcr-public.md new file mode 100644 index 0000000..7b71804 --- /dev/null +++ b/scripts/make-ghcr-public.md @@ -0,0 +1,89 @@ +# 将 GHCR Package 设为公开 + +由于你无法通过 OCI 路径拉取 Helm chart (`Error: invalid_reference: invalid repository` 或 `403 Forbidden`),这是因为 GitHub Container Registry 的包默认是私有的。 + +## 手动设置 Package 为 Public + +1. **访问 GitHub Packages**: + - 进入 https://github.com/SuperMarioYL?tab=packages + - 或者直接访问仓库首页,点击右侧的 "Packages" + +2. **找到 charts/bison package**: + - 如果存在,点击进入 `charts/bison` package + +3. **修改可见性**: + - 点击 **Package settings** (右上角齿轮图标) + - 滚动到底部找到 **Danger Zone** + - 点击 **Change visibility** + - 选择 **Public** + - 确认更改 + +## 如果找不到 charts/bison Package + +说明 Helm chart 推送到 GHCR 失败了。检查步骤: + +1. **检查 GitHub Actions 运行日志**: + ``` + https://github.com/SuperMarioYL/Bison/actions + ``` + - 找到最近的 "Release" workflow run (v0.0.7) + - 查看 "Publish to Helm Repository (GHCR)" job 的日志 + - 检查是否有错误信息 + +2. 
**常见失败原因**: + - 权限不足 (GITHUB_TOKEN 没有 `packages: write` 权限) + - OCI 路径错误 + - Helm 登录失败 + +## 临时解决方案 + +在 GHCR OCI 路径修复之前,使用 GitHub Releases: + +```bash +# 列出所有可用版本 +curl -s https://api.github.com/repos/SuperMarioYL/Bison/releases | grep tag_name + +# 下载特定版本 +VERSION=0.0.7 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# 安装 +helm install my-bison bison-${VERSION}.tgz + +# 或者创建本地 Helm 仓库 +mkdir -p ~/helm-charts +cp bison-*.tgz ~/helm-charts/ +helm repo index ~/helm-charts/ +helm repo add local-bison ~/helm-charts/ +helm install my-bison local-bison/bison --version ${VERSION} +``` + +## 验证 Package 是否存在于 GHCR + +```bash +# 使用 GitHub API 检查 +curl -H "Authorization: token YOUR_GITHUB_TOKEN" \ + https://api.github.com/users/SuperMarioYL/packages/container/charts%2Fbison/versions + +# 或者尝试拉取(如果是公开的) +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.7 +``` + +## 检查是否需要认证 + +即使设置为 public,某些情况下可能仍需要认证: + +```bash +# 登录 GHCR +echo YOUR_GITHUB_TOKEN | helm registry login ghcr.io -u SuperMarioYL --password-stdin + +# 然后拉取 +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.7 +``` + +## 下一步 + +1. 先检查 v0.0.7 的 GitHub Actions workflow 日志 +2. 确认 Helm chart 推送步骤是否成功 +3. 如果推送成功,设置 package 为 public +4. 如果推送失败,发布新版本 v0.0.8 来测试修复后的配置 diff --git a/web-ui/.dockerignore b/web-ui/.dockerignore new file mode 100644 index 0000000..e619916 --- /dev/null +++ b/web-ui/.dockerignore @@ -0,0 +1,9 @@ +# Keep the Docker build context small and reproducible. +node_modules +dist +coverage +.git +.gitignore +.dockerignore +Dockerfile +npm-debug.log* diff --git a/web-ui/Dockerfile b/web-ui/Dockerfile index abbfa53..5c9668c 100644 --- a/web-ui/Dockerfile +++ b/web-ui/Dockerfile @@ -7,7 +7,8 @@ WORKDIR /app COPY package*.json ./ # Install dependencies -RUN npm ci +# Remove package-lock.json and use npm install to ensure optional deps are installed correctly +RUN rm -f package-lock.json && npm install # Copy source code COPY . . 
diff --git a/web-ui/package-lock.json b/web-ui/package-lock.json index 89696a0..4571970 100644 --- a/web-ui/package-lock.json +++ b/web-ui/package-lock.json @@ -1,15 +1,14 @@ { "name": "bison-web-ui", - "version": "0.0.1", + "version": "0.0.20", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "bison-web-ui", - "version": "0.0.1", + "version": "0.0.20", "dependencies": { "@ant-design/icons": "^5.2.6", - "@ant-design/pro-components": "^2.6.43", "@tanstack/react-query": "^5.17.0", "antd": "^5.12.8", "axios": "^1.6.5", @@ -18,7 +17,8 @@ "echarts-for-react": "^3.0.2", "react": "^18.2.0", "react-dom": "^18.2.0", - "react-router-dom": "^6.21.1" + "react-router-dom": "^6.21.1", + "tslib": "^2.6.0" }, "devDependencies": { "@testing-library/jest-dom": "^6.9.1", @@ -143,267 +143,6 @@ "version": "4.4.2", "license": "MIT" }, - "node_modules/@ant-design/pro-card": { - "version": "2.10.0", - "license": "MIT", - "dependencies": { - "@ant-design/cssinjs": "^1.21.1", - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "classnames": "^2.3.2", - "rc-resize-observer": "^1.0.0", - "rc-util": "^5.4.0" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-components": { - "version": "2.8.10", - "license": "MIT", - "dependencies": { - "@ant-design/pro-card": "2.10.0", - "@ant-design/pro-descriptions": "2.6.10", - "@ant-design/pro-field": "3.1.0", - "@ant-design/pro-form": "2.32.0", - "@ant-design/pro-layout": "7.22.7", - "@ant-design/pro-list": "2.6.10", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-skeleton": "2.2.1", - "@ant-design/pro-table": "3.21.0", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.16.3" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-descriptions": { - 
"version": "2.6.10", - "license": "MIT", - "dependencies": { - "@ant-design/pro-field": "3.1.0", - "@ant-design/pro-form": "2.32.0", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-skeleton": "2.2.1", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "rc-resize-observer": "^0.2.3", - "rc-util": "^5.0.6" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-descriptions/node_modules/rc-resize-observer": { - "version": "0.2.6", - "license": "MIT", - "dependencies": { - "@babel/runtime": "^7.10.1", - "classnames": "^2.2.1", - "rc-util": "^5.0.0", - "resize-observer-polyfill": "^1.5.1" - }, - "peerDependencies": { - "react": ">=16.9.0", - "react-dom": ">=16.9.0" - } - }, - "node_modules/@ant-design/pro-field": { - "version": "3.1.0", - "license": "MIT", - "dependencies": { - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "@chenshuai2144/sketch-color": "^1.0.8", - "classnames": "^2.3.2", - "dayjs": "^1.11.10", - "lodash": "^4.17.21", - "lodash-es": "^4.17.21", - "rc-util": "^5.4.0", - "swr": "^2.0.0" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-form": { - "version": "2.32.0", - "license": "MIT", - "dependencies": { - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-field": "3.1.0", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "@chenshuai2144/sketch-color": "^1.0.7", - "@umijs/use-params": "^1.0.9", - "classnames": "^2.3.2", - "dayjs": "^1.11.10", - "lodash": "^4.17.21", - "lodash-es": "^4.17.21", - "rc-resize-observer": "^1.1.0", - "rc-util": "^5.0.6" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "rc-field-form": ">=1.22.0", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - 
"node_modules/@ant-design/pro-layout": { - "version": "7.22.7", - "license": "MIT", - "dependencies": { - "@ant-design/cssinjs": "^1.21.1", - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "@umijs/route-utils": "^4.0.0", - "@umijs/use-params": "^1.0.9", - "classnames": "^2.3.2", - "lodash": "^4.17.21", - "lodash-es": "^4.17.21", - "path-to-regexp": "8.2.0", - "rc-resize-observer": "^1.1.0", - "rc-util": "^5.0.6", - "swr": "^2.0.0", - "warning": "^4.0.3" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-list": { - "version": "2.6.10", - "license": "MIT", - "dependencies": { - "@ant-design/cssinjs": "^1.21.1", - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-card": "2.10.0", - "@ant-design/pro-field": "3.1.0", - "@ant-design/pro-table": "3.21.0", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "classnames": "^2.3.2", - "dayjs": "^1.11.10", - "rc-resize-observer": "^1.0.0", - "rc-util": "^4.19.0" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-list/node_modules/rc-util": { - "version": "4.21.1", - "license": "MIT", - "dependencies": { - "add-dom-event-listener": "^1.1.0", - "prop-types": "^15.5.10", - "react-is": "^16.12.0", - "react-lifecycles-compat": "^3.0.4", - "shallowequal": "^1.1.0" - } - }, - "node_modules/@ant-design/pro-list/node_modules/react-is": { - "version": "16.13.1", - "license": "MIT" - }, - "node_modules/@ant-design/pro-provider": { - "version": "2.16.2", - "license": "MIT", - "dependencies": { - "@ant-design/cssinjs": "^1.21.1", - "@babel/runtime": "^7.18.0", - "@ctrl/tinycolor": "^3.4.0", - "dayjs": "^1.11.10", - "rc-util": "^5.0.1", - "swr": "^2.0.0" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": 
">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-skeleton": { - "version": "2.2.1", - "license": "MIT", - "dependencies": { - "@babel/runtime": "^7.18.0" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-table": { - "version": "3.21.0", - "license": "MIT", - "dependencies": { - "@ant-design/cssinjs": "^1.21.1", - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-card": "2.10.0", - "@ant-design/pro-field": "3.1.0", - "@ant-design/pro-form": "2.32.0", - "@ant-design/pro-provider": "2.16.2", - "@ant-design/pro-utils": "2.18.0", - "@babel/runtime": "^7.18.0", - "@dnd-kit/core": "^6.0.8", - "@dnd-kit/modifiers": "^6.0.1", - "@dnd-kit/sortable": "^7.0.2", - "@dnd-kit/utilities": "^3.2.1", - "classnames": "^2.3.2", - "dayjs": "^1.11.10", - "lodash": "^4.17.21", - "lodash-es": "^4.17.21", - "rc-resize-observer": "^1.0.0", - "rc-util": "^5.0.1" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "rc-field-form": ">=1.22.0", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, - "node_modules/@ant-design/pro-utils": { - "version": "2.18.0", - "license": "MIT", - "dependencies": { - "@ant-design/icons": "^5.0.0", - "@ant-design/pro-provider": "2.16.2", - "@babel/runtime": "^7.18.0", - "classnames": "^2.3.2", - "dayjs": "^1.11.10", - "lodash": "^4.17.21", - "lodash-es": "^4.17.21", - "rc-util": "^5.0.6", - "safe-stable-stringify": "^2.4.3", - "swr": "^2.0.0" - }, - "peerDependencies": { - "antd": "^4.24.15 || ^5.11.2", - "react": ">=17.0.0", - "react-dom": ">=17.0.0" - } - }, "node_modules/@ant-design/react-slick": { "version": "1.1.2", "license": "MIT", @@ -750,17 +489,6 @@ "node": ">=18" } }, - "node_modules/@chenshuai2144/sketch-color": { - "version": "1.0.9", - "license": "MIT", - "dependencies": { - "reactcss": "^1.2.3", - "tinycolor2": "^1.4.2" - }, - "peerDependencies": { - "react": ">=16.12.0" - } - }, 
"node_modules/@csstools/color-helpers": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-5.1.0.tgz", @@ -896,70 +624,6 @@ "node": ">=18" } }, - "node_modules/@ctrl/tinycolor": { - "version": "3.6.1", - "license": "MIT", - "engines": { - "node": ">=10" - } - }, - "node_modules/@dnd-kit/accessibility": { - "version": "3.1.1", - "license": "MIT", - "dependencies": { - "tslib": "^2.0.0" - }, - "peerDependencies": { - "react": ">=16.8.0" - } - }, - "node_modules/@dnd-kit/core": { - "version": "6.3.1", - "license": "MIT", - "dependencies": { - "@dnd-kit/accessibility": "^3.1.1", - "@dnd-kit/utilities": "^3.2.2", - "tslib": "^2.0.0" - }, - "peerDependencies": { - "react": ">=16.8.0", - "react-dom": ">=16.8.0" - } - }, - "node_modules/@dnd-kit/modifiers": { - "version": "6.0.1", - "license": "MIT", - "dependencies": { - "@dnd-kit/utilities": "^3.2.1", - "tslib": "^2.0.0" - }, - "peerDependencies": { - "@dnd-kit/core": "^6.0.6", - "react": ">=16.8.0" - } - }, - "node_modules/@dnd-kit/sortable": { - "version": "7.0.2", - "license": "MIT", - "dependencies": { - "@dnd-kit/utilities": "^3.2.0", - "tslib": "^2.0.0" - }, - "peerDependencies": { - "@dnd-kit/core": "^6.0.7", - "react": ">=16.8.0" - } - }, - "node_modules/@dnd-kit/utilities": { - "version": "3.2.2", - "license": "MIT", - "dependencies": { - "tslib": "^2.0.0" - }, - "peerDependencies": { - "react": ">=16.8.0" - } - }, "node_modules/@emotion/hash": { "version": "0.8.0", "license": "MIT" @@ -1802,17 +1466,6 @@ "url": "https://opencollective.com/typescript-eslint" } }, - "node_modules/@umijs/route-utils": { - "version": "4.0.3", - "license": "MIT" - }, - "node_modules/@umijs/use-params": { - "version": "1.0.9", - "license": "MIT", - "peerDependencies": { - "react": "*" - } - }, "node_modules/@ungap/structured-clone": { "version": "1.3.0", "dev": true, @@ -2027,13 +1680,6 @@ "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, - "node_modules/add-dom-event-listener": { - 
"version": "1.1.0", - "license": "MIT", - "dependencies": { - "object-assign": "4.x" - } - }, "node_modules/agent-base": { "version": "7.1.4", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", @@ -2671,6 +2317,7 @@ }, "node_modules/dequal": { "version": "2.0.3", + "dev": true, "license": "MIT", "engines": { "node": ">=6" @@ -3869,14 +3516,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/lodash": { - "version": "4.17.21", - "license": "MIT" - }, - "node_modules/lodash-es": { - "version": "4.17.22", - "license": "MIT" - }, "node_modules/lodash.merge": { "version": "4.6.2", "dev": true, @@ -4103,6 +3742,7 @@ }, "node_modules/object-assign": { "version": "4.1.1", + "dev": true, "license": "MIT", "engines": { "node": ">=0.10.0" @@ -4252,13 +3892,6 @@ "dev": true, "license": "ISC" }, - "node_modules/path-to-regexp": { - "version": "8.2.0", - "license": "MIT", - "engines": { - "node": ">=16" - } - }, "node_modules/path-type": { "version": "4.0.0", "dev": true, @@ -4511,19 +4144,6 @@ "license": "MIT", "peer": true }, - "node_modules/prop-types": { - "version": "15.8.1", - "license": "MIT", - "dependencies": { - "loose-envify": "^1.4.0", - "object-assign": "^4.1.1", - "react-is": "^16.13.1" - } - }, - "node_modules/prop-types/node_modules/react-is": { - "version": "16.13.1", - "license": "MIT" - }, "node_modules/proxy-from-env": { "version": "1.1.0", "license": "MIT" @@ -5118,10 +4738,6 @@ "version": "18.3.1", "license": "MIT" }, - "node_modules/react-lifecycles-compat": { - "version": "3.0.4", - "license": "MIT" - }, "node_modules/react-refresh": { "version": "0.17.0", "dev": true, @@ -5158,13 +4774,6 @@ "react-dom": ">=16.8" } }, - "node_modules/reactcss": { - "version": "1.2.3", - "license": "MIT", - "dependencies": { - "lodash": "^4.0.1" - } - }, "node_modules/read-cache": { "version": "1.0.0", "dev": true, @@ -5331,13 +4940,6 @@ "queue-microtask": "^1.2.2" } }, - "node_modules/safe-stable-stringify": { - "version": 
"2.5.0", - "license": "MIT", - "engines": { - "node": ">=10" - } - }, "node_modules/safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", @@ -5383,10 +4985,6 @@ "node": ">=10" } }, - "node_modules/shallowequal": { - "version": "1.1.0", - "license": "MIT" - }, "node_modules/shebang-command": { "version": "2.0.0", "dev": true, @@ -5665,17 +5263,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/swr": { - "version": "2.3.8", - "license": "MIT", - "dependencies": { - "dequal": "^2.0.3", - "use-sync-external-store": "^1.6.0" - }, - "peerDependencies": { - "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" - } - }, "node_modules/symbol-tree": { "version": "3.2.4", "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", @@ -5809,10 +5396,6 @@ "dev": true, "license": "MIT" }, - "node_modules/tinycolor2": { - "version": "1.6.0", - "license": "MIT" - }, "node_modules/tinyexec": { "version": "0.3.2", "resolved": "https://registry.npmjs.org/tinyexec/-/tinyexec-0.3.2.tgz", @@ -5981,6 +5564,8 @@ }, "node_modules/tslib": { "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "license": "0BSD" }, "node_modules/type-check": { @@ -6054,13 +5639,6 @@ "punycode": "^2.1.0" } }, - "node_modules/use-sync-external-store": { - "version": "1.6.0", - "license": "MIT", - "peerDependencies": { - "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" - } - }, "node_modules/util-deprecate": { "version": "1.0.2", "dev": true, @@ -6246,13 +5824,6 @@ "node": ">=18" } }, - "node_modules/warning": { - "version": "4.0.3", - "license": "MIT", - "dependencies": { - "loose-envify": "^1.0.0" - } - }, "node_modules/webidl-conversions": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.0.tgz", diff --git 
a/web-ui/package.json b/web-ui/package.json index 1d4bdd1..54483e1 100644 --- a/web-ui/package.json +++ b/web-ui/package.json @@ -1,6 +1,6 @@ { "name": "bison-web-ui", - "version": "0.0.1", + "version": "0.0.27", "private": true, "scripts": { "dev": "vite", @@ -13,7 +13,6 @@ }, "dependencies": { "@ant-design/icons": "^5.2.6", - "@ant-design/pro-components": "^2.6.43", "@tanstack/react-query": "^5.17.0", "antd": "^5.12.8", "axios": "^1.6.5", @@ -22,7 +21,8 @@ "echarts-for-react": "^3.0.2", "react": "^18.2.0", "react-dom": "^18.2.0", - "react-router-dom": "^6.21.1" + "react-router-dom": "^6.21.1", + "tslib": "^2.6.0" }, "devDependencies": { "@testing-library/jest-dom": "^6.9.1", diff --git a/web-ui/src/App.tsx b/web-ui/src/App.tsx index 1c4d22c..cead6fc 100644 --- a/web-ui/src/App.tsx +++ b/web-ui/src/App.tsx @@ -1,49 +1,70 @@ -import React from 'react'; +import React, { Suspense, lazy } from 'react'; import { Routes, Route, Navigate } from 'react-router-dom'; +import { Spin } from 'antd'; import BasicLayout from './layouts/BasicLayout'; -import Login from './pages/Login'; -import Dashboard from './pages/Dashboard'; -import ProjectList from './pages/Project/ProjectList'; -import ProjectCreate from './pages/Project/ProjectCreate'; -import ProjectDetail from './pages/Project/ProjectDetail'; -import ClusterNodes from './pages/Cluster/ClusterNodes'; -import NodeDetail from './pages/Cluster/NodeDetail'; -import TeamList from './pages/Team/TeamList'; -import TeamCreate from './pages/Team/TeamCreate'; -import TeamDetail from './pages/Team/TeamDetail'; -import UserList from './pages/User/UserList'; -import UserDetail from './pages/User/UserDetail'; -import AuditList from './pages/Audit/AuditList'; -import ReportCenter from './pages/Report/ReportCenter'; -import Settings from './pages/Settings'; import ProtectedRoute from './components/ProtectedRoute'; +import { useFeatures } from './hooks/useFeatures'; + +// Route-level code splitting: each page (and its heavy deps such as 
echarts on +// the node-detail page) is only downloaded when first visited. +const Login = lazy(() => import('./pages/Login')); +const Dashboard = lazy(() => import('./pages/Dashboard')); +const ProjectList = lazy(() => import('./pages/Project/ProjectList')); +const ProjectCreate = lazy(() => import('./pages/Project/ProjectCreate')); +const ProjectDetail = lazy(() => import('./pages/Project/ProjectDetail')); +const ClusterNodes = lazy(() => import('./pages/Cluster/ClusterNodes')); +const NodeDetail = lazy(() => import('./pages/Cluster/NodeDetail')); +const TeamList = lazy(() => import('./pages/Team/TeamList')); +const TeamCreate = lazy(() => import('./pages/Team/TeamCreate')); +const TeamDetail = lazy(() => import('./pages/Team/TeamDetail')); +const UserList = lazy(() => import('./pages/User/UserList')); +const UserDetail = lazy(() => import('./pages/User/UserDetail')); +const AuditList = lazy(() => import('./pages/Audit/AuditList')); +const ReportCenter = lazy(() => import('./pages/Report/ReportCenter')); +const Settings = lazy(() => import('./pages/Settings')); + +const PageFallback: React.FC = () => ( +
+ +
+); const App: React.FC = () => { + const { data: features } = useFeatures(); + return ( - - } /> - - - - }> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - - + }> + + } /> + + + + }> + } /> + } /> + {features?.capsuleEnabled !== false && ( + <> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + + )} + } /> + } /> + {features?.costEnabled !== false && ( + } /> + )} + } /> + } /> + + + ); }; diff --git a/web-ui/src/components/ErrorBoundary.tsx b/web-ui/src/components/ErrorBoundary.tsx new file mode 100644 index 0000000..91504ef --- /dev/null +++ b/web-ui/src/components/ErrorBoundary.tsx @@ -0,0 +1,58 @@ +import React from 'react'; +import { Button, Result } from 'antd'; + +interface Props { + children: React.ReactNode; +} + +interface State { + hasError: boolean; + error?: Error; +} + +/** + * Top-level error boundary so a render-time exception in any page shows a + * recoverable fallback instead of a blank white screen. + */ +class ErrorBoundary extends React.Component { + constructor(props: Props) { + super(props); + this.state = { hasError: false }; + } + + static getDerivedStateFromError(error: Error): State { + return { hasError: true, error }; + } + + componentDidCatch(error: Error, info: React.ErrorInfo) { + // eslint-disable-next-line no-console + console.error('Unhandled UI error:', error, info.componentStack); + } + + handleReset = () => { + this.setState({ hasError: false, error: undefined }); + }; + + render() { + if (this.state.hasError) { + return ( + + 重试 + , + , + ]} + /> + ); + } + return this.props.children; + } +} + +export default ErrorBoundary; diff --git a/web-ui/src/components/NodeOnboardingModal.tsx b/web-ui/src/components/NodeOnboardingModal.tsx new file mode 100644 index 0000000..b7c1029 --- /dev/null +++ b/web-ui/src/components/NodeOnboardingModal.tsx @@ -0,0 +1,183 @@ +import React, { useState } from 'react'; +import { + Modal, + Form, + Input, + InputNumber, + Select, + 
Alert, + Typography, +} from 'antd'; +import { useMutation } from '@tanstack/react-query'; +import { startNodeOnboarding, OnboardingRequest } from '../services/api'; + +const { TextArea } = Input; +const { Text } = Typography; + +interface NodeOnboardingModalProps { + open: boolean; + onClose: () => void; + onStarted: (jobId: string) => void; +} + +const NodeOnboardingModal: React.FC = ({ + open, + onClose, + onStarted, +}) => { + const [form] = Form.useForm(); + const [authMethod, setAuthMethod] = useState<'password' | 'privateKey'>('password'); + + const startMutation = useMutation({ + mutationFn: startNodeOnboarding, + onSuccess: (response) => { + form.resetFields(); + onStarted(response.data.id); + }, + }); + + const handleSubmit = () => { + form.validateFields().then(values => { + const request: OnboardingRequest = { + nodeIP: values.nodeIP, + sshPort: values.sshPort || 22, + sshUsername: values.sshUsername, + authMethod: values.authMethod, + password: values.authMethod === 'password' ? values.password : undefined, + privateKey: values.authMethod === 'privateKey' ? values.privateKey : undefined, + }; + startMutation.mutate(request); + }); + }; + + const handleClose = () => { + form.resetFields(); + setAuthMethod('password'); + onClose(); + }; + + // IP address validation + const validateIP = (_: unknown, value: string) => { + if (!value) { + return Promise.reject(new Error('请输入节点 IP')); + } + // Simple IP format validation + const ipRegex = /^(\d{1,3}\.){3}\d{1,3}$/; + if (!ipRegex.test(value)) { + return Promise.reject(new Error('请输入有效的 IP 地址')); + } + const parts = value.split('.').map(Number); + if (parts.some(p => p > 255)) { + return Promise.reject(new Error('请输入有效的 IP 地址')); + } + return Promise.resolve(); + }; + + return ( + + +
  • 目标节点已安装操作系统(Ubuntu/CentOS 等)
  • +
  • 目标节点已安装 kubeadm、kubelet、kubectl
  • +
  • 目标节点网络可达,支持 SSH 连接
  • + + } + type="info" + showIcon + style={{ marginBottom: 24 }} + /> + + {startMutation.isError && ( + + )} + +
    + + + + + + + + + + + + + + + + + {authMethod === 'password' ? ( + + + + ) : ( + 将私钥内容粘贴到此处} + > +