From bc00eeb55652e171cd93988598645fa2d7e1bf82 Mon Sep 17 00:00:00 2001 From: Leoy Date: Sat, 27 Dec 2025 11:25:03 +0800 Subject: [PATCH 01/44] Create CNAME --- CNAME | 1 + 1 file changed, 1 insertion(+) create mode 100644 CNAME diff --git a/CNAME b/CNAME new file mode 100644 index 0000000..def6a64 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +www.lei6393.com \ No newline at end of file From 28eae9d42226fd5233e73e57d0e1f7fc796aebad Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 27 Dec 2025 11:38:12 +0800 Subject: [PATCH 02/44] fix bug --- web-ui/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web-ui/Dockerfile b/web-ui/Dockerfile index abbfa53..5c9668c 100644 --- a/web-ui/Dockerfile +++ b/web-ui/Dockerfile @@ -7,7 +7,8 @@ WORKDIR /app COPY package*.json ./ # Install dependencies -RUN npm ci +# Remove package-lock.json and use npm install to ensure optional deps are installed correctly +RUN rm -f package-lock.json && npm install # Copy source code COPY . . 
From df9a923795271b4deba56bf61516ff414078d5a5 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 27 Dec 2025 11:44:32 +0800 Subject: [PATCH 03/44] fix bug --- .github/workflows/deploy-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 428461d..45abd44 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -10,7 +10,7 @@ on: workflow_dispatch: permissions: - contents: read + contents: write pages: write id-token: write From a364f581671c8d9f1a8f4b4714768c41f295d7d8 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 27 Dec 2025 14:43:56 +0800 Subject: [PATCH 04/44] fix bug --- .github/workflows/deploy-docs.yml | 118 +++++++++++++---------------- .github/workflows/release.yml | 119 +++++++++++++++++++----------- 2 files changed, 127 insertions(+), 110 deletions(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 45abd44..5317e9d 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -10,7 +10,7 @@ on: workflow_dispatch: permissions: - contents: write + contents: read pages: write id-token: write @@ -19,13 +19,38 @@ concurrency: cancel-in-progress: false jobs: - build-docs: - name: Build Docusaurus + deploy-docs: + name: Build and Deploy Documentation runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: - - name: Checkout code + - name: Checkout repository uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Fetch current Pages deployment + continue-on-error: true + run: | + # Clone gh-pages branch to preserve charts/ + if git ls-remote --exit-code --heads origin gh-pages; then + echo "Fetching existing gh-pages branch..." 
+ git clone --single-branch --branch gh-pages --depth 1 \ + https://github.com/${{ github.repository }}.git gh-pages-current + + # Preserve charts/ directory if it exists + if [ -d "gh-pages-current/charts" ]; then + cp -r gh-pages-current/charts charts-backup + echo "Preserved charts/ directory" + fi + else + echo "No existing gh-pages branch, skipping preservation" + fi + - name: Setup Node.js uses: actions/setup-node@v4 with: @@ -43,75 +68,34 @@ jobs: cd website npm run build - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: docusaurus-build - path: website/build/ - - deploy-docs: - name: Deploy to GitHub Pages - needs: build-docs - runs-on: ubuntu-latest - steps: - - name: Checkout gh-pages branch (or create it) - uses: actions/checkout@v4 - with: - ref: gh-pages - fetch-depth: 0 - continue-on-error: true - - - name: Initialize gh-pages if checkout failed + - name: Prepare deployment run: | - if [ ! -d ".git" ]; then - echo "gh-pages branch doesn't exist, creating it..." - git init - git checkout -b gh-pages - git remote add origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git - fi + # Create deployment directory + mkdir -p _site - - name: Download docs build - uses: actions/download-artifact@v4 - with: - name: docusaurus-build - path: docs-temp/ + # Copy Docusaurus build output + cp -r website/build/* _site/ - - name: Preserve Helm charts and deploy docs - run: | - # Backup existing charts/ directory if it exists - if [ -d "charts" ]; then - echo "Backing up existing charts/ directory" - mv charts charts-backup + # Restore charts/ if it existed + if [ -d "charts-backup" ]; then + cp -r charts-backup _site/charts + echo "Restored charts/ directory" fi - # Clear everything except charts-backup and .git - find . -maxdepth 1 ! -name charts-backup ! -name .git ! -name . ! -name .. 
-exec rm -rf {} + - - # Move Docusaurus build to root - if [ -d "docs-temp" ] && [ "$(ls -A docs-temp)" ]; then - mv docs-temp/* docs-temp/.* . 2>/dev/null || mv docs-temp/* . - rm -rf docs-temp - else - echo "Warning: docs-temp directory is empty or doesn't exist" + # Copy CNAME for custom domain + if [ -f "CNAME" ]; then + cp CNAME _site/CNAME + echo "Added CNAME file" fi - # Restore charts/ directory - if [ -d "charts-backup" ]; then - echo "Restoring charts/ directory" - mv charts-backup charts - fi + # Ensure .nojekyll exists (should already be in build) + touch _site/.nojekyll - - name: Configure Git - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: '_site' - - name: Commit and push - run: | - git add -A - if ! git diff --cached --quiet; then - git commit -m "Deploy documentation from commit ${{ github.sha }}" - git push origin gh-pages --force - else - echo "No changes to commit" - fi + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8fb8205..33945c8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -215,11 +215,20 @@ jobs: name: Publish to Helm Repository (GitHub Pages) runs-on: ubuntu-latest needs: [prepare, create-release] + permissions: + contents: read + pages: write + id-token: write + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: - - name: Checkout code + - name: Checkout repository uses: actions/checkout@v4 - with: - fetch-depth: 0 + + - name: Setup Pages + uses: actions/configure-pages@v5 - name: Download Helm chart uses: actions/download-artifact@v4 @@ -232,41 +241,51 @@ jobs: with: version: 'latest' - - name: Configure Git + - name: Fetch current Pages deployment + continue-on-error: true run: | - git 
config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - - name: Publish to GitHub Pages (charts/ subdirectory) - env: - VERSION: ${{ needs.prepare.outputs.version }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Clone or create gh-pages branch + # Clone gh-pages branch to preserve documentation if git ls-remote --exit-code --heads origin gh-pages; then - echo "gh-pages branch exists, cloning..." - git clone --single-branch --branch gh-pages https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git gh-pages - else - echo "gh-pages branch does not exist, creating..." - mkdir gh-pages - cd gh-pages - git init - git checkout -b gh-pages - git remote add origin https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git + echo "Fetching existing gh-pages branch..." + git clone --single-branch --branch gh-pages --depth 1 \ + https://github.com/${{ github.repository }}.git gh-pages-current + + # Create backup of everything except charts/ and .git + mkdir -p docs-backup + cd gh-pages-current + for item in *; do + if [ "$item" != "charts" ] && [ "$item" != ".git" ]; then + cp -r "$item" ../docs-backup/ + fi + done + # Also copy hidden files (except .git) + for item in .[^.]*; do + if [ "$item" != ".git" ] && [ -e "$item" ]; then + cp -r "$item" ../docs-backup/ + fi + done cd .. 
+ + echo "Preserved documentation files" + else + echo "No existing gh-pages branch, skipping preservation" fi - # Create charts/ directory if it doesn't exist - mkdir -p gh-pages/charts + - name: Prepare Helm repository + env: + VERSION: ${{ needs.prepare.outputs.version }} + run: | + # Create deployment directory structure + mkdir -p _site/charts - # Copy Helm chart to gh-pages/charts/ - cp bison-${VERSION}.tgz gh-pages/charts/ + # Copy Helm chart + cp bison-${VERSION}.tgz _site/charts/ - # Generate or update index.yaml in charts/ directory - cd gh-pages/charts + # Generate index.yaml in charts/ directory + cd _site/charts helm repo index . --url https://${{ github.repository_owner }}.github.io/Bison/charts/ - # Create README in charts/ directory if it doesn't exist + # Create README.md in charts/ directory if needed if [ ! -f README.md ]; then cat < README.md # Bison Helm Chart Repository @@ -297,20 +316,34 @@ jobs: See [index.yaml](./index.yaml) for all available versions. EOF fi + cd ../.. - # Return to gh-pages root - cd .. 
- - # Configure Git - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" + - name: Merge with documentation + run: | + # Restore documentation if it existed + if [ -d "docs-backup" ]; then + cp -r docs-backup/* _site/ + # Also copy hidden files + if ls docs-backup/.[^.]* 1> /dev/null 2>&1; then + cp -r docs-backup/.[^.]* _site/ + fi + echo "Restored documentation files" + fi - # Commit and push - git add charts/ - if git diff --cached --quiet; then - echo "No changes to commit" - else - git commit -m "Release Helm chart v${VERSION}" - git push origin gh-pages - echo "Successfully published Helm chart to GitHub Pages at /charts/" + # Copy CNAME for custom domain + if [ -f "CNAME" ]; then + cp CNAME _site/CNAME + echo "Added CNAME file" fi + + # Ensure .nojekyll exists + touch _site/.nojekyll + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: '_site' + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 From 9f28a971ca620bb30c2d98eb4a6a54b79ea450c8 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 27 Dec 2025 14:58:38 +0800 Subject: [PATCH 05/44] fix bug --- CNAME | 1 - 1 file changed, 1 deletion(-) delete mode 100644 CNAME diff --git a/CNAME b/CNAME deleted file mode 100644 index def6a64..0000000 --- a/CNAME +++ /dev/null @@ -1 +0,0 @@ -www.lei6393.com \ No newline at end of file From a6cd0c900d64d91e0b759a969c40da438de78ba7 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 27 Dec 2025 15:00:47 +0800 Subject: [PATCH 06/44] fix bug --- .github/workflows/deploy-docs.yml | 6 ------ .github/workflows/release.yml | 6 ------ 2 files changed, 12 deletions(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 5317e9d..326aa4a 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -82,12 +82,6 @@ jobs: echo "Restored charts/ directory" fi - # Copy CNAME for custom 
domain - if [ -f "CNAME" ]; then - cp CNAME _site/CNAME - echo "Added CNAME file" - fi - # Ensure .nojekyll exists (should already be in build) touch _site/.nojekyll diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 33945c8..99d32b3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -330,12 +330,6 @@ jobs: echo "Restored documentation files" fi - # Copy CNAME for custom domain - if [ -f "CNAME" ]; then - cp CNAME _site/CNAME - echo "Added CNAME file" - fi - # Ensure .nojekyll exists touch _site/.nojekyll From 8a2861e2e1e005b7082de8946926029816fd5397 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 27 Dec 2025 15:35:18 +0800 Subject: [PATCH 07/44] fix bug --- .github/workflows/deploy-docs.yml | 3 --- .github/workflows/release.yml | 3 --- 2 files changed, 6 deletions(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 326aa4a..926734d 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -22,9 +22,6 @@ jobs: deploy-docs: name: Build and Deploy Documentation runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} steps: - name: Checkout repository diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 99d32b3..c68a446 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -219,9 +219,6 @@ jobs: contents: read pages: write id-token: write - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} steps: - name: Checkout repository From 36ca34ad9d5a14c194a95a7b57723084dfbb2b67 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 11:51:00 +0800 Subject: [PATCH 08/44] fix release bug --- .github/workflows/release.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c68a446..cedc869 100644 --- 
a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -219,6 +219,9 @@ jobs: contents: read pages: write id-token: write + concurrency: + group: "pages" + cancel-in-progress: false steps: - name: Checkout repository From 3ab9af14aaffd7bc42e790b71536789eb9c1473b Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 13:17:36 +0800 Subject: [PATCH 09/44] fix helm bug --- .github/workflows/deploy-docs.yml | 24 ------- .github/workflows/release.yml | 113 ++---------------------------- README.md | 18 +++-- deploy/charts/bison/README.md | 76 ++++++++++++++++++++ deploy/charts/bison/README_CN.md | 86 +++++++++++++++++++++++ docs/README_CN.md | 32 ++++++++- 6 files changed, 207 insertions(+), 142 deletions(-) create mode 100644 deploy/charts/bison/README.md create mode 100644 deploy/charts/bison/README_CN.md diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 926734d..ef0f15c 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -30,24 +30,6 @@ jobs: - name: Setup Pages uses: actions/configure-pages@v5 - - name: Fetch current Pages deployment - continue-on-error: true - run: | - # Clone gh-pages branch to preserve charts/ - if git ls-remote --exit-code --heads origin gh-pages; then - echo "Fetching existing gh-pages branch..." 
- git clone --single-branch --branch gh-pages --depth 1 \ - https://github.com/${{ github.repository }}.git gh-pages-current - - # Preserve charts/ directory if it exists - if [ -d "gh-pages-current/charts" ]; then - cp -r gh-pages-current/charts charts-backup - echo "Preserved charts/ directory" - fi - else - echo "No existing gh-pages branch, skipping preservation" - fi - - name: Setup Node.js uses: actions/setup-node@v4 with: @@ -73,12 +55,6 @@ jobs: # Copy Docusaurus build output cp -r website/build/* _site/ - # Restore charts/ if it existed - if [ -d "charts-backup" ]; then - cp -r charts-backup _site/charts - echo "Restored charts/ directory" - fi - # Ensure .nojekyll exists (should already be in build) touch _site/.nojekyll diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cedc869..29586b1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -212,24 +212,17 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} publish-helm-repo: - name: Publish to Helm Repository (GitHub Pages) + name: Publish to Helm Repository (GHCR) runs-on: ubuntu-latest needs: [prepare, create-release] permissions: contents: read - pages: write - id-token: write - concurrency: - group: "pages" - cancel-in-progress: false + packages: write steps: - name: Checkout repository uses: actions/checkout@v4 - - name: Setup Pages - uses: actions/configure-pages@v5 - - name: Download Helm chart uses: actions/download-artifact@v4 with: @@ -241,103 +234,11 @@ jobs: with: version: 'latest' - - name: Fetch current Pages deployment - continue-on-error: true - run: | - # Clone gh-pages branch to preserve documentation - if git ls-remote --exit-code --heads origin gh-pages; then - echo "Fetching existing gh-pages branch..." 
- git clone --single-branch --branch gh-pages --depth 1 \ - https://github.com/${{ github.repository }}.git gh-pages-current - - # Create backup of everything except charts/ and .git - mkdir -p docs-backup - cd gh-pages-current - for item in *; do - if [ "$item" != "charts" ] && [ "$item" != ".git" ]; then - cp -r "$item" ../docs-backup/ - fi - done - # Also copy hidden files (except .git) - for item in .[^.]*; do - if [ "$item" != ".git" ] && [ -e "$item" ]; then - cp -r "$item" ../docs-backup/ - fi - done - cd .. - - echo "Preserved documentation files" - else - echo "No existing gh-pages branch, skipping preservation" - fi - - - name: Prepare Helm repository + - name: Log in to GitHub Container Registry + run: echo "${{ secrets.GITHUB_TOKEN }}" | helm registry login ghcr.io -u ${{ github.actor }} --password-stdin + + - name: Push Helm chart to GHCR env: VERSION: ${{ needs.prepare.outputs.version }} run: | - # Create deployment directory structure - mkdir -p _site/charts - - # Copy Helm chart - cp bison-${VERSION}.tgz _site/charts/ - - # Generate index.yaml in charts/ directory - cd _site/charts - helm repo index . --url https://${{ github.repository_owner }}.github.io/Bison/charts/ - - # Create README.md in charts/ directory if needed - if [ ! -f README.md ]; then - cat < README.md - # Bison Helm Chart Repository - - ## Usage - - Add the Helm repository: - - \`\`\`bash - helm repo add bison https://${{ github.repository_owner }}.github.io/Bison/charts/ - helm repo update - \`\`\` - - Search for available charts: - - \`\`\`bash - helm search repo bison - \`\`\` - - Install the chart: - - \`\`\`bash - helm install my-bison bison/bison --version ${VERSION} - \`\`\` - - ## Available Versions - - See [index.yaml](./index.yaml) for all available versions. - EOF - fi - cd ../.. 
- - - name: Merge with documentation - run: | - # Restore documentation if it existed - if [ -d "docs-backup" ]; then - cp -r docs-backup/* _site/ - # Also copy hidden files - if ls docs-backup/.[^.]* 1> /dev/null 2>&1; then - cp -r docs-backup/.[^.]* _site/ - fi - echo "Restored documentation files" - fi - - # Ensure .nojekyll exists - touch _site/.nojekyll - - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: '_site' - - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 + helm push bison-${VERSION}.tgz oci://ghcr.io/${{ github.repository_owner }}/bison diff --git a/README.md b/README.md index 17866cd..fdbc983 100644 --- a/README.md +++ b/README.md @@ -237,29 +237,27 @@ helm install opencost opencost/opencost -n opencost --create-namespace \ ### 2. Deploy Bison -#### Option A: Using Helm Repository (Recommended) +#### Option A: Using GHCR (Recommended - OCI Format) ```bash -# Add Bison Helm repository -helm repo add bison https://supermarioyl.github.io/Bison/charts/ -helm repo update - -# Install Bison -helm install bison bison/bison \ +# Install directly from GitHub Container Registry +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ --namespace bison-system \ --create-namespace \ --set auth.enabled=true \ - --version 0.0.1 + --version 0.0.2 ``` +> **Note:** Requires Helm >= 3.8.0 for OCI support + #### Option B: From GitHub Release ```bash # Download Helm chart from GitHub Release -wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.1/bison-0.0.1.tgz +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz # Install from downloaded chart -helm install bison bison-0.0.1.tgz \ +helm install bison bison-0.0.2.tgz \ --namespace bison-system \ --create-namespace \ --set auth.enabled=true diff --git a/deploy/charts/bison/README.md b/deploy/charts/bison/README.md new file mode 100644 index 0000000..e64685f --- /dev/null +++ b/deploy/charts/bison/README.md 
@@ -0,0 +1,76 @@ +# Bison Helm Chart + +Kubernetes-based GPU Resource Billing and Scheduling Platform + +## Installation + +### From GHCR (Recommended) + +Install directly from GitHub Container Registry using OCI format: + +```bash +# Install specific version +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 + +# Or pull first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install my-bison bison-0.0.2.tgz +``` + +**Requirements:** +- Helm >= 3.8.0 (for OCI support) + +### From GitHub Releases + +Download the chart from [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) and install locally: + +```bash +# Download from release page +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz + +# Install +helm install my-bison bison-0.0.2.tgz +``` + +## Prerequisites + +Before installing Bison, ensure the following dependencies are installed: + +1. **Capsule** - Multi-tenant management + ```bash + helm install capsule projectcapsule/capsule -n capsule-system --create-namespace + ``` + +2. **OpenCost** - Cost tracking + ```bash + helm install opencost opencost/opencost -n opencost --create-namespace + ``` + +3. **Prometheus** - Metrics collection + ```bash + helm install prometheus prometheus-community/kube-prometheus-stack -n monitoring --create-namespace + ``` + +## Configuration + +See [values.yaml](./values.yaml) for all configuration options. 
+ +### Basic Configuration + +```bash +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ + --set apiServer.replicas=2 \ + --set webUI.replicas=2 +``` + +## Uninstall + +```bash +helm uninstall my-bison +``` + +## More Information + +- [Project Homepage](https://supermarioyl.github.io/Bison/) +- [Documentation](https://supermarioyl.github.io/Bison/docs/) +- [GitHub Repository](https://github.com/SuperMarioYL/Bison) diff --git a/deploy/charts/bison/README_CN.md b/deploy/charts/bison/README_CN.md new file mode 100644 index 0000000..7cc5411 --- /dev/null +++ b/deploy/charts/bison/README_CN.md @@ -0,0 +1,86 @@ +# Bison Helm Chart + +基于 Kubernetes 的 GPU 资源计费与调度平台 + +## 安装 + +### 从 GHCR 安装(推荐) + +直接从 GitHub Container Registry 使用 OCI 格式安装: + +```bash +# 安装指定版本 +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 + +# 或者先拉取,再安装 +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install my-bison bison-0.0.2.tgz +``` + +**要求:** +- Helm >= 3.8.0(支持 OCI) + +### 从 GitHub Releases 安装 + +从 [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) 下载 chart 并本地安装: + +```bash +# 从 release 页面下载 +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz + +# 安装 +helm install my-bison bison-0.0.2.tgz +``` + +## 前置条件 + +安装 Bison 前,请确保已安装以下依赖: + +1. **Capsule** - 多租户管理 + ```bash + helm install capsule projectcapsule/capsule -n capsule-system --create-namespace + ``` + +2. **OpenCost** - 成本追踪 + ```bash + helm install opencost opencost/opencost -n opencost --create-namespace + ``` + +3. 
**Prometheus** - 指标收集 + ```bash + helm install prometheus prometheus-community/kube-prometheus-stack -n monitoring --create-namespace + ``` + +## 配置 + +所有配置选项请查看 [values.yaml](./values.yaml)。 + +### 基础配置 + +```bash +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ + --set apiServer.replicas=2 \ + --set webUI.replicas=2 +``` + +### 常用配置项 + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `apiServer.replicas` | API 服务器副本数 | `1` | +| `webUI.replicas` | Web UI 副本数 | `1` | +| `auth.enabled` | 启用认证 | `false` | +| `opencost.url` | OpenCost API 地址 | `http://opencost.opencost:9003` | + +## 卸载 + +```bash +helm uninstall my-bison +``` + +## 更多信息 + +- [项目主页](https://supermarioyl.github.io/Bison/) +- [文档](https://supermarioyl.github.io/Bison/docs/) +- [GitHub 仓库](https://github.com/SuperMarioYL/Bison) +- [English README](./README.md) diff --git a/docs/README_CN.md b/docs/README_CN.md index 154539c..72a8013 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -237,10 +237,38 @@ helm install opencost opencost/opencost -n opencost --create-namespace \ ### 2. 
部署 Bison +#### 方式 A: 使用 GHCR (推荐 - OCI 格式) + +```bash +# 直接从 GitHub Container Registry 安装 +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set auth.enabled=true \ + --version 0.0.2 +``` + +> **注意:** 需要 Helm >= 3.8.0 以支持 OCI + +#### 方式 B: 从 GitHub Release + +```bash +# 从 GitHub Release 下载 Helm chart +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz + +# 从下载的 chart 安装 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace \ + --set auth.enabled=true +``` + +#### 方式 C: 从源码 + ```bash # 克隆并部署 -git clone https://github.com/your-org/bison.git -cd bison +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison helm install bison ./deploy/charts/bison \ --namespace bison-system \ From 00921726755dabe2463c5f4ec68d834c4bb627f3 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 17:06:14 +0800 Subject: [PATCH 10/44] fix document --- .github/workflows/cleanup-docs-version.yml | 136 ++++++++++ .github/workflows/release.yml | 22 +- .github/workflows/version-docs.yml | 118 +++++++++ README.md | 60 +++-- deploy/charts/bison/README.md | 27 +- website/docs/installation.md | 44 ++-- website/i18n/zh-Hans/code.json | 156 ++++++++++++ .../components/ArchitectureDiagram/index.tsx | 160 ++++++++++++ .../ArchitectureDiagram/styles.module.css | 183 ++++++++++++++ .../src/components/HomepageFeatures/index.tsx | 9 +- .../HomepageFeatures/styles.module.css | 119 ++++++++- .../components/ParticleBackground/index.tsx | 101 ++++++++ .../ParticleBackground/styles.module.css | 9 + website/src/components/StatsSection/index.tsx | 124 +++++++++ .../components/StatsSection/styles.module.css | 101 ++++++++ website/src/components/UseCases/index.tsx | 236 ++++++++++++++++++ .../src/components/UseCases/styles.module.css | 207 +++++++++++++++ website/src/css/custom.css | 143 +++++++++++ website/src/pages/index.module.css | 68 +++++ 
website/src/pages/index.tsx | 42 +++- 20 files changed, 2009 insertions(+), 56 deletions(-) create mode 100644 .github/workflows/cleanup-docs-version.yml create mode 100644 .github/workflows/version-docs.yml create mode 100644 website/src/components/ArchitectureDiagram/index.tsx create mode 100644 website/src/components/ArchitectureDiagram/styles.module.css create mode 100644 website/src/components/ParticleBackground/index.tsx create mode 100644 website/src/components/ParticleBackground/styles.module.css create mode 100644 website/src/components/StatsSection/index.tsx create mode 100644 website/src/components/StatsSection/styles.module.css create mode 100644 website/src/components/UseCases/index.tsx create mode 100644 website/src/components/UseCases/styles.module.css diff --git a/.github/workflows/cleanup-docs-version.yml b/.github/workflows/cleanup-docs-version.yml new file mode 100644 index 0000000..6161df5 --- /dev/null +++ b/.github/workflows/cleanup-docs-version.yml @@ -0,0 +1,136 @@ +name: Cleanup Documentation Version + +on: + release: + types: [deleted] + +permissions: + contents: write + +concurrency: + group: docs-versioning + cancel-in-progress: false + +jobs: + remove-version: + name: Remove Documentation Version + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Extract version from tag + id: version + run: | + # Extract version from tag (v0.0.3 -> 0.0.3) + VERSION=${GITHUB_REF#refs/tags/v} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Removing documentation version: $VERSION" + + - name: Check if version exists + id: check + run: | + VERSION=${{ steps.version.outputs.version }} + if grep -q "\"${VERSION}\"" website/versions.json; then + echo "exists=true" >> $GITHUB_OUTPUT + echo "✅ Version ${VERSION} found in versions.json" + else + echo "exists=false" >> $GITHUB_OUTPUT + echo "⚠️ Version ${VERSION} not found, nothing to remove" + fi + + - name: Install jq + if: 
steps.check.outputs.exists == 'true' + run: | + sudo apt-get update + sudo apt-get install -y jq + + - name: Remove version from versions.json + if: steps.check.outputs.exists == 'true' + run: | + VERSION=${{ steps.version.outputs.version }} + + echo "Current versions.json:" + cat website/versions.json + + # Remove version from array using jq + jq --arg ver "${VERSION}" 'del(.[] | select(. == $ver))' website/versions.json > website/versions.json.tmp + mv website/versions.json.tmp website/versions.json + + echo "" + echo "Updated versions.json:" + cat website/versions.json + + - name: Remove versioned directories + if: steps.check.outputs.exists == 'true' + run: | + VERSION=${{ steps.version.outputs.version }} + + # Remove versioned documentation directory + if [ -d "website/versioned_docs/version-${VERSION}" ]; then + rm -rf "website/versioned_docs/version-${VERSION}" + echo "✅ Removed versioned_docs/version-${VERSION}/" + fi + + # Remove versioned sidebars file + if [ -f "website/versioned_sidebars/version-${VERSION}-sidebars.json" ]; then + rm -f "website/versioned_sidebars/version-${VERSION}-sidebars.json" + echo "✅ Removed versioned_sidebars/version-${VERSION}-sidebars.json" + fi + + echo "" + echo "Files removed:" + git status --short + + - name: Configure Git + if: steps.check.outputs.exists == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and push changes + if: steps.check.outputs.exists == 'true' + run: | + VERSION=${{ steps.version.outputs.version }} + + git add website/versions.json + git add website/versioned_docs/ + git add website/versioned_sidebars/ + + git commit -m "docs: remove version ${VERSION} [skip ci] + + Auto-cleanup documentation version after release deletion ${GITHUB_REF} + + - Removed version ${VERSION} from versions.json + - Deleted versioned_docs/version-${VERSION}/ + - Deleted versioned_sidebars/version-${VERSION}-sidebars.json" + + 
git push origin main + + echo "✅ Changes pushed to main branch" + echo "📚 Documentation version ${VERSION} has been removed" + + - name: Summary + run: | + VERSION=${{ steps.version.outputs.version }} + EXISTS=${{ steps.check.outputs.exists }} + + echo "## Documentation Cleanup Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Version**: ${VERSION}" >> $GITHUB_STEP_SUMMARY + echo "**Release**: ${GITHUB_REF}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [ "$EXISTS" == "true" ]; then + echo "✅ **Status**: Documentation version removed successfully" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Removed files**:" >> $GITHUB_STEP_SUMMARY + echo "- \`versions.json\` (updated)" >> $GITHUB_STEP_SUMMARY + echo "- \`versioned_docs/version-${VERSION}/\`" >> $GITHUB_STEP_SUMMARY + echo "- \`versioned_sidebars/version-${VERSION}-sidebars.json\`" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ **Status**: Version not found, nothing to remove" >> $GITHUB_STEP_SUMMARY + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 29586b1..4e4176a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -171,14 +171,19 @@ jobs: ### 🚀 Installation - #### Using Helm Repository (Recommended) + **Requirements:** Helm >= 3.8.0, Kubernetes >= 1.22 + + #### Method 1: From GHCR (Recommended) \`\`\`bash - helm repo add bison https://${{ github.repository_owner }}.github.io/Bison/ - helm repo update - helm install my-bison bison/bison --version ${VERSION} + # Install directly from GitHub Container Registry + helm install my-bison oci://ghcr.io/${{ github.repository_owner }}/bison/bison --version ${VERSION} + + # Or pull first, then install + helm pull oci://ghcr.io/${{ github.repository_owner }}/bison/bison --version ${VERSION} + helm install my-bison bison-${VERSION}.tgz \`\`\` - #### Using GitHub Release + #### Method 2: From GitHub Releases \`\`\`bash wget https://github.com/${{ 
github.repository }}/releases/download/v${VERSION}/bison-${VERSION}.tgz helm install my-bison bison-${VERSION}.tgz @@ -195,6 +200,13 @@ jobs: **Full Changelog**: https://github.com/${{ github.repository }}/compare/v${VERSION}...v${VERSION} + ### 📚 Documentation + + Version-specific documentation will be available shortly at: + - 📖 [https://supermarioyl.github.io/Bison/docs/${VERSION}/](https://supermarioyl.github.io/Bison/docs/${VERSION}/) + + > Note: Documentation versioning happens automatically after release. Allow a few minutes for the docs site to update. + --- 🤖 Generated with [GitHub Actions](https://github.com/features/actions) EOF diff --git a/.github/workflows/version-docs.yml b/.github/workflows/version-docs.yml new file mode 100644 index 0000000..b135c2a --- /dev/null +++ b/.github/workflows/version-docs.yml @@ -0,0 +1,118 @@ +name: Version Documentation + +on: + release: + types: [published] + +permissions: + contents: write + +concurrency: + group: docs-versioning + cancel-in-progress: false + +jobs: + create-version: + name: Create Documentation Version + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Extract version from tag + id: version + run: | + # Extract version from tag (v0.0.3 -> 0.0.3) + VERSION=${GITHUB_REF#refs/tags/v} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Creating documentation version: $VERSION" + + - name: Check if version already exists + id: check + run: | + VERSION=${{ steps.version.outputs.version }} + if grep -q "\"${VERSION}\"" website/versions.json; then + echo "exists=true" >> $GITHUB_OUTPUT + echo "⚠️ Version ${VERSION} already exists in versions.json" + else + echo "exists=false" >> $GITHUB_OUTPUT + echo "✅ Version ${VERSION} not found, will create" + fi + + - name: Setup Node.js + if: steps.check.outputs.exists == 'false' + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: 
website/package-lock.json + + - name: Install dependencies + if: steps.check.outputs.exists == 'false' + run: | + cd website + npm ci + + - name: Create documentation version + if: steps.check.outputs.exists == 'false' + run: | + VERSION=${{ steps.version.outputs.version }} + cd website + + echo "Running: npm run docusaurus docs:version ${VERSION}" + npm run docusaurus docs:version ${VERSION} + + echo "✅ Documentation version ${VERSION} created" + echo "" + echo "Files created/modified:" + git status --short + + - name: Configure Git + if: steps.check.outputs.exists == 'false' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and push changes + if: steps.check.outputs.exists == 'false' + run: | + VERSION=${{ steps.version.outputs.version }} + + git add website/versions.json + git add website/versioned_docs/ + git add website/versioned_sidebars/ + + git commit -m "docs: add version ${VERSION} [skip ci] + + Auto-generated documentation version from release ${GITHUB_REF} + + - Added version ${VERSION} to versions.json + - Created versioned_docs/version-${VERSION}/ + - Created versioned_sidebars/version-${VERSION}-sidebars.json" + + git push origin main + + echo "✅ Changes pushed to main branch" + echo "📚 Documentation will be available at: https://supermarioyl.github.io/Bison/docs/${VERSION}/" + + - name: Summary + run: | + VERSION=${{ steps.version.outputs.version }} + EXISTS=${{ steps.check.outputs.exists }} + + echo "## Documentation Versioning Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Version**: ${VERSION}" >> $GITHUB_STEP_SUMMARY + echo "**Release**: ${GITHUB_REF}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [ "$EXISTS" == "true" ]; then + echo "⚠️ **Status**: Version already exists, skipped" >> $GITHUB_STEP_SUMMARY + else + echo "✅ **Status**: Documentation version created successfully" >> $GITHUB_STEP_SUMMARY + 
echo "" >> $GITHUB_STEP_SUMMARY + echo "📚 **View Documentation**: [https://supermarioyl.github.io/Bison/docs/${VERSION}/](https://supermarioyl.github.io/Bison/docs/${VERSION}/)" >> $GITHUB_STEP_SUMMARY + fi diff --git a/README.md b/README.md index fdbc983..b536834 100644 --- a/README.md +++ b/README.md @@ -730,26 +730,52 @@ docker pull ghcr.io/supermarioyl/bison/web-ui:latest - `linux/amd64` - `linux/arm64` -### Helm Repository +### Helm Installation Methods + +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the OCI format, which is the modern standard for Helm 3.8+. + +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +#### Method 1: Install from GHCR (Recommended) ```bash -# Add repository -helm repo add bison https://supermarioyl.github.io/Bison/charts/ +# Install specific version directly from GHCR +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 -# Search available versions -helm search repo bison +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install my-bison bison-0.0.2.tgz + +# Customize installation +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true +``` -# View chart information -helm show chart bison/bison -helm show values bison/bison +#### Method 2: Install from GitHub Releases -# Install specific version -helm install my-bison bison/bison --version 0.0.1 +```bash +# Download chart from GitHub Releases +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz -# Upgrade to latest -helm upgrade my-bison bison/bison +# Install from downloaded file +helm install my-bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace ``` +**Why GHCR OCI Format?** +- ✅ No need for separate Helm 
repository maintenance +- ✅ Unified image and chart management in GHCR +- ✅ Faster installation (direct pull from registry) +- ✅ Better version control and immutability +- ✅ Standard practice for Helm 3.8+ + ## Development ```bash @@ -781,16 +807,16 @@ Bison uses automated GitHub Actions for releases: ``` 2. **GitHub Actions automatically**: - - Builds multi-platform Docker images + - Builds multi-platform Docker images (amd64, arm64) - Pushes images to GitHub Container Registry - Packages Helm chart - - Creates GitHub Release - - Updates Helm repository on GitHub Pages + - Publishes chart to GHCR (OCI format) + - Creates GitHub Release with chart attachment 3. **Verify release**: - Check [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) - - Pull new images: `docker pull ghcr.io/supermarioyl/bison/api-server:3.1.0` - - Update Helm repo: `helm repo update && helm search repo bison` + - Pull new images: `docker pull ghcr.io/supermarioyl/bison/api-server:0.0.2` + - Install chart: `helm install test oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2` ## Project Structure diff --git a/deploy/charts/bison/README.md b/deploy/charts/bison/README.md index e64685f..59bc116 100644 --- a/deploy/charts/bison/README.md +++ b/deploy/charts/bison/README.md @@ -4,7 +4,11 @@ Kubernetes-based GPU Resource Billing and Scheduling Platform ## Installation -### From GHCR (Recommended) +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +### Method 1: From GHCR (Recommended) Install directly from GitHub Container Registry using OCI format: @@ -15,12 +19,23 @@ helm install my-bison oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 # Or pull first, then install helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 helm install my-bison bison-0.0.2.tgz + +# With custom configuration +helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set 
opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true ``` -**Requirements:** -- Helm >= 3.8.0 (for OCI support) +**Why GHCR OCI Format?** +- ✅ No separate Helm repository needed +- ✅ Unified with Docker images in GHCR +- ✅ Faster installation +- ✅ Modern Helm 3.8+ standard -### From GitHub Releases +### Method 2: From GitHub Releases Download the chart from [GitHub Releases](https://github.com/SuperMarioYL/Bison/releases) and install locally: @@ -29,7 +44,9 @@ Download the chart from [GitHub Releases](https://github.com/SuperMarioYL/Bison/ wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz # Install -helm install my-bison bison-0.0.2.tgz +helm install my-bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace ``` ## Prerequisites diff --git a/website/docs/installation.md b/website/docs/installation.md index c9abf61..da89191 100644 --- a/website/docs/installation.md +++ b/website/docs/installation.md @@ -44,39 +44,53 @@ helm install opencost opencost/opencost \ ## Installation Methods -Choose one of the following methods to install Bison: +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the modern OCI format. 
-### Option A: Helm Repository (Recommended) +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 -The simplest way to install Bison is using the official Helm repository: +### Option A: From GHCR (Recommended) + +The simplest way to install Bison is directly from GitHub Container Registry: ```bash -# Add Bison Helm repository -helm repo add bison https://supermarioyl.github.io/Bison/charts/ -helm repo update +# Install specific version from GHCR +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace -# Install with default configuration -helm install bison bison/bison \ +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install bison bison-0.0.2.tgz \ --namespace bison-system \ --create-namespace -# Or customize installation -helm install bison bison/bison \ +# Customize installation +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ --namespace bison-system \ --create-namespace \ --set opencost.url=http://opencost.opencost-system.svc:9003 \ - --set auth.enabled=false \ - --set apiServer.image.tag=0.0.1 \ - --set webUI.image.tag=0.0.1 + --set auth.enabled=true \ + --set apiServer.image.tag=0.0.2 \ + --set webUI.image.tag=0.0.2 ``` +**Why GHCR OCI Format?** +- ✅ No separate Helm repository maintenance needed +- ✅ Unified with Docker images in GHCR +- ✅ Faster installation (direct registry pull) +- ✅ Modern Helm 3.8+ standard practice + ### Option B: From GitHub Release Download a specific version from GitHub Releases: ```bash -# Download latest Helm chart -VERSION=0.0.1 +# Download Helm chart +VERSION=0.0.2 wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz # Install the chart diff --git a/website/i18n/zh-Hans/code.json b/website/i18n/zh-Hans/code.json index e92fe8a..e49f43e 100644 --- a/website/i18n/zh-Hans/code.json +++ 
b/website/i18n/zh-Hans/code.json @@ -325,5 +325,161 @@ "theme.tags.tagsPageTitle": { "message": "标签", "description": "The title of the tag list page" + }, + "component.statsSection.efficiency": { + "message": "GPU 资源效率", + "description": "Label for GPU resource efficiency statistic" + }, + "component.statsSection.deployTime": { + "message": "平均部署时间", + "description": "Label for average deployment time statistic" + }, + "component.statsSection.tenants": { + "message": "支持租户数", + "description": "Label for supported tenants statistic" + }, + "component.statsSection.savings": { + "message": "成本节省", + "description": "Label for cost savings statistic" + }, + "component.architectureDiagram.title": { + "message": "架构概览", + "description": "Architecture diagram section title" + }, + "component.architectureDiagram.subtitle": { + "message": "基于云原生技术构建,具备可扩展性和可靠性", + "description": "Architecture diagram section subtitle" + }, + "component.architectureDiagram.node.bison": { + "message": "GPU 计费与调度平台", + "description": "Description for Bison node" + }, + "component.architectureDiagram.node.capsule": { + "message": "多租户管理", + "description": "Description for Capsule node" + }, + "component.architectureDiagram.node.opencost": { + "message": "成本追踪与分析", + "description": "Description for OpenCost node" + }, + "component.architectureDiagram.node.k8s": { + "message": "容器编排", + "description": "Description for Kubernetes node" + }, + "component.architectureDiagram.node.prometheus": { + "message": "指标收集", + "description": "Description for Prometheus node" + }, + "component.useCases.title": { + "message": "真实应用场景", + "description": "Use cases section title" + }, + "component.useCases.subtitle": { + "message": "了解 Bison 如何在不同场景下转变 GPU 资源管理", + "description": "Use cases section subtitle" + }, + "component.useCases.beforeBison": { + "message": "使用前", + "description": "Before Bison comparison header" + }, + "component.useCases.withBison": { + "message": "使用后", + "description": "With Bison 
comparison header" + }, + "component.useCases.aiTraining.title": { + "message": "AI 训练平台", + "description": "Title for AI training use case" + }, + "component.useCases.aiTraining.description": { + "message": "多团队机器学习工作负载的 GPU 资源共享", + "description": "Description for AI training use case" + }, + "component.useCases.aiTraining.before.manual": { + "message": "手动分配 GPU", + "description": "AI training before: manual allocation" + }, + "component.useCases.aiTraining.before.noCost": { + "message": "成本不可见", + "description": "AI training before: no cost visibility" + }, + "component.useCases.aiTraining.before.conflicts": { + "message": "资源冲突", + "description": "AI training before: resource conflicts" + }, + "component.useCases.aiTraining.after.automated": { + "message": "自动化调度", + "description": "AI training after: automated scheduling" + }, + "component.useCases.aiTraining.after.realtime": { + "message": "实时成本追踪", + "description": "AI training after: real-time cost tracking" + }, + "component.useCases.aiTraining.after.fair": { + "message": "公平资源共享", + "description": "AI training after: fair resource sharing" + }, + "component.useCases.enterprise.title": { + "message": "企业云平台", + "description": "Title for enterprise cloud use case" + }, + "component.useCases.enterprise.description": { + "message": "部门级资源隔离与计费", + "description": "Description for enterprise cloud use case" + }, + "component.useCases.enterprise.before.chaos": { + "message": "共享集群混乱", + "description": "Enterprise before: shared cluster chaos" + }, + "component.useCases.enterprise.before.noBudget": { + "message": "无预算控制", + "description": "Enterprise before: no budget control" + }, + "component.useCases.enterprise.before.manual": { + "message": "手动报告", + "description": "Enterprise before: manual reporting" + }, + "component.useCases.enterprise.after.isolated": { + "message": "租户隔离", + "description": "Enterprise after: isolated tenants" + }, + "component.useCases.enterprise.after.prepaid": { + "message": "预付费余额", 
+ "description": "Enterprise after: prepaid balances" + }, + "component.useCases.enterprise.after.automated": { + "message": "自动化报表", + "description": "Enterprise after: automated reports" + }, + "component.useCases.billing.title": { + "message": "成本中心计费", + "description": "Title for cost center billing use case" + }, + "component.useCases.billing.description": { + "message": "内部 GPU 资源的计费回扣系统", + "description": "Description for cost center billing use case" + }, + "component.useCases.billing.before.excel": { + "message": "基于 Excel 跟踪", + "description": "Billing before: Excel-based tracking" + }, + "component.useCases.billing.before.monthly": { + "message": "月度对账", + "description": "Billing before: monthly reconciliation" + }, + "component.useCases.billing.before.disputes": { + "message": "账单纠纷", + "description": "Billing before: billing disputes" + }, + "component.useCases.billing.after.realtime": { + "message": "实时扣费", + "description": "Billing after: real-time deduction" + }, + "component.useCases.billing.after.transparent": { + "message": "透明定价", + "description": "Billing after: transparent pricing" + }, + "component.useCases.billing.after.automated": { + "message": "自动开票", + "description": "Billing after: automated invoicing" } } diff --git a/website/src/components/ArchitectureDiagram/index.tsx b/website/src/components/ArchitectureDiagram/index.tsx new file mode 100644 index 0000000..a2aba96 --- /dev/null +++ b/website/src/components/ArchitectureDiagram/index.tsx @@ -0,0 +1,160 @@ +import type {ReactNode} from 'react'; +import {translate} from '@docusaurus/Translate'; +import Translate from '@docusaurus/Translate'; +import Heading from '@theme/Heading'; +import styles from './styles.module.css'; + +interface ArchNode { + id: string; + label: string; + description: string; + color: string; +} + +const nodes: ArchNode[] = [ + { + id: 'bison', + label: 'Bison', + description: translate({ + id: 'component.architectureDiagram.node.bison', + message: 'GPU Billing & 
Scheduling Platform', + description: 'Description for Bison node', + }), + color: '#0A84FF', + }, + { + id: 'capsule', + label: 'Capsule', + description: translate({ + id: 'component.architectureDiagram.node.capsule', + message: 'Multi-Tenant Management', + description: 'Description for Capsule node', + }), + color: '#5E5CE6', + }, + { + id: 'opencost', + label: 'OpenCost', + description: translate({ + id: 'component.architectureDiagram.node.opencost', + message: 'Cost Tracking & Analytics', + description: 'Description for OpenCost node', + }), + color: '#BF5AF2', + }, + { + id: 'k8s', + label: 'Kubernetes', + description: translate({ + id: 'component.architectureDiagram.node.k8s', + message: 'Container Orchestration', + description: 'Description for Kubernetes node', + }), + color: '#326CE5', + }, + { + id: 'prometheus', + label: 'Prometheus', + description: translate({ + id: 'component.architectureDiagram.node.prometheus', + message: 'Metrics Collection', + description: 'Description for Prometheus node', + }), + color: '#E6522C', + }, +]; + +export default function ArchitectureDiagram(): ReactNode { + return ( +
+
+
+ + + Architecture Overview + + +

+ + Built on cloud-native technologies for scalability and reliability + +

+
+ +
+ + + + + + + + + + + + + + + + {/* Connections */} + + + + + + + + + {/* Bison */} + + + Bison + + + {/* Capsule */} + + + Capsule + + + {/* OpenCost */} + + + OpenCost + + + {/* Kubernetes */} + + + Kubernetes + + + {/* Prometheus */} + + + Prometheus + + + +
+ {nodes.map(node => ( +
+
+ {node.label} +
+
{node.description}
+
+ ))} +
+
+
+
+ ); +} diff --git a/website/src/components/ArchitectureDiagram/styles.module.css b/website/src/components/ArchitectureDiagram/styles.module.css new file mode 100644 index 0000000..074c7ed --- /dev/null +++ b/website/src/components/ArchitectureDiagram/styles.module.css @@ -0,0 +1,183 @@ +.architectureSection { + padding: 4rem 0; + background: #ffffff; +} + +[data-theme='dark'] .architectureSection { + background: #0d0d0d; +} + +.sectionTitle { + font-size: 2.5rem; + font-weight: 700; + margin-bottom: 1rem; +} + +.sectionSubtitle { + font-size: 1.125rem; + color: #666; + max-width: 600px; + margin: 0 auto; +} + +[data-theme='dark'] .sectionSubtitle { + color: #b3b3b3; +} + +.diagramContainer { + margin-top: 3rem; +} + +.diagram { + width: 100%; + max-width: 800px; + height: auto; + margin: 0 auto; + display: block; + margin-bottom: 3rem; +} + +.node { + cursor: pointer; + transition: transform 0.3s ease; + animation: nodeAppear 0.6s ease-out backwards; +} + +.node:nth-child(1) { + animation-delay: 0.1s; +} + +.node:nth-child(2) { + animation-delay: 0.2s; +} + +.node:nth-child(3) { + animation-delay: 0.3s; +} + +.node:nth-child(4) { + animation-delay: 0.4s; +} + +.node:nth-child(5) { + animation-delay: 0.5s; +} + +.node:hover { + transform: scale(1.05); +} + +.node rect { + transition: all 0.3s ease; +} + +.node:hover rect { + filter: url(#glow) brightness(1.1); +} + +.connectionLine { + stroke-dasharray: 5, 5; + animation: dashMove 20s linear infinite; +} + +@keyframes dashMove { + to { + stroke-dashoffset: -1000; + } +} + +@keyframes nodeAppear { + from { + opacity: 0; + transform: scale(0.8); + } + to { + opacity: 1; + transform: scale(1); + } +} + +.nodeDescriptions { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 1.5rem; + margin-top: 2rem; +} + +.nodeCard { + padding: 1.5rem; + border-radius: 12px; + border-left: 4px solid; + background: rgba(0, 0, 0, 0.02); + transition: all 0.3s ease; + animation: cardSlideIn 0.6s 
ease-out backwards; +} + +[data-theme='dark'] .nodeCard { + background: rgba(255, 255, 255, 0.05); +} + +.nodeCard:nth-child(1) { + animation-delay: 0.6s; +} + +.nodeCard:nth-child(2) { + animation-delay: 0.7s; +} + +.nodeCard:nth-child(3) { + animation-delay: 0.8s; +} + +.nodeCard:nth-child(4) { + animation-delay: 0.9s; +} + +.nodeCard:nth-child(5) { + animation-delay: 1s; +} + +.nodeCard:hover { + transform: translateX(4px); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1); +} + +.nodeCardTitle { + font-size: 1.125rem; + font-weight: 600; + margin-bottom: 0.5rem; +} + +.nodeCardDescription { + font-size: 0.875rem; + color: #666; +} + +[data-theme='dark'] .nodeCardDescription { + color: #b3b3b3; +} + +@keyframes cardSlideIn { + from { + opacity: 0; + transform: translateX(-20px); + } + to { + opacity: 1; + transform: translateX(0); + } +} + +@media screen and (max-width: 768px) { + .sectionTitle { + font-size: 2rem; + } + + .diagram { + height: 300px; + } + + .nodeDescriptions { + grid-template-columns: 1fr; + } +} diff --git a/website/src/components/HomepageFeatures/index.tsx b/website/src/components/HomepageFeatures/index.tsx index 69b3dd0..4df085e 100644 --- a/website/src/components/HomepageFeatures/index.tsx +++ b/website/src/components/HomepageFeatures/index.tsx @@ -81,10 +81,11 @@ const FeatureList: FeatureItem[] = [ function Feature({title, Svg, icon, description}: FeatureItem) { return ( -
-
- {title} -

{description}

+
+
+ {icon &&
{icon}
} + {title} +

{description}

); diff --git a/website/src/components/HomepageFeatures/styles.module.css b/website/src/components/HomepageFeatures/styles.module.css index b248eb2..c8133e3 100644 --- a/website/src/components/HomepageFeatures/styles.module.css +++ b/website/src/components/HomepageFeatures/styles.module.css @@ -1,11 +1,128 @@ .features { display: flex; align-items: center; - padding: 2rem 0; + padding: 4rem 0; width: 100%; + background: linear-gradient(180deg, #f8f9fa 0%, #ffffff 100%); +} + +[data-theme='dark'] .features { + background: linear-gradient(180deg, #1a1a1a 0%, #0d0d0d 100%); } .featureSvg { height: 200px; width: 200px; } + +.featureCard { + padding: 2rem; + border-radius: 16px; + background: rgba(255, 255, 255, 0.9); + backdrop-filter: blur(10px); + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + border: 1px solid rgba(0, 0, 0, 0.05); + height: 100%; + position: relative; + overflow: hidden; +} + +[data-theme='dark'] .featureCard { + background: rgba(30, 30, 30, 0.8); + border: 1px solid rgba(255, 255, 255, 0.1); +} + +.featureCard::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + height: 3px; + background: linear-gradient(90deg, #0A84FF, #5E5CE6, #BF5AF2); + transform: scaleX(0); + transition: transform 0.4s ease; +} + +.featureCard:hover::before { + transform: scaleX(1); +} + +.featureCard:hover { + transform: translateY(-8px) scale(1.02); + box-shadow: + 0 12px 24px rgba(10, 132, 255, 0.15), + 0 0 0 1px rgba(10, 132, 255, 0.1); +} + +.featureIcon { + font-size: 3rem; + display: inline-block; + transition: transform 0.4s cubic-bezier(0.4, 0, 0.2, 1); + margin-bottom: 1rem; +} + +.featureCard:hover .featureIcon { + transform: scale(1.2) rotate(5deg); +} + +.featureTitle { + margin-bottom: 1rem; + font-weight: 600; + color: #1a1a1a; +} + +[data-theme='dark'] .featureTitle { + color: #ffffff; +} + +.featureDescription { + color: #666; + line-height: 1.6; +} + +[data-theme='dark'] 
.featureDescription { + color: #b3b3b3; +} + +/* Stagger animation on scroll */ +.featureCard { + opacity: 0; + animation: fadeInUp 0.6s ease-out forwards; +} + +.featureCard:nth-child(1) { + animation-delay: 0.1s; +} + +.featureCard:nth-child(2) { + animation-delay: 0.2s; +} + +.featureCard:nth-child(3) { + animation-delay: 0.3s; +} + +.featureCard:nth-child(4) { + animation-delay: 0.4s; +} + +.featureCard:nth-child(5) { + animation-delay: 0.5s; +} + +.featureCard:nth-child(6) { + animation-delay: 0.6s; +} + +@keyframes fadeInUp { + from { + opacity: 0; + transform: translateY(30px); + } + to { + opacity: 1; + transform: translateY(0); + } +} diff --git a/website/src/components/ParticleBackground/index.tsx b/website/src/components/ParticleBackground/index.tsx new file mode 100644 index 0000000..1caddd0 --- /dev/null +++ b/website/src/components/ParticleBackground/index.tsx @@ -0,0 +1,101 @@ +import {useEffect, useRef} from 'react'; +import type {ReactNode} from 'react'; +import styles from './styles.module.css'; + +interface Particle { + x: number; + y: number; + vx: number; + vy: number; + size: number; + opacity: number; +} + +export default function ParticleBackground(): ReactNode { + const canvasRef = useRef(null); + + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + + const ctx = canvas.getContext('2d'); + if (!ctx) return; + + // Set canvas size + const resizeCanvas = () => { + canvas.width = window.innerWidth; + canvas.height = window.innerHeight; + }; + resizeCanvas(); + window.addEventListener('resize', resizeCanvas); + + // Particle settings - fewer particles on mobile + const isMobile = window.innerWidth < 768; + const particleCount = isMobile ? 
30 : 80; + const particles: Particle[] = []; + + // Initialize particles + for (let i = 0; i < particleCount; i++) { + particles.push({ + x: Math.random() * canvas.width, + y: Math.random() * canvas.height, + vx: (Math.random() - 0.5) * 0.5, + vy: (Math.random() - 0.5) * 0.5, + size: Math.random() * 2 + 1, + opacity: Math.random() * 0.5 + 0.2, + }); + } + + // Animation loop + let animationFrameId: number; + const animate = () => { + ctx.clearRect(0, 0, canvas.width, canvas.height); + + particles.forEach((particle, i) => { + // Update position + particle.x += particle.vx; + particle.y += particle.vy; + + // Wrap around edges + if (particle.x < 0) particle.x = canvas.width; + if (particle.x > canvas.width) particle.x = 0; + if (particle.y < 0) particle.y = canvas.height; + if (particle.y > canvas.height) particle.y = 0; + + // Draw particle + ctx.beginPath(); + ctx.arc(particle.x, particle.y, particle.size, 0, Math.PI * 2); + ctx.fillStyle = `rgba(255, 255, 255, ${particle.opacity})`; + ctx.fill(); + + // Draw connections + particles.slice(i + 1).forEach(otherParticle => { + const dx = particle.x - otherParticle.x; + const dy = particle.y - otherParticle.y; + const distance = Math.sqrt(dx * dx + dy * dy); + + if (distance < 120) { + ctx.beginPath(); + ctx.moveTo(particle.x, particle.y); + ctx.lineTo(otherParticle.x, otherParticle.y); + const opacity = (1 - distance / 120) * 0.15; + ctx.strokeStyle = `rgba(255, 255, 255, ${opacity})`; + ctx.lineWidth = 0.5; + ctx.stroke(); + } + }); + }); + + animationFrameId = requestAnimationFrame(animate); + }; + + animate(); + + return () => { + window.removeEventListener('resize', resizeCanvas); + cancelAnimationFrame(animationFrameId); + }; + }, []); + + return ; +} diff --git a/website/src/components/ParticleBackground/styles.module.css b/website/src/components/ParticleBackground/styles.module.css new file mode 100644 index 0000000..13d4b89 --- /dev/null +++ b/website/src/components/ParticleBackground/styles.module.css @@ -0,0 
+1,9 @@ +.particleCanvas { + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + pointer-events: none; + z-index: 1; +} diff --git a/website/src/components/StatsSection/index.tsx b/website/src/components/StatsSection/index.tsx new file mode 100644 index 0000000..91349f7 --- /dev/null +++ b/website/src/components/StatsSection/index.tsx @@ -0,0 +1,124 @@ +import {useEffect, useState, useRef} from 'react'; +import type {ReactNode} from 'react'; +import {translate} from '@docusaurus/Translate'; +import styles from './styles.module.css'; + +interface StatItem { + value: string; + label: string; + suffix?: string; +} + +const stats: StatItem[] = [ + { + value: '99.9', + label: translate({ + id: 'component.statsSection.efficiency', + message: 'GPU Resource Efficiency', + description: 'Label for GPU resource efficiency statistic', + }), + suffix: '%', + }, + { + value: '30', + label: translate({ + id: 'component.statsSection.deployTime', + message: 'Avg Deploy Time', + description: 'Label for average deployment time statistic', + }), + suffix: ' min', + }, + { + value: '1000', + label: translate({ + id: 'component.statsSection.tenants', + message: 'Supported Tenants', + description: 'Label for supported tenants statistic', + }), + suffix: '+', + }, + { + value: '40', + label: translate({ + id: 'component.statsSection.savings', + message: 'Cost Savings', + description: 'Label for cost savings statistic', + }), + suffix: '%+', + }, +]; + +function CountUpNumber({end, suffix = '', duration = 2000}: {end: number; suffix?: string; duration?: number}): ReactNode { + const [count, setCount] = useState(0); + const [hasAnimated, setHasAnimated] = useState(false); + const ref = useRef(null); + + useEffect(() => { + const observer = new IntersectionObserver( + entries => { + if (entries[0].isIntersecting && !hasAnimated) { + setHasAnimated(true); + + const startTime = Date.now(); + const startValue = 0; + + const animate = () => { + const now = Date.now(); + 
const progress = Math.min((now - startTime) / duration, 1); + + // Easing function (ease-out cubic) + const easeOut = 1 - Math.pow(1 - progress, 3); + const current = startValue + (end - startValue) * easeOut; + + setCount(current); + + if (progress < 1) { + requestAnimationFrame(animate); + } else { + setCount(end); + } + }; + + animate(); + } + }, + {threshold: 0.3} + ); + + if (ref.current) { + observer.observe(ref.current); + } + + return () => observer.disconnect(); + }, [end, duration, hasAnimated]); + + return ( + + {end % 1 === 0 ? Math.floor(count) : count.toFixed(1)} + {suffix} + + ); +} + +export default function StatsSection(): ReactNode { + return ( +
+
+
+ {stats.map((stat, index) => ( +
+
+ +
+
{stat.label}
+
+ ))} +
+
+
+ ); +} diff --git a/website/src/components/StatsSection/styles.module.css b/website/src/components/StatsSection/styles.module.css new file mode 100644 index 0000000..b0ee418 --- /dev/null +++ b/website/src/components/StatsSection/styles.module.css @@ -0,0 +1,101 @@ +.statsSection { + padding: 4rem 0; + background: linear-gradient(135deg, #0A84FF 0%, #5E5CE6 100%); + position: relative; + overflow: hidden; +} + +.statsSection::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: + radial-gradient(circle at 10% 20%, rgba(255, 255, 255, 0.1) 0%, transparent 40%), + radial-gradient(circle at 90% 80%, rgba(255, 255, 255, 0.1) 0%, transparent 40%); + animation: shimmer 10s ease-in-out infinite; +} + +@keyframes shimmer { + 0%, 100% { + opacity: 0.5; + } + 50% { + opacity: 1; + } +} + +.statsGrid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 3rem; + position: relative; + z-index: 1; +} + +.statItem { + text-align: center; + color: white; + animation: fadeInScale 0.8s ease-out backwards; +} + +.statItem:nth-child(1) { + animation-delay: 0.1s; +} + +.statItem:nth-child(2) { + animation-delay: 0.2s; +} + +.statItem:nth-child(3) { + animation-delay: 0.3s; +} + +.statItem:nth-child(4) { + animation-delay: 0.4s; +} + +.statValue { + font-size: 3.5rem; + font-weight: 700; + line-height: 1.2; + margin-bottom: 0.5rem; + text-shadow: 0 2px 10px rgba(0, 0, 0, 0.2); + font-variant-numeric: tabular-nums; +} + +.statLabel { + font-size: 1rem; + font-weight: 500; + opacity: 0.95; + text-transform: uppercase; + letter-spacing: 1px; +} + +@keyframes fadeInScale { + from { + opacity: 0; + transform: scale(0.8) translateY(20px); + } + to { + opacity: 1; + transform: scale(1) translateY(0); + } +} + +@media screen and (max-width: 768px) { + .statsGrid { + grid-template-columns: repeat(2, 1fr); + gap: 2rem; + } + + .statValue { + font-size: 2.5rem; + } + + .statLabel { + font-size: 0.875rem; + } 
+} diff --git a/website/src/components/UseCases/index.tsx b/website/src/components/UseCases/index.tsx new file mode 100644 index 0000000..bba03f0 --- /dev/null +++ b/website/src/components/UseCases/index.tsx @@ -0,0 +1,236 @@ +import type {ReactNode} from 'react'; +import {translate} from '@docusaurus/Translate'; +import Translate from '@docusaurus/Translate'; +import Heading from '@theme/Heading'; +import styles from './styles.module.css'; + +interface UseCase { + icon: string; + title: string; + description: string; + before: string[]; + after: string[]; +} + +const useCases: UseCase[] = [ + { + icon: '🤖', + title: translate({ + id: 'component.useCases.aiTraining.title', + message: 'AI Training Platform', + description: 'Title for AI training use case', + }), + description: translate({ + id: 'component.useCases.aiTraining.description', + message: 'Multi-team GPU resource sharing for machine learning workloads', + description: 'Description for AI training use case', + }), + before: [ + translate({ + id: 'component.useCases.aiTraining.before.manual', + message: 'Manual GPU allocation', + description: 'AI training before: manual allocation', + }), + translate({ + id: 'component.useCases.aiTraining.before.noCost', + message: 'No cost visibility', + description: 'AI training before: no cost visibility', + }), + translate({ + id: 'component.useCases.aiTraining.before.conflicts', + message: 'Resource conflicts', + description: 'AI training before: resource conflicts', + }), + ], + after: [ + translate({ + id: 'component.useCases.aiTraining.after.automated', + message: 'Automated scheduling', + description: 'AI training after: automated scheduling', + }), + translate({ + id: 'component.useCases.aiTraining.after.realtime', + message: 'Real-time cost tracking', + description: 'AI training after: real-time cost tracking', + }), + translate({ + id: 'component.useCases.aiTraining.after.fair', + message: 'Fair resource sharing', + description: 'AI training after: fair resource 
sharing', + }), + ], + }, + { + icon: '🏢', + title: translate({ + id: 'component.useCases.enterprise.title', + message: 'Enterprise Cloud', + description: 'Title for enterprise cloud use case', + }), + description: translate({ + id: 'component.useCases.enterprise.description', + message: 'Department-level resource isolation and billing', + description: 'Description for enterprise cloud use case', + }), + before: [ + translate({ + id: 'component.useCases.enterprise.before.chaos', + message: 'Shared cluster chaos', + description: 'Enterprise before: shared cluster chaos', + }), + translate({ + id: 'component.useCases.enterprise.before.noBudget', + message: 'No budget control', + description: 'Enterprise before: no budget control', + }), + translate({ + id: 'component.useCases.enterprise.before.manual', + message: 'Manual reporting', + description: 'Enterprise before: manual reporting', + }), + ], + after: [ + translate({ + id: 'component.useCases.enterprise.after.isolated', + message: 'Isolated tenants', + description: 'Enterprise after: isolated tenants', + }), + translate({ + id: 'component.useCases.enterprise.after.prepaid', + message: 'Prepaid balances', + description: 'Enterprise after: prepaid balances', + }), + translate({ + id: 'component.useCases.enterprise.after.automated', + message: 'Automated reports', + description: 'Enterprise after: automated reports', + }), + ], + }, + { + icon: '💵', + title: translate({ + id: 'component.useCases.billing.title', + message: 'Cost Center Billing', + description: 'Title for cost center billing use case', + }), + description: translate({ + id: 'component.useCases.billing.description', + message: 'Chargeback system for internal GPU resources', + description: 'Description for cost center billing use case', + }), + before: [ + translate({ + id: 'component.useCases.billing.before.excel', + message: 'Excel-based tracking', + description: 'Billing before: Excel-based tracking', + }), + translate({ + id: 
'component.useCases.billing.before.monthly', + message: 'Monthly reconciliation', + description: 'Billing before: monthly reconciliation', + }), + translate({ + id: 'component.useCases.billing.before.disputes', + message: 'Billing disputes', + description: 'Billing before: billing disputes', + }), + ], + after: [ + translate({ + id: 'component.useCases.billing.after.realtime', + message: 'Real-time deduction', + description: 'Billing after: real-time deduction', + }), + translate({ + id: 'component.useCases.billing.after.transparent', + message: 'Transparent pricing', + description: 'Billing after: transparent pricing', + }), + translate({ + id: 'component.useCases.billing.after.automated', + message: 'Automated invoicing', + description: 'Billing after: automated invoicing', + }), + ], + }, +]; + +function ComparisonCard({useCase}: {useCase: UseCase}): ReactNode { + return ( +
+
{useCase.icon}
+ + {useCase.title} + +

{useCase.description}

+ +
+
+
+ + + + Before Bison + + +
+
    + {useCase.before.map((item, i) => ( +
  • + {item} +
  • + ))} +
+
+ +
+ +
+
+ + + + With Bison + + +
+
    + {useCase.after.map((item, i) => ( +
  • + {item} +
  • + ))} +
+
+
+
+ ); +} + +export default function UseCases(): ReactNode { + return ( +
+
+
+ + + Real-World Use Cases + + +

+ + See how Bison transforms GPU resource management across different scenarios + +

+
+ +
+ {useCases.map((useCase, index) => ( + + ))} +
+
+
+ ); +} diff --git a/website/src/components/UseCases/styles.module.css b/website/src/components/UseCases/styles.module.css new file mode 100644 index 0000000..61ea1d3 --- /dev/null +++ b/website/src/components/UseCases/styles.module.css @@ -0,0 +1,207 @@ +.useCasesSection { + padding: 4rem 0; + background: linear-gradient(180deg, #ffffff 0%, #f8f9fa 100%); +} + +[data-theme='dark'] .useCasesSection { + background: linear-gradient(180deg, #0d0d0d 0%, #1a1a1a 100%); +} + +.sectionTitle { + font-size: 2.5rem; + font-weight: 700; + margin-bottom: 1rem; +} + +.sectionSubtitle { + font-size: 1.125rem; + color: #666; + max-width: 700px; + margin: 0 auto; +} + +[data-theme='dark'] .sectionSubtitle { + color: #b3b3b3; +} + +.useCasesGrid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); + gap: 2rem; + margin-top: 3rem; +} + +.useCaseCard { + padding: 2rem; + border-radius: 16px; + background: white; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + border: 1px solid rgba(0, 0, 0, 0.05); + animation: fadeInUp 0.6s ease-out backwards; +} + +[data-theme='dark'] .useCaseCard { + background: rgba(30, 30, 30, 0.8); + border: 1px solid rgba(255, 255, 255, 0.1); +} + +.useCaseCard:nth-child(1) { + animation-delay: 0.1s; +} + +.useCaseCard:nth-child(2) { + animation-delay: 0.2s; +} + +.useCaseCard:nth-child(3) { + animation-delay: 0.3s; +} + +.useCaseCard:hover { + transform: translateY(-8px); + box-shadow: 0 12px 24px rgba(10, 132, 255, 0.1); +} + +.useCaseIcon { + font-size: 3rem; + margin-bottom: 1rem; + display: inline-block; + animation: bounce 2s ease-in-out infinite; +} + +@keyframes bounce { + 0%, 100% { + transform: translateY(0); + } + 50% { + transform: translateY(-10px); + } +} + +.useCaseTitle { + font-size: 1.5rem; + font-weight: 600; + margin-bottom: 0.75rem; +} + +.useCaseDescription { + color: #666; + margin-bottom: 1.5rem; + line-height: 1.6; +} + +[data-theme='dark'] 
.useCaseDescription { + color: #b3b3b3; +} + +.comparison { + display: grid; + grid-template-columns: 1fr auto 1fr; + gap: 1rem; + align-items: start; +} + +.comparisonColumn { + min-width: 0; +} + +.comparisonHeader { + display: flex; + align-items: center; + gap: 0.5rem; + font-weight: 600; + margin-bottom: 0.75rem; + font-size: 0.875rem; +} + +.crossIcon { + font-size: 1rem; +} + +.checkIcon { + font-size: 1rem; +} + +.comparisonDivider { + font-size: 1.5rem; + color: #0A84FF; + font-weight: 700; + padding-top: 1.5rem; +} + +.comparisonList { + list-style: none; + padding: 0; + margin: 0; +} + +.comparisonList li { + padding: 0.5rem 0; + font-size: 0.875rem; + position: relative; + padding-left: 1.25rem; +} + +.comparisonList li::before { + content: ''; + position: absolute; + left: 0; + top: 0.875rem; + width: 6px; + height: 6px; + border-radius: 50%; +} + +.comparisonItemBefore { + color: #999; +} + +.comparisonItemBefore::before { + background: #ff4444; +} + +.comparisonItemAfter { + color: #333; + font-weight: 500; +} + +[data-theme='dark'] .comparisonItemAfter { + color: #ffffff; +} + +.comparisonItemAfter::before { + background: #00c853; +} + +@keyframes fadeInUp { + from { + opacity: 0; + transform: translateY(30px); + } + to { + opacity: 1; + transform: translateY(0); + } +} + +@media screen and (max-width: 768px) { + .useCasesGrid { + grid-template-columns: 1fr; + } + + .comparison { + grid-template-columns: 1fr; + gap: 1.5rem; + } + + .comparisonDivider { + text-align: center; + padding-top: 0; + transform: rotate(90deg); + } + + .sectionTitle { + font-size: 2rem; + } +} diff --git a/website/src/css/custom.css b/website/src/css/custom.css index 8df8ae0..a9ab623 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -127,3 +127,146 @@ button:focus, outline-offset: 2px; border-radius: 4px; } + +/* ==================== Additional Modern Effects ==================== */ + +/* Smooth scroll behavior */ +html { + scroll-behavior: smooth; 
+} + +/* Page transitions */ +main { + animation: pageEnter 0.5s ease-out; +} + +@keyframes pageEnter { + from { + opacity: 0; + transform: translateY(20px); + } + to { + opacity: 1; + transform: translateY(0); + } +} + +/* Enhanced button styles */ +.button--primary, +.button--secondary { + font-weight: 600; + letter-spacing: 0.3px; + position: relative; + overflow: hidden; +} + +.button--primary::before, +.button--secondary::before { + content: ''; + position: absolute; + top: 50%; + left: 50%; + width: 0; + height: 0; + border-radius: 50%; + background: rgba(255, 255, 255, 0.2); + transform: translate(-50%, -50%); + transition: width 0.6s, height 0.6s; +} + +.button--primary:hover::before, +.button--secondary:hover::before { + width: 300px; + height: 300px; +} + +/* Card hover glow effect */ +.card:hover { + box-shadow: + 0 4px 20px rgba(10, 132, 255, 0.15), + 0 0 0 1px rgba(10, 132, 255, 0.1); +} + +/* Improved text selection */ +::selection { + background: rgba(10, 132, 255, 0.3); + color: inherit; +} + +/* Loading shimmer effect */ +@keyframes shimmer { + 0% { + background-position: -1000px 0; + } + 100% { + background-position: 1000px 0; + } +} + +/* Pulse animation for important elements */ +@keyframes pulse { + 0%, 100% { + opacity: 1; + } + 50% { + opacity: 0.8; + } +} + +/* Gradient text effect */ +.gradient-text { + background: linear-gradient(135deg, #0A84FF, #5E5CE6, #BF5AF2); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +/* Scrollbar styling */ +::-webkit-scrollbar { + width: 10px; + height: 10px; +} + +::-webkit-scrollbar-track { + background: rgba(0, 0, 0, 0.05); +} + +::-webkit-scrollbar-thumb { + background: rgba(10, 132, 255, 0.3); + border-radius: 5px; +} + +::-webkit-scrollbar-thumb:hover { + background: rgba(10, 132, 255, 0.5); +} + +[data-theme='dark'] ::-webkit-scrollbar-track { + background: rgba(255, 255, 255, 0.05); +} + +[data-theme='dark'] ::-webkit-scrollbar-thumb { + 
background: rgba(10, 132, 255, 0.4); +} + +[data-theme='dark'] ::-webkit-scrollbar-thumb:hover { + background: rgba(10, 132, 255, 0.6); +} + +/* Performance optimization hints */ +.hero, +.card, +button, +a { + will-change: transform; +} + +/* Reduce motion for accessibility */ +@media (prefers-reduced-motion: reduce) { + *, + *::before, + *::after { + animation-duration: 0.01ms !important; + animation-iteration-count: 1 !important; + transition-duration: 0.01ms !important; + } +} diff --git a/website/src/pages/index.module.css b/website/src/pages/index.module.css index 9f71a5d..95cdd6c 100644 --- a/website/src/pages/index.module.css +++ b/website/src/pages/index.module.css @@ -8,11 +8,51 @@ text-align: center; position: relative; overflow: hidden; + background: linear-gradient(135deg, #0A84FF 0%, #5E5CE6 50%, #BF5AF2 100%); + background-size: 400% 400%; + animation: gradientShift 15s ease infinite; + min-height: 500px; + display: flex; + align-items: center; +} + +.heroBanner::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: radial-gradient(circle at 20% 50%, rgba(255, 255, 255, 0.1) 0%, transparent 50%), + radial-gradient(circle at 80% 80%, rgba(255, 255, 255, 0.1) 0%, transparent 50%); + animation: pulse 8s ease-in-out infinite; +} + +@keyframes gradientShift { + 0% { + background-position: 0% 50%; + } + 50% { + background-position: 100% 50%; + } + 100% { + background-position: 0% 50%; + } +} + +@keyframes pulse { + 0%, 100% { + opacity: 0.5; + } + 50% { + opacity: 1; + } } @media screen and (max-width: 996px) { .heroBanner { padding: 2rem; + min-height: 400px; } } @@ -20,4 +60,32 @@ display: flex; align-items: center; justify-content: center; + gap: 1rem; + margin-top: 2rem; + position: relative; + z-index: 10; +} + +.buttons a { + animation: fadeInUp 0.8s ease-out; + animation-fill-mode: both; +} + +.buttons a:nth-child(1) { + animation-delay: 0.2s; +} + +.buttons a:nth-child(2) { + animation-delay: 
0.4s; +} + +@keyframes fadeInUp { + from { + opacity: 0; + transform: translateY(30px); + } + to { + opacity: 1; + transform: translateY(0); + } } diff --git a/website/src/pages/index.tsx b/website/src/pages/index.tsx index 6b7ad24..4a28ad8 100644 --- a/website/src/pages/index.tsx +++ b/website/src/pages/index.tsx @@ -5,6 +5,10 @@ import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import useBaseUrl from '@docusaurus/useBaseUrl'; import Layout from '@theme/Layout'; import HomepageFeatures from '@site/src/components/HomepageFeatures'; +import ParticleBackground from '@site/src/components/ParticleBackground'; +import StatsSection from '@site/src/components/StatsSection'; +import ArchitectureDiagram from '@site/src/components/ArchitectureDiagram'; +import UseCases from '@site/src/components/UseCases'; import Heading from '@theme/Heading'; import styles from './index.module.css'; @@ -13,26 +17,43 @@ function HomepageHeader() { const {siteConfig} = useDocusaurusContext(); return (
-
+ +
Bison Logo - + {siteConfig.title} -

{siteConfig.tagline}

+

+ {siteConfig.tagline} +

- + Get Started 🚀 + to="https://github.com/SuperMarioYL/Bison"> GitHub ⭐
@@ -49,7 +70,10 @@ export default function Home(): ReactNode { description="Enterprise GPU resource billing and multi-tenant management platform based on Kubernetes, Capsule, and OpenCost">
+ + +
); From 094fbe43913acea824c317a152198dafaf99db2f Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 22:05:10 +0800 Subject: [PATCH 11/44] fix bug --- website/DOMAIN_SETUP.md | 119 ++++++++++++++++++ website/docusaurus.config.ts | 4 +- website/i18n/zh-Hans/code.json | 29 +++-- .../current/intro.md | 62 +++++++++ .../version-0.0.1.json | 14 +++ .../docusaurus-theme-classic/footer.json | 2 +- website/src/pages/index.tsx | 15 ++- website/static/CNAME | 1 + 8 files changed, 228 insertions(+), 18 deletions(-) create mode 100644 website/DOMAIN_SETUP.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/intro.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/version-0.0.1.json create mode 100644 website/static/CNAME diff --git a/website/DOMAIN_SETUP.md b/website/DOMAIN_SETUP.md new file mode 100644 index 0000000..cd6fc19 --- /dev/null +++ b/website/DOMAIN_SETUP.md @@ -0,0 +1,119 @@ +# 自定义域名配置指南 + +本文档说明如何为 Bison 文档站点配置自定义域名 `bison.lei6393.com`。 + +## 1. DNS 配置 + +在你的 DNS 服务商(lei6393.com 的域名注册商)添加以下 DNS 记录: + +### 方式一:使用 CNAME(推荐) + +``` +类型: CNAME +主机记录: bison +记录值: supermarioyl.github.io +TTL: 600 (或默认值) +``` + +### 方式二:使用 A 记录 + +如果 DNS 服务商不支持 CNAME,可以使用 A 记录指向 GitHub Pages 的 IP: + +``` +类型: A +主机记录: bison +记录值: 185.199.108.153 +TTL: 600 + +重复添加以下 IP: +185.199.109.153 +185.199.110.153 +185.199.111.153 +``` + +## 2. GitHub 仓库设置 + +1. 进入 GitHub 仓库: https://github.com/SuperMarioYL/Bison +2. 点击 **Settings** > **Pages** +3. 在 **Custom domain** 输入框中填写: `bison.lei6393.com` +4. 勾选 **Enforce HTTPS** (DNS 生效后) +5. 点击 **Save** + +## 3. 
验证配置 + +### 检查 DNS 解析 + +```bash +# 检查 CNAME 记录 +dig bison.lei6393.com CNAME +short +# 应该返回: supermarioyl.github.io + +# 检查 A 记录 +dig bison.lei6393.com A +short +# 应该返回 GitHub Pages 的 IP 地址 +``` + +### 测试网站访问 + +DNS 生效后(通常 5-30 分钟),访问: + +- 主域名: https://bison.lei6393.com +- 中文版: https://bison.lei6393.com/zh-Hans/ +- 文档: https://bison.lei6393.com/docs/ + +### 旧 GitHub Pages URL 重定向 + +GitHub 会自动将以下 URL 重定向到新域名: +- https://supermarioyl.github.io/Bison/ → https://bison.lei6393.com/ + +## 4. 本地开发 + +本地开发时仍然使用 `npm start`,会在 `http://localhost:3001/` 运行(注意不再有 `/Bison/` 路径)。 + +## 5. 部署 + +自定义域名配置已包含在代码中: + +- ✅ `website/static/CNAME` - 包含域名配置 +- ✅ `website/docusaurus.config.ts` - URL 和 baseUrl 已更新 + +每次 `npm run build` 构建时,CNAME 文件会自动复制到 `build/` 目录。 + +部署到 GitHub Pages: + +```bash +cd website +npm run deploy +``` + +## 常见问题 + +### Q: DNS 配置后多久生效? +A: 通常 5-30 分钟,最长可能需要 48 小时。 + +### Q: HTTPS 证书如何配置? +A: GitHub Pages 会在 DNS 生效后自动生成和配置 Let's Encrypt 证书,无需手动操作。 + +### Q: 为什么选择 bison.lei6393.com 而不是 www.lei6393.com? +A: +- 语义清晰,专门用于 Bison 项目 +- 便于未来扩展其他子域名项目 +- 符合企业级开源项目的最佳实践 + +### Q: 旧的 GitHub Pages 链接还能用吗? +A: 可以,GitHub 会自动重定向到新域名。 + +## 技术细节 + +当前配置: + +```typescript +// docusaurus.config.ts +url: "https://bison.lei6393.com", +baseUrl: "/", +``` + +这意味着: +- 所有链接使用根路径 `/` 而不是 `/Bison/` +- 中文版访问路径: `/zh-Hans/` (不再是 `/Bison/zh-Hans/`) +- 文档路径: `/docs/` (不再是 `/Bison/docs/`) diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts index 4b6b1d6..37ca0de 100644 --- a/website/docusaurus.config.ts +++ b/website/docusaurus.config.ts @@ -15,10 +15,10 @@ const config: Config = { }, // Set the production url of your site here - url: "https://supermarioyl.github.io", + url: "https://bison.lei6393.com", // Set the /<baseUrl>/ pathname under which your site is served // For GitHub pages deployment, it is often '/<projectName>/' - baseUrl: "/Bison/", + baseUrl: "/", // GitHub pages deployment config. // If you aren't using GitHub pages, you don't need these. 
diff --git a/website/i18n/zh-Hans/code.json b/website/i18n/zh-Hans/code.json index e49f43e..027b01f 100644 --- a/website/i18n/zh-Hans/code.json +++ b/website/i18n/zh-Hans/code.json @@ -343,12 +343,10 @@ "description": "Label for cost savings statistic" }, "component.architectureDiagram.title": { - "message": "架构概览", - "description": "Architecture diagram section title" + "message": "架构概览" }, "component.architectureDiagram.subtitle": { - "message": "基于云原生技术构建,具备可扩展性和可靠性", - "description": "Architecture diagram section subtitle" + "message": "基于云原生技术构建,具备可扩展性和可靠性" }, "component.architectureDiagram.node.bison": { "message": "GPU 计费与调度平台", @@ -371,20 +369,16 @@ "description": "Description for Prometheus node" }, "component.useCases.title": { - "message": "真实应用场景", - "description": "Use cases section title" + "message": "真实应用场景" }, "component.useCases.subtitle": { - "message": "了解 Bison 如何在不同场景下转变 GPU 资源管理", - "description": "Use cases section subtitle" + "message": "了解 Bison 如何在不同场景下转变 GPU 资源管理" }, "component.useCases.beforeBison": { - "message": "使用前", - "description": "Before Bison comparison header" + "message": "使用前" }, "component.useCases.withBison": { - "message": "使用后", - "description": "With Bison comparison header" + "message": "使用后" }, "component.useCases.aiTraining.title": { "message": "AI 训练平台", @@ -481,5 +475,16 @@ "component.useCases.billing.after.automated": { "message": "自动开票", "description": "Billing after: automated invoicing" + }, + "homepage.getStarted": { + "message": "快速开始" + }, + "homepage.title": { + "message": "Bison - Kubernetes GPU 资源计费与多租户管理", + "description": "Homepage title" + }, + "homepage.description": { + "message": "基于 Kubernetes、Capsule 和 OpenCost 的企业级 GPU 资源计费和多租户管理平台", + "description": "Homepage meta description" } } diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/intro.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/intro.md new file mode 100644 index 0000000..1a9216b --- /dev/null +++ 
b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/intro.md @@ -0,0 +1,62 @@ +--- +sidebar_position: 1 +--- + +# 简介 + +让我们**在不到 5 分钟内**了解 **Bison**。 + +## 开始使用 + +通过**创建新集群**或**添加 Bison 到现有 Kubernetes 集群**开始使用。 + +### 您需要什么 + +- [Kubernetes](https://kubernetes.io/) 版本 1.22 或更高: + - 运行中的 Kubernetes 集群 + - 已配置 kubectl 访问 +- [Helm](https://helm.sh/) 版本 3.x 或更高 +- [Capsule](https://capsule.clastix.io/) 用于多租户管理 +- [OpenCost](https://www.opencost.io/) 用于成本追踪 +- [Prometheus](https://prometheus.io/) 用于指标收集 + +## 安装 Bison + +使用 Helm 在您的 Kubernetes 集群中安装 Bison: + +```bash +# 添加 Bison Helm 仓库 +helm repo add bison https://supermarioyl.github.io/Bison/ +helm repo update + +# 安装 Bison +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system:9003 +``` + +## 配置您的第一个租户 + +安装完成后,创建您的第一个租户(团队): + +```bash +kubectl apply -f - <
- Get Started 🚀 + Get Started 🚀 + title={translate({ + id: 'homepage.title', + message: 'Bison - Kubernetes GPU Resource Billing & Multi-Tenant Management', + description: 'Homepage title', + })} + description={translate({ + id: 'homepage.description', + message: 'Enterprise GPU resource billing and multi-tenant management platform based on Kubernetes, Capsule, and OpenCost', + description: 'Homepage meta description', + })}>
diff --git a/website/static/CNAME b/website/static/CNAME new file mode 100644 index 0000000..ba638fd --- /dev/null +++ b/website/static/CNAME @@ -0,0 +1 @@ +bison.lei6393.com From 5b382c20055ed546a1a623245e70c70091bbbe9c Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 22:10:55 +0800 Subject: [PATCH 12/44] fix bug --- .github/workflows/cleanup-docs-version.yml | 3 ++- .github/workflows/version-docs.yml | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cleanup-docs-version.yml b/.github/workflows/cleanup-docs-version.yml index 6161df5..6c5473e 100644 --- a/.github/workflows/cleanup-docs-version.yml +++ b/.github/workflows/cleanup-docs-version.yml @@ -21,6 +21,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + ref: main - name: Extract version from tag id: version @@ -108,7 +109,7 @@ jobs: - Deleted versioned_docs/version-${VERSION}/ - Deleted versioned_sidebars/version-${VERSION}-sidebars.json" - git push origin main + git push origin HEAD:main echo "✅ Changes pushed to main branch" echo "📚 Documentation version ${VERSION} has been removed" diff --git a/.github/workflows/version-docs.yml b/.github/workflows/version-docs.yml index b135c2a..1314da3 100644 --- a/.github/workflows/version-docs.yml +++ b/.github/workflows/version-docs.yml @@ -21,6 +21,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + ref: main - name: Extract version from tag id: version @@ -93,10 +94,10 @@ jobs: - Created versioned_docs/version-${VERSION}/ - Created versioned_sidebars/version-${VERSION}-sidebars.json" - git push origin main + git push origin HEAD:main echo "✅ Changes pushed to main branch" - echo "📚 Documentation will be available at: https://supermarioyl.github.io/Bison/docs/${VERSION}/" + echo "📚 Documentation will be available at: https://bison.lei6393.com/docs/${VERSION}/" - name: Summary run: | @@ -114,5 +115,5 @@ jobs: else echo "✅ **Status**: Documentation version created successfully" >> 
$GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "📚 **View Documentation**: [https://supermarioyl.github.io/Bison/docs/${VERSION}/](https://supermarioyl.github.io/Bison/docs/${VERSION}/)" >> $GITHUB_STEP_SUMMARY + echo "📚 **View Documentation**: [https://bison.lei6393.com/docs/${VERSION}/](https://bison.lei6393.com/docs/${VERSION}/)" >> $GITHUB_STEP_SUMMARY fi From 61ba8127e14acd7caa3c9ab2293335c3527a0f64 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 28 Dec 2025 14:11:59 +0000 Subject: [PATCH 13/44] docs: add version 0.0.6 [skip ci] Auto-generated documentation version from release refs/tags/v0.0.6 - Added version 0.0.6 to versions.json - Created versioned_docs/version-0.0.6/ - Created versioned_sidebars/version-0.0.6-sidebars.json --- .../version-0.0.6/architecture.md | 418 ++++++++++++++++++ .../version-0.0.6/configuration.md | 361 +++++++++++++++ .../versioned_docs/version-0.0.6/features.md | 195 ++++++++ .../version-0.0.6/installation.md | 316 +++++++++++++ website/versioned_docs/version-0.0.6/intro.md | 167 +++++++ .../version-0.0.6/user-guides/_category_.json | 8 + .../version-0.0.6/user-guides/admin.md | 176 ++++++++ .../version-0.0.6/user-guides/developer.md | 187 ++++++++ .../version-0.0.6/user-guides/team-leader.md | 126 ++++++ .../version-0.0.6-sidebars.json | 8 + website/versions.json | 1 + 11 files changed, 1963 insertions(+) create mode 100644 website/versioned_docs/version-0.0.6/architecture.md create mode 100644 website/versioned_docs/version-0.0.6/configuration.md create mode 100644 website/versioned_docs/version-0.0.6/features.md create mode 100644 website/versioned_docs/version-0.0.6/installation.md create mode 100644 website/versioned_docs/version-0.0.6/intro.md create mode 100644 website/versioned_docs/version-0.0.6/user-guides/_category_.json create mode 100644 website/versioned_docs/version-0.0.6/user-guides/admin.md create mode 100644 website/versioned_docs/version-0.0.6/user-guides/developer.md create mode 
100644 website/versioned_docs/version-0.0.6/user-guides/team-leader.md create mode 100644 website/versioned_sidebars/version-0.0.6-sidebars.json diff --git a/website/versioned_docs/version-0.0.6/architecture.md b/website/versioned_docs/version-0.0.6/architecture.md new file mode 100644 index 0000000..dbbe50d --- /dev/null +++ b/website/versioned_docs/version-0.0.6/architecture.md @@ -0,0 +1,418 @@ +--- +sidebar_position: 5 +--- + +# Architecture + +This document provides a technical overview of Bison's architecture, designed with high cohesion and low coupling principles for maintainability and scalability. + +## System Overview + +### High-Level Architecture + +```mermaid +graph TB + subgraph PRESENT[Presentation Layer] + WEB[Web UI<br/>React 18 + Ant Design 5] + CLI[kubectl / API Client] + end + + subgraph GATEWAY[API Gateway Layer] + GW[API Server<br/>Go + Gin Framework] + AUTH[Auth Middleware<br/>JWT + OIDC] + end + + subgraph BUSINESS[Business Logic Layer] + TS[Tenant Service<br/>Team & Project CRUD] + BS[Billing Service<br/>Cost Calculation] + BLS[Balance Service<br/>Wallet Management] + QS[Quota Service<br/>Resource Limits] + AS[Alert Service<br/>Notifications] + RS[Report Service<br/>Analytics] + end + + subgraph INTEGRATION[Integration Layer] + K8S[Kubernetes Client<br/>client-go] + OCC[OpenCost Client<br/>REST API] + PC[Prometheus Client<br/>PromQL] + end + + subgraph EXTERNAL[External Systems] + KAPI[Kubernetes API] + CAP[Capsule Controller] + OC[OpenCost] + PROM[Prometheus] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps<br/>Persistent Storage] + end + + WEB --> GW + CLI --> GW + GW --> AUTH + AUTH --> TS & BS & BLS & QS & AS & RS + + TS --> K8S + BS --> OCC + BLS --> K8S + QS --> K8S + RS --> OCC & PC + + K8S --> KAPI + K8S --> CAP + OCC --> OC + PC --> PROM + + TS & BLS --> CM + KAPI --> CM +``` + +### Design Principles + +| Principle | Implementation | +|-----------|----------------| +| **High Cohesion** | Each service handles a single domain (billing, quota, alerts) | +| **Low Coupling** | Services communicate via well-defined interfaces | +| **Stateless API** | All state persisted in Kubernetes ConfigMaps | +| **Cloud Native** | Leverages Kubernetes primitives for HA and scaling | +| **Zero Database** | ConfigMaps eliminate external database dependencies | + +## Architecture Layers + +Bison follows a layered architecture pattern: + +### 1. Presentation Layer +- **Web UI**: React 18 + TypeScript + Ant Design 5 +- **API Client**: REST API for external integrations + +### 2. API Gateway Layer +- **API Server**: Go + Gin framework +- **Authentication**: JWT and OIDC support +- **Middleware**: Logging, recovery, CORS handling + +### 3. Business Logic Layer +- **Tenant Service**: Team and project management +- **Billing Service**: Cost calculation and aggregation +- **Balance Service**: Wallet management and auto-deduction +- **Quota Service**: Resource limit enforcement +- **Alert Service**: Multi-channel notifications +- **Report Service**: Analytics and export + +### 4. Integration Layer +- **Kubernetes Client**: client-go for K8s API interaction +- **OpenCost Client**: REST API for cost data +- **Prometheus Client**: PromQL queries for metrics + +### 5. 
Data Layer +- **ConfigMaps**: Persistent storage for balances, billing config, and metadata +- **etcd**: Backing store via Kubernetes ConfigMaps + +## Core Components + +### API Server + +The API server is the central component that handles all HTTP requests: + +**Technology Stack:** +- Go 1.24+ +- Gin web framework +- client-go for Kubernetes API + +**Key Responsibilities:** +- Serve REST API endpoints +- Authentication and authorization +- Request routing and middleware +- Background task scheduling + +**Endpoints:** +``` +/api/v1/teams - Team management +/api/v1/projects - Project management +/api/v1/billing - Billing configuration +/api/v1/balance - Balance operations +/api/v1/stats - Statistics and reports +``` + +### Web UI + +React-based single-page application: + +**Technology Stack:** +- React 18 +- TypeScript +- Vite (build tool) +- Ant Design 5 +- ECharts (visualization) +- React Query (state management) + +**Features:** +- Dashboard with real-time metrics +- Team and project management +- Billing configuration +- Balance monitoring +- Usage reports and export + +### Billing Service + +Calculates costs based on resource usage: + +**Data Flow:** +```mermaid +sequenceDiagram + participant S as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant K8S as Kubernetes + + S->>BS: Trigger billing (every 10min) + BS->>OC: Query team costs + OC-->>BS: Return usage data + BS->>BS: Calculate cost + BS->>BLS: Deduct from balance + BLS->>K8S: Update ConfigMap + BLS-->>BS: Confirm + BS->>AS: Check threshold + alt Balance low + AS->>Webhook: Send alert + end +``` + +**Billing Formula:** +``` +Total Cost = (CPU_cores × CPU_price × hours) + + (Memory_GB × Memory_price × hours) + + (GPU_count × GPU_price × hours) +``` + +### Balance Service + +Manages team wallets and auto-deduction: + +**Features:** +- Real-time balance tracking +- Auto-deduction based on usage +- Recharge operations +- Transaction 
history +- Auto-suspension when balance depleted + +**Storage:** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: bison-team-balances + namespace: bison-system +data: + ml-team: "1523.45" + data-team: "890.12" + dev-team: "2100.00" +``` + +### Tenant Service + +Manages teams (Capsule Tenants) and projects (Namespaces): + +**Features:** +- Create/delete teams +- Assign resource quotas +- Configure node pools (shared/exclusive) +- Manage team metadata + +**Capsule Integration:** +```yaml +apiVersion: capsule.clastix.io/v1beta1 +kind: Tenant +metadata: + name: ml-team +spec: + owners: + - name: team-leader + kind: User + resourceQuota: + items: + - hard: + cpu: "20" + memory: 64Gi + nvidia.com/gpu: "4" +``` + +## Data Flow + +### Team Creation Flow + +```mermaid +sequenceDiagram + participant U as Admin + participant API as API Server + participant TS as Tenant Service + participant K8S as Kubernetes + participant CAP as Capsule + + U->>API: POST /api/v1/teams + API->>TS: CreateTeam(name, quota, balance) + TS->>CAP: Create Tenant + CAP-->>TS: Tenant created + TS->>K8S: Create ConfigMap (balance) + K8S-->>TS: ConfigMap created + TS-->>API: Success + API-->>U: 201 Created +``` + +### Billing Cycle Flow + +```mermaid +sequenceDiagram + participant SCH as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant AS as Alert Service + + loop Every 10 minutes + SCH->>BS: Trigger billing calculation + BS->>OC: Query costs (last 10min) + OC-->>BS: Return usage metrics + BS->>BS: Calculate total cost + BS->>BLS: Deduct cost from balance + BLS->>BLS: Update balance + BLS-->>BS: Balance updated + BS->>AS: Check balance threshold + alt Balance < 20% + AS->>AS: Send low balance alert + end + alt Balance <= 0 + AS->>BS: Suspend team workloads + end + end +``` + +## Integration Points + +### Kubernetes Integration + +Bison integrates deeply with Kubernetes: + +- **Capsule Tenants** for multi-tenancy +- 
**ResourceQuotas** for limit enforcement +- **Namespaces** for project isolation +- **ConfigMaps** for data persistence +- **RBAC** for access control + +### OpenCost Integration + +Real-time cost tracking via OpenCost API: + +```bash +# Query team costs +GET /allocation/compute?window=10m&aggregate=namespace&filter=namespace:ml-team + +# Response +{ + "ml-team": { + "cpuCost": 0.25, + "memCost": 0.10, + "gpuCost": 4.17, + "totalCost": 4.52 + } +} +``` + +### Prometheus Integration + +Metrics collection for monitoring: + +- Resource utilization metrics +- Cost metrics +- Balance metrics +- Alert metrics + +## Deployment Architecture + +### High Availability Setup + +```mermaid +graph TB + subgraph K8S[Kubernetes Cluster] + subgraph NS1[bison-system namespace] + API1[API Server Pod 1] + API2[API Server Pod 2] + WEB1[Web UI Pod 1] + WEB2[Web UI Pod 2] + end + + subgraph NS2[opencost-system namespace] + OC[OpenCost] + end + + subgraph NS3[prometheus-system namespace] + PROM[Prometheus] + end + + LB[LoadBalancer] + end + + LB --> API1 & API2 + LB --> WEB1 & WEB2 + API1 & API2 --> OC + OC --> PROM +``` + +### Resource Requirements + +**Minimum:** +- API Server: 200m CPU, 256Mi Memory +- Web UI: 100m CPU, 128Mi Memory + +**Recommended (Production):** +- API Server: 1000m CPU, 512Mi Memory (2 replicas) +- Web UI: 500m CPU, 256Mi Memory (2 replicas) + +## Security Model + +### Authentication +- JWT token-based authentication +- OIDC/SSO integration +- Admin user management + +### Authorization +- Kubernetes RBAC integration +- Role-based access control +- Team-scoped permissions + +### Data Security +- All data encrypted at rest (etcd encryption) +- TLS for API communication +- Secret management via Kubernetes Secrets + +## Technology Stack + +### Backend +- **Language**: Go 1.24+ +- **Framework**: Gin +- **Kubernetes Client**: client-go +- **Configuration**: Viper +- **Logging**: Logrus + +### Frontend +- **Framework**: React 18 +- **Language**: TypeScript +- **UI 
Library**: Ant Design 5 +- **Build Tool**: Vite +- **State Management**: React Query +- **Charts**: ECharts + +### Infrastructure +- **Platform**: Kubernetes 1.22+ +- **Multi-Tenancy**: Capsule +- **Cost Tracking**: OpenCost +- **Metrics**: Prometheus +- **Storage**: ConfigMaps (etcd) + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison +- [Configuration](configuration.md) - Configure billing +- [User Guides](user-guides/admin.md) - Learn to use Bison +- [Features](features.md) - Explore capabilities diff --git a/website/versioned_docs/version-0.0.6/configuration.md b/website/versioned_docs/version-0.0.6/configuration.md new file mode 100644 index 0000000..539aaa3 --- /dev/null +++ b/website/versioned_docs/version-0.0.6/configuration.md @@ -0,0 +1,361 @@ +--- +sidebar_position: 6 +--- + +# Configuration + +This guide covers how to configure Bison for your specific environment and requirements. + +## Helm Chart Configuration + +Bison is configured primarily through Helm values. You can customize the installation by providing a `values.yaml` file or using `--set` flags. 
+ +### Key Configuration Parameters + +| Parameter | Description | Default | Example | +|-----------|-------------|---------|---------| +| `auth.enabled` | Enable authentication | `false` | `true` | +| `auth.admin.username` | Admin username | `admin` | `admin` | +| `auth.admin.password` | Admin password | `admin` | `changeme` | +| `apiServer.replicaCount` | API server replicas | `2` | `3` | +| `apiServer.image.repository` | API server image | `ghcr.io/supermarioyl/bison/api-server` | - | +| `apiServer.image.tag` | API server image tag | `0.0.1` | `latest` | +| `webUI.replicaCount` | Web UI replicas | `2` | `3` | +| `webUI.image.repository` | Web UI image | `ghcr.io/supermarioyl/bison/web-ui` | - | +| `webUI.image.tag` | Web UI image tag | `0.0.1` | `latest` | +| `opencost.url` | OpenCost API endpoint | `http://opencost.opencost-system.svc:9003` | Custom URL | + +### Example Custom Values + +Create a `custom-values.yaml` file: + +```yaml +# Authentication +auth: + enabled: true + admin: + username: admin + password: MySecurePassword123 + +# API Server +apiServer: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 512Mi + +# Web UI +webUI: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +# OpenCost Integration +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Node Selection (optional) +nodeSelector: + node-role.kubernetes.io/control-plane: "" + +# Tolerations (optional) +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule +``` + +Install with custom values: + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --values custom-values.yaml +``` + +## Billing Configuration + +Billing settings are configured through the Web UI or API after installation. + +### Access Billing Configuration + +1. 
**Via Web UI:** + - Navigate to **Settings** > **Billing Configuration** + - Set pricing for CPU, Memory, GPU, and other resources + - Configure currency and billing intervals + +2. **Via API:** + ```bash + curl -X POST http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50 + }, + "billingInterval": "hourly" + }' + ``` + +### Billing Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `enabled` | Enable/disable billing | `true` | +| `currency` | Currency for billing | `USD`, `CNY`, `EUR` | +| `pricing.cpu` | CPU price per core-hour | `0.05` | +| `pricing.memory` | Memory price per GB-hour | `0.01` | +| `pricing["nvidia.com/gpu"]` | GPU price per GPU-hour | `2.50` | +| `billingInterval` | Billing aggregation period | `hourly`, `daily` | +| `lowBalanceThreshold` | Warning threshold (%) | `20` | +| `suspendThreshold` | Auto-suspend threshold (%) | `5` | + +### Example Billing Configuration + +```json +{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "nvidia.com/mig-1g.5gb": 0.50, + "nvidia.com/mig-2g.10gb": 1.00 + }, + "billingInterval": "hourly", + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Team Configuration + +### Creating Teams + +Teams can be created through the Web UI or API: + +**Via Web UI:** +1. Navigate to **Teams** page +2. Click **Create Team** +3. 
Set team name, quota, and initial balance + +**Via API:** +```bash +curl -X POST http://localhost:8080/api/v1/teams \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-team", + "description": "Machine Learning Team", + "quota": { + "cpu": "20", + "memory": "64Gi", + "nvidia.com/gpu": "4" + }, + "balance": 1000.00 + }' +``` + +### Team Quotas + +Team quotas define resource limits: + +```yaml +quota: + cpu: "20" # 20 CPU cores + memory: "64Gi" # 64 GB RAM + nvidia.com/gpu: "4" # 4 GPUs + storage: "500Gi" # 500 GB storage +``` + +### Team Balance Management + +Set initial balance and configure auto-recharge: + +```json +{ + "balance": 1000.00, + "autoRecharge": { + "enabled": true, + "amount": 500.00, + "schedule": "monthly", + "threshold": 100.00 + } +} +``` + +## Alert Configuration + +Configure multi-channel alerts for low balance and quota warnings. + +### Webhook Alerts + +```json +{ + "type": "webhook", + "enabled": true, + "url": "https://your-webhook-endpoint.com/alerts", + "headers": { + "Authorization": "Bearer YOUR_TOKEN" + }, + "template": { + "title": "Bison Alert", + "message": "Team {{.TeamName}} balance is {{.Balance}}" + } +} +``` + +### DingTalk Alerts + +```json +{ + "type": "dingtalk", + "enabled": true, + "webhook": "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN", + "secret": "YOUR_SECRET" +} +``` + +### WeChat Work Alerts + +```json +{ + "type": "wechat", + "enabled": true, + "corpid": "YOUR_CORP_ID", + "corpsecret": "YOUR_CORP_SECRET", + "agentid": 1000001 +} +``` + +## OpenCost Integration + +Configure OpenCost connection: + +### Check OpenCost Connectivity + +```bash +# Test OpenCost API +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz + +# Test allocation API +curl http://localhost:9003/allocation/compute?window=1d +``` + +### Update OpenCost URL + +If OpenCost is deployed in a different namespace or with a different service name: + +```bash +helm upgrade bison 
bison/bison \ + --set opencost.url=http://my-opencost.custom-namespace.svc:9003 \ + --namespace bison-system +``` + +## Authentication & OIDC + +Enable authentication and integrate with your SSO provider: + +### Basic Authentication + +```yaml +auth: + enabled: true + admin: + username: admin + password: SecurePassword123 +``` + +### OIDC Integration + +```yaml +auth: + enabled: true + oidc: + enabled: true + issuerURL: https://your-oidc-provider.com + clientID: bison-client-id + clientSecret: your-client-secret + redirectURL: https://bison.example.com/callback +``` + +## Environment Variables + +Additional configuration can be provided via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `KUBECONFIG` | Path to kubeconfig file | In-cluster config | +| `OPENCOST_URL` | OpenCost API URL | `http://opencost.opencost-system.svc:9003` | +| `AUTH_ENABLED` | Enable authentication | `false` | +| `LOG_LEVEL` | Logging level | `info` | +| `BILLING_INTERVAL` | Billing calculation interval | `10m` | + +Set environment variables in Helm values: + +```yaml +apiServer: + env: + - name: LOG_LEVEL + value: debug + - name: BILLING_INTERVAL + value: 5m +``` + +## Advanced Configuration + +### Custom Resource Pricing + +Price any Kubernetes resource: + +```json +{ + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "amd.com/gpu": 2.00, + "ephemeral-storage": 0.001, + "custom.io/fpga": 5.00 + } +} +``` + +### Multi-Cluster Support + +Deploy Bison in each cluster with shared billing: + +```yaml +# Cluster A +apiServer: + clusterName: prod-us-west + +# Cluster B +apiServer: + clusterName: prod-us-east +``` + +## Next Steps + +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design +- [Features](features.md) - Explore all capabilities diff --git a/website/versioned_docs/version-0.0.6/features.md 
b/website/versioned_docs/version-0.0.6/features.md new file mode 100644 index 0000000..8d0c438 --- /dev/null +++ b/website/versioned_docs/version-0.0.6/features.md @@ -0,0 +1,195 @@ +--- +sidebar_position: 2 +--- + +# Features + +Bison provides a comprehensive suite of features for GPU resource management, billing, and multi-tenant isolation in Kubernetes environments. + +## See Bison in Action + +### 🎯 Real-Time Resource Dashboard + +![Bison Dashboard](/img/ui-dashboard.png) + +**What you see:** +- **Cluster Overview** - Total teams, projects, resource pools, and quotas at a glance +- **Resource Utilization** - Visual breakdown showing which teams are consuming resources +- **7-Day Cost Trends** - Historical cost data to identify spending patterns +- **Top 5 Cost Rankings** - Quickly identify heavy GPU consumers +- **Team Budget Status** - Real-time balance monitoring with color-coded alerts + +**Who benefits:** +- **Platform Administrators** get instant visibility into cluster health and usage patterns +- **Finance Teams** can track costs in real-time without waiting for monthly reports +- **Team Leaders** can compare their usage against other teams + +--- + +### 💼 Team Management & Budget Monitoring + +![Team Management](/img/ui-team.png) + +**What you see:** +- **Team List** with real-time status indicators: + - 🟢 Green balance = Healthy budget + - 🟡 Yellow balance = Approaching threshold + - 🔴 Red balance = Low balance or suspended +- **Resource Allocation** - CPU/Memory/GPU quotas per team (e.g., "cpu 0/10" means 0 used out of 10 allocated) +- **Project Count** - Number of namespaces/projects under each team +- **Quick Actions** - Edit quotas, recharge balance, or delete team with one click + +**Who benefits:** +- **Team Leaders** monitor their budget status and resource usage at a glance +- **Administrators** manage multiple teams from a single unified view +- **Finance Teams** see which teams need recharging + +--- + +### 💰 Flexible Billing Configuration + 
+![Billing Configuration](/img/ui-billing.png) + +**What you see:** +- **Per-Resource Pricing** - Set custom prices for CPU (per core-hour), Memory (per GB-hour), GPU (per GPU-hour) +- **Currency Selection** - Support for CNY, USD, EUR, and other currencies +- **Enable/Disable Toggle** - Turn billing on/off for specific resources with one click +- **Billing Rules** - Define how resources are metered (hourly, daily, etc.) +- **Alert Thresholds** - Configure when to send low-balance warnings + +**Who benefits:** +- **Finance Teams** align cloud costs with internal chargeback policies +- **Administrators** adjust pricing based on actual hardware costs +- **Budget Managers** set appropriate warning thresholds to prevent overruns + +--- + +## Core Capabilities + +### Multi-Tenant Management +✅ **Capsule-Powered Isolation** - True multi-tenancy using Kubernetes-native Capsule operator +✅ **OIDC Integration** - Enterprise SSO support for authentication +✅ **Team-Based Access Control** - Manage users, roles, and permissions per team +✅ **Shared & Exclusive Node Pools** - Flexible resource allocation strategies + +### Real-Time Billing +✅ **Usage-Based Billing** - Accurate cost tracking based on actual resource consumption +✅ **Configurable Pricing** - Set custom rates for CPU, Memory, GPU, and any Kubernetes resource +✅ **Multi-Currency Support** - CNY, USD, EUR, and more +✅ **Billing Rules Engine** - Define custom billing logic and aggregation periods + +### Dynamic Resource Quotas +✅ **Per-Team Quotas** - CPU, Memory, GPU, Storage, and custom resources +✅ **Namespace Quotas** - Project-level resource limits within teams +✅ **Auto-Enforcement** - Kubernetes-native quota enforcement +✅ **Quota Alerts** - Notifications when approaching limits + +### Team Balance & Wallet System +✅ **Prepaid Balances** - Team wallets with real-time deduction +✅ **Auto-Deduction** - Automated billing based on resource usage +✅ **Balance Thresholds** - Configurable warning and suspension 
levels +✅ **Transaction History** - Complete audit trail of all balance changes + +### Auto-Recharge +✅ **Scheduled Top-Ups** - Weekly or monthly automatic recharges +✅ **Custom Amounts** - Flexible recharge amounts per team +✅ **Recharge Notifications** - Alert teams when balance is added + +### Balance Alerts +✅ **Multi-Channel Notifications** - Webhook, DingTalk, WeChat, Email +✅ **Configurable Thresholds** - Set warning levels (e.g., 20%, 10%, 5%) +✅ **Auto-Suspension** - Automatically suspend workloads when balance depleted +✅ **Custom Templates** - Customize alert messages + +### Usage Reports +✅ **Team Analytics** - Per-team cost breakdowns and trends +✅ **Project Analytics** - Namespace-level resource consumption +✅ **Export Capabilities** - CSV, Excel, PDF reports +✅ **Historical Data** - 30/60/90-day cost analysis + +### Audit Logging +✅ **Complete Operation History** - Track all administrative actions +✅ **User Attribution** - Who did what and when +✅ **Resource Changes** - Track quota, balance, and configuration changes +✅ **Compliance Ready** - Meet internal audit requirements + +--- + +## Architecture Highlights + +Bison's architecture is designed for simplicity, scalability, and zero external dependencies. + +```mermaid +graph TB + subgraph USER_LAYER[User Layer] + UI[Web UI<br/>React + Ant Design] + API[REST API<br/>Go + Gin] + end + + subgraph CORE[Core Services] + BS[Billing Service] + TS[Tenant Service] + QS[Quota Service] + end + + subgraph K8S[Kubernetes Layer] + CA[Capsule<br/>Multi-Tenancy] + OC[OpenCost<br/>Cost Tracking] + PR[Prometheus<br/>Metrics] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps<br/>Zero Database] + end + + UI --> API + API --> BS & TS & QS + BS --> OC + TS --> CA + QS --> CA + BS & TS --> CM + OC --> PR +``` + +### Key Architectural Benefits + +- **Zero External Dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) +- **Cloud-Native** - Built on Kubernetes primitives for maximum portability +- **Scalable** - Stateless API server that can scale horizontally +- **Secure** - Kubernetes RBAC integration and optional authentication +- **Observable** - Prometheus metrics and structured logging +- **Extensible** - Plugin architecture for custom billing rules and alerts + +--- + +## Integration Points + +### OpenCost Integration +Bison leverages [OpenCost](https://www.opencost.io/) for real-time cost tracking: +- Per-pod, per-namespace, per-team cost visibility +- GPU utilization metrics +- Historical cost data and trends +- Integration with Prometheus for metric collection + +### Capsule Integration +Bison uses [Capsule](https://capsule.clastix.io/) for multi-tenancy: +- Team-based tenant isolation +- Namespace quota enforcement +- Network and security policies +- OIDC/SSO integration + +### Prometheus Integration +Metrics collection and monitoring: +- Resource utilization tracking +- Custom billing metrics +- Alert rule evaluation +- Historical data retention + +--- + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison in your cluster +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Deep dive into system design +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.6/installation.md b/website/versioned_docs/version-0.0.6/installation.md new file mode 100644 index 0000000..da89191 --- /dev/null +++ b/website/versioned_docs/version-0.0.6/installation.md @@ -0,0 +1,316 @@ +--- +sidebar_position: 3 +--- + +# Installation Guide + +This guide provides detailed instructions for installing Bison in your 
Kubernetes cluster. + +## Prerequisites + +Before installing Bison, ensure you have: + +- **Kubernetes 1.22+** - A running Kubernetes cluster +- **kubectl** - Configured to access your cluster +- **Helm 3.8.0+** - Package manager for Kubernetes (OCI chart support, used by the recommended install method, requires 3.8.0 or later) +- **Capsule Operator v0.1.0+** - For multi-tenant isolation +- **OpenCost** - Deployed with Prometheus for cost tracking + +### Install Prerequisites + +If you haven't installed the required components: + +#### Install Capsule + +```bash +# Using Helm +helm repo add projectcapsule https://projectcapsule.github.io/charts +helm install capsule projectcapsule/capsule \ + --namespace capsule-system \ + --create-namespace +``` + +#### Install OpenCost + +```bash +# Using Helm +helm repo add opencost https://opencost.github.io/opencost-helm-chart +helm install opencost opencost/opencost \ + --namespace opencost-system \ + --create-namespace \ + --set prometheus.internal.serviceName=prometheus-server \ + --set prometheus.internal.namespaceName=prometheus-system +``` + +## Installation Methods + +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the modern OCI format. 
+ +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +### Option A: From GHCR (Recommended) + +The simplest way to install Bison is directly from GitHub Container Registry: + +```bash +# Install specific version from GHCR +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace + +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace + +# Customize installation +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true \ + --set apiServer.image.tag=0.0.2 \ + --set webUI.image.tag=0.0.2 +``` + +**Why GHCR OCI Format?** +- ✅ No separate Helm repository maintenance needed +- ✅ Unified with Docker images in GHCR +- ✅ Faster installation (direct registry pull) +- ✅ Modern Helm 3.8+ standard practice + +### Option B: From GitHub Release + +Download a specific version from GitHub Releases: + +```bash +# Download Helm chart +VERSION=0.0.2 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install the chart +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +### Option C: From Source + +Clone and build from source: + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +## Configuration Options + +Bison can be configured using Helm values. 
Here are the key configuration options: + +### Basic Configuration + +```yaml +# values.yaml +apiServer: + image: + repository: ghcr.io/supermarioyl/bison/api-server + tag: 0.0.1 + replicas: 2 + +webUI: + image: + repository: ghcr.io/supermarioyl/bison/web-ui + tag: 0.0.1 + replicas: 2 + +# OpenCost URL +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Authentication +auth: + enabled: false +``` + +### Custom Configuration Example + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set apiServer.replicas=3 \ + --set webUI.replicas=3 \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true +``` + +## Verify Installation + +After installation, verify that all components are running: + +```bash +# Check pod status +kubectl get pods -n bison-system + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# bison-api-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +# bison-webui-xxxxxxxxx-xxxxx 1/1 Running 0 2m + +# Check services +kubectl get svc -n bison-system + +# Check logs +kubectl logs -n bison-system deployment/bison-api-server +kubectl logs -n bison-system deployment/bison-webui +``` + +## Access the Platform + +### Port Forward (Development) + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +``` + +### Ingress (Production) + +For production deployments, configure an Ingress: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: bison-ingress + namespace: bison-system + annotations: + kubernetes.io/ingress.class: nginx +spec: + rules: + - host: bison.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bison-webui + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: bison-api-server + port: + number: 8080 +``` + +Apply the Ingress: + +```bash +kubectl apply -f ingress.yaml +``` + 
+## Docker Images + +Bison images are available on GitHub Container Registry: + +```bash +# Pull images +docker pull ghcr.io/supermarioyl/bison/api-server:0.0.1 +docker pull ghcr.io/supermarioyl/bison/web-ui:0.0.1 + +# Or use latest +docker pull ghcr.io/supermarioyl/bison/api-server:latest +docker pull ghcr.io/supermarioyl/bison/web-ui:latest +``` + +**Supported Platforms:** +- `linux/amd64` +- `linux/arm64` + +## Upgrading + +To upgrade Bison to a new version: + +```bash +# If you installed from the Helm repository +# (helm repo add bison https://supermarioyl.github.io/Bison/charts/), refresh it first +helm repo update + +# Upgrade to latest version +helm upgrade bison bison/bison --namespace bison-system + +# Or upgrade to specific version +helm upgrade bison bison/bison --version 0.0.2 --namespace bison-system + +# If you installed from GHCR (OCI), upgrade directly from the registry +helm upgrade bison oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 --namespace bison-system +``` + +## Uninstalling + +To completely remove Bison: + +```bash +# Uninstall Helm release +helm uninstall bison --namespace bison-system + +# Remove namespace (optional) +kubectl delete namespace bison-system +``` + +## Troubleshooting + +### Pod Not Starting + +Check pod logs for errors: + +```bash +kubectl logs -n bison-system deployment/bison-api-server +kubectl describe pod -n bison-system +``` + +### Cannot Connect to OpenCost + +Verify OpenCost is running and accessible: + +```bash +kubectl get svc -n opencost-system +kubectl port-forward -n opencost-system svc/opencost 9003:9003 + +# Test endpoint +curl http://localhost:9003/healthz +``` + +### Authentication Issues + +If authentication is enabled, ensure you have the correct credentials: + +```bash +# Default credentials (change in production!) 
+Username: admin +Password: admin +``` + +## Next Steps + +- [Configuration Guide](configuration.md) - Configure billing and settings +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design diff --git a/website/versioned_docs/version-0.0.6/intro.md b/website/versioned_docs/version-0.0.6/intro.md new file mode 100644 index 0000000..b3dc3aa --- /dev/null +++ b/website/versioned_docs/version-0.0.6/intro.md @@ -0,0 +1,167 @@ +--- +sidebar_position: 1 +slug: / +--- + +# Introduction to Bison + +![Bison Logo](/img/logo.png) + +**Enterprise GPU Resource Billing & Multi-Tenant Management Platform** + +Bison is a Kubernetes-based platform that provides comprehensive GPU resource management, billing, and multi-tenant isolation for organizations running shared GPU clusters. + +## The GPU Management Challenge + +Managing shared GPU clusters across multiple teams creates critical operational and financial challenges: + +**For Platform Administrators:** +- How do you fairly allocate expensive GPU resources across competing teams? +- How do you prevent resource hogging while ensuring everyone gets their fair share? +- How do you track who's using what and implement accurate chargeback? +- How do you maintain strict multi-tenant isolation without complex manual configuration? + +**For Finance & Budget Teams:** +- How do you implement automated chargeback for GPU usage without manual accounting? +- How do you prevent budget overruns before they happen? +- How do you generate accurate cost reports for internal billing? + +**For Development Teams:** +- How do you get predictable, isolated access to GPU resources? +- How do you know when you're approaching your budget limits? +- How do you avoid impacting other teams' workloads? 
+**Traditional Approach:** +- Manual quota configuration per namespace +- Excel-based billing calculations +- No real-time cost visibility +- Complex multi-tool setup (quota management + cost tracking + billing system) +- Frequent resource conflicts and budget surprises + +## Bison's Integrated Solution + +```mermaid +graph TB + subgraph WITHOUT["Without Bison"] + P1[❌ Manual Quota Management<br/>Per-namespace configuration] + P2[❌ Spreadsheet Billing<br/>Manual calculations & reports] + P3[❌ No Resource Isolation<br/>Teams compete for resources] + P4[❌ Budget Overruns<br/>No proactive alerts] + P5[❌ Complex Tooling<br/>Multiple systems to manage] + end + + subgraph WITH["With Bison"] + S1[✅ Automated Team Quotas<br/>Capsule-powered isolation] + S2[✅ Real-Time Billing<br/>OpenCost integration] + S3[✅ True Multi-Tenancy<br/>Shared/Exclusive modes] + S4[✅ Proactive Alerts<br/>Balance monitoring & auto-suspend] + S5[✅ Unified Platform<br/>Single pane of glass] + end + + P1 -.Transform.-> S1 + P2 -.Transform.-> S2 + P3 -.Transform.-> S3 + P4 -.Transform.-> S4 + P5 -.Transform.-> S5 + + style WITHOUT fill:#ffebee + style WITH fill:#e8f5e9 + style S1 fill:#4caf50,color:#fff + style S2 fill:#4caf50,color:#fff + style S3 fill:#4caf50,color:#fff + style S4 fill:#4caf50,color:#fff + style S5 fill:#4caf50,color:#fff +``` + +**Bison combines:** +- 🔐 **Kubernetes-native multi-tenancy** (Capsule) - True team isolation with shared or exclusive node pools +- 💰 **Real-time cost tracking** (OpenCost + Prometheus) - Per-pod, per-namespace, per-team cost visibility +- 💳 **Automated billing & budgets** - Prepaid balances, auto-deduction, low-balance alerts, and auto-suspension +- 📊 **Unified dashboard** - Single interface for admins, team leaders, and finance teams +- 🔧 **Zero external dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) + +**Result:** Deploy once, get complete GPU resource management with automated billing in under 30 minutes. 
+ +## Quick Start + +### Prerequisites + +- Kubernetes 1.22+ +- kubectl configured +- Helm 3.0+ +- Capsule operator (v0.1.0+) installed +- OpenCost deployed with Prometheus + +### Installation + +Choose one of the following installation methods: + +#### Option A: Using Helm Repository (Recommended) + +```bash +# Add Bison Helm repository +helm repo add bison https://supermarioyl.github.io/Bison/charts/ +helm repo update + +# Install with default configuration +helm install bison bison/bison --namespace bison-system --create-namespace + +# Or customize installation +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=false +``` + +#### Option B: From GitHub Release + +```bash +# Download latest Helm chart +VERSION=0.0.1 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +#### Option C: From Source + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +### Access the Platform + +After installation, access Bison through: + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +# Default credentials (if auth enabled): +# Username: admin +# Password: admin (change immediately in production!) 
+``` + +## Next Steps + +- [Explore Features](features.md) - Learn about all capabilities +- [Installation Guide](installation.md) - Detailed installation instructions +- [User Guides](user-guides/admin.md) - Role-based user guides +- [Architecture](architecture.md) - Understand the system architecture +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.6/user-guides/_category_.json b/website/versioned_docs/version-0.0.6/user-guides/_category_.json new file mode 100644 index 0000000..fe79f61 --- /dev/null +++ b/website/versioned_docs/version-0.0.6/user-guides/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "User Guides", + "position": 4, + "link": { + "type": "generated-index", + "description": "Role-based guides for using Bison effectively." + } +} diff --git a/website/versioned_docs/version-0.0.6/user-guides/admin.md b/website/versioned_docs/version-0.0.6/user-guides/admin.md new file mode 100644 index 0000000..a466cbb --- /dev/null +++ b/website/versioned_docs/version-0.0.6/user-guides/admin.md @@ -0,0 +1,176 @@ +--- +sidebar_position: 1 +--- + +# Administrator Guide + +This guide is for platform administrators who deploy, configure, and manage the Bison platform. + +## Responsibilities + +As a platform administrator, you are responsible for: + +- ✅ Deploying and configuring Bison +- ✅ Creating and managing teams +- ✅ Setting global billing configuration +- ✅ Monitoring cluster-wide metrics +- ✅ Responding to alerts and recharge requests + +## Getting Started + +### 1. Deploy Bison + +Follow the [Installation Guide](../installation.md) to deploy Bison in your Kubernetes cluster. + +### 2. Configure Billing + +Set up billing rules and pricing: + +1. Access the Web UI +2. Navigate to **Settings** > **Billing Configuration** +3. Configure: + - **Currency**: USD, CNY, EUR, etc. + - **CPU Price**: Cost per core-hour + - **Memory Price**: Cost per GB-hour + - **GPU Price**: Cost per GPU-hour +4. 
Click **Save** + +### 3. Create First Team + +Create a team for your users: + +1. Navigate to **Teams** page +2. Click **Create Team** +3. Fill in: + - **Team Name**: e.g., "ml-team" + - **Description**: Team purpose + - **Resource Quota**: + - CPU: e.g., "20" cores + - Memory: e.g., "64Gi" + - GPU: e.g., "4" + - **Initial Balance**: e.g., 1000.00 +4. Click **Create** + +## Common Tasks + +### Managing Teams + +#### View All Teams + +```bash +# Via kubectl +kubectl get tenants + +# Via API +curl http://localhost:8080/api/v1/teams +``` + +#### Update Team Quota + +1. Navigate to **Teams** page +2. Click **Edit** on the team row +3. Modify quotas +4. Click **Save** + +#### Recharge Team Balance + +1. Navigate to **Teams** page +2. Click **Recharge** on the team row +3. Enter amount +4. Add notes (optional) +5. Click **Confirm** + +### Monitoring + +#### View Dashboard + +Access real-time cluster metrics: +- Total teams and projects +- Resource utilization +- Cost trends +- Top consumers +- Balance status + +#### Check Alerts + +Monitor low-balance and quota alerts: +1. Navigate to **Alerts** page +2. Review active alerts +3. 
Take action as needed + +### Billing Configuration + +#### Update Pricing + +```bash +curl -X PUT http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "pricing": { + "cpu": 0.06, + "memory": 0.012, + "nvidia.com/gpu": 3.00 + } + }' +``` + +#### Configure Alert Thresholds + +```json +{ + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Best Practices + +### Team Naming +- Use lowercase, alphanumeric characters and hyphens +- Example: `ml-team`, `data-science`, `dev-team` + +### Quota Allocation +- Start with conservative quotas +- Monitor usage for 1-2 weeks +- Adjust based on actual needs + +### Balance Management +- Set up auto-recharge for critical teams +- Monitor balance trends weekly +- Respond to low-balance alerts promptly + +### Security +- Enable authentication in production +- Use OIDC/SSO for enterprise deployments +- Regularly audit user permissions + +## Troubleshooting + +### Team Creation Failed + +Check Capsule operator logs: +```bash +kubectl logs -n capsule-system deployment/capsule-controller-manager +``` + +### Billing Not Working + +Verify OpenCost connectivity: +```bash +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz +``` + +### High Resource Usage + +Check resource consumption: +```bash +kubectl top pods -n bison-system +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Guide for team leaders +- [Developer Guide](developer.md) - Guide for developers +- [Configuration](../configuration.md) - Advanced configuration diff --git a/website/versioned_docs/version-0.0.6/user-guides/developer.md b/website/versioned_docs/version-0.0.6/user-guides/developer.md new file mode 100644 index 0000000..d14ab62 --- /dev/null +++ b/website/versioned_docs/version-0.0.6/user-guides/developer.md @@ -0,0 +1,187 @@ +--- +sidebar_position: 3 +--- + +# Developer Guide + +This guide is for developers who 
deploy workloads and consume resources within team projects. + +## Responsibilities + +As a developer, you are responsible for: + +- ✅ Deploying applications within your project +- ✅ Monitoring resource usage +- ✅ Staying within quota limits +- ✅ Optimizing resource consumption + +## Getting Started + +### 1. Get Kubeconfig + +Request kubeconfig from your team leader or administrator. + +### 2. Set Context + +```bash +# Set context to your project namespace +kubectl config set-context --current --namespace=your-project + +# Verify +kubectl config view --minify | grep namespace +``` + +### 3. Check Quota + +See your available resources: +```bash +kubectl describe quota +``` + +## Deploying Workloads + +### Basic Pod Deployment + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-training-job + namespace: your-project +spec: + containers: + - name: trainer + image: your-ml-image:latest + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" +``` + +### Using Deployments + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ml-inference + namespace: your-project +spec: + replicas: 2 + selector: + matchLabels: + app: ml-inference + template: + metadata: + labels: + app: ml-inference + spec: + containers: + - name: inference + image: your-inference-image:latest + resources: + requests: + cpu: "2" + memory: "8Gi" + nvidia.com/gpu: "1" +``` + +## Monitoring Usage + +### Check Pod Resource Usage + +```bash +# View resource consumption +kubectl top pods + +# Detailed pod information +kubectl describe pod <pod-name> +``` + +### View Logs + +```bash +# Stream logs +kubectl logs -f <pod-name> + +# Previous logs (if pod restarted) +kubectl logs --previous <pod-name> +``` + +## Best Practices + +### Resource Requests and Limits + +Always specify both requests and limits: +```yaml +resources: + requests: + cpu: "2" + memory: "8Gi" + limits: + cpu: "4" + memory: "16Gi" +``` + +### GPU Usage + +- Request GPUs only
when needed +- Use GPU for compute-intensive tasks +- Monitor GPU utilization + +### Clean Up + +Delete resources when no longer needed: +```bash +# Delete pod +kubectl delete pod <pod-name> + +# Delete deployment +kubectl delete deployment <deployment-name> + +# Clean up completed jobs +kubectl delete job --field-selector status.successful=1 +``` + +### Cost Optimization + +- Right-size your resource requests +- Use horizontal pod autoscaling +- Clean up idle resources +- Share GPUs when possible (if supported) + +## Troubleshooting + +### Pod Pending (Insufficient Quota) + +If your pod is stuck in `Pending` state: + +```bash +kubectl describe pod <pod-name> +``` + +Look for quota-related errors and reduce resource requests or ask your team leader for more quota. + +### Out of Memory (OOM) + +If pods are killed due to OOM: +1. Check memory usage patterns +2. Increase memory limits +3. Optimize application memory usage + +### GPU Not Available + +Verify GPU requests: +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable."nvidia\.com/gpu" +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Understand team management +- [Architecture](../architecture.md) - Learn about the platform diff --git a/website/versioned_docs/version-0.0.6/user-guides/team-leader.md b/website/versioned_docs/version-0.0.6/user-guides/team-leader.md new file mode 100644 index 0000000..7b6de96 --- /dev/null +++ b/website/versioned_docs/version-0.0.6/user-guides/team-leader.md @@ -0,0 +1,126 @@ +--- +sidebar_position: 2 +--- + +# Team Leader Guide + +This guide is for team leaders who manage projects, monitor budgets, and allocate resources within their team. + +## Responsibilities + +As a team leader, you are responsible for: + +- ✅ Creating and managing projects (namespaces) +- ✅ Allocating quotas to projects +- ✅ Monitoring team balance and consumption +- ✅ Requesting recharges when needed + +## Getting Started + +### 1. Access Bison + +Log in to the Web UI with your credentials.
+ +### 2. View Team Dashboard + +Your dashboard shows: +- Team balance and status +- Resource utilization +- Active projects +- Cost trends + +## Managing Projects + +### Create a Project + +1. Navigate to **Projects** page +2. Click **Create Project** +3. Fill in: + - **Project Name**: e.g., "training-ml-models" + - **Description**: Project purpose + - **Quota** (optional): + - CPU: e.g., "8" cores + - Memory: e.g., "32Gi" + - GPU: e.g., "2" +4. Click **Create** + +### List Projects + +```bash +# Via kubectl (if you have access) +kubectl get namespaces -l capsule.clastix.io/tenant=your-team + +# Via API +curl http://localhost:8080/api/v1/teams/your-team/projects +``` + +### Delete a Project + +1. Navigate to **Projects** page +2. Click **Delete** on the project row +3. Confirm deletion + +**Warning**: This will delete all resources in the project! + +## Monitoring Budget + +### Check Balance + +View your current balance: +1. Navigate to **Team** page +2. See balance in the status card + +### View Usage Trends + +Analyze spending patterns: +1. Navigate to **Reports** page +2. Select time range (7 days, 30 days, 90 days) +3. View: + - Cost breakdown by resource type + - Daily cost trends + - Per-project consumption + +### Request Recharge + +When balance is low: +1. Click **Request Recharge** button +2. Enter requested amount +3. Add justification +4. 
Submit request to administrator + +## Resource Management + +### Monitor Quota Usage + +Check how much of your quota is being used: +```bash +kubectl describe quota -n your-project +``` + +### Optimize Costs + +Tips to reduce spending: +- **Right-size resources**: Don't over-provision CPU/Memory +- **Clean up idle pods**: Delete unused workloads +- **Use spot/preemptible instances**: Where applicable +- **Monitor GPU utilization**: Ensure GPUs are fully utilized + +## Best Practices + +### Project Organization +- Create separate projects for different workloads +- Example: `ml-training`, `ml-inference`, `data-processing` + +### Quota Allocation +- Allocate quotas based on project priority +- Reserve buffer for urgent tasks + +### Cost Awareness +- Review costs weekly +- Identify and eliminate waste +- Set up cost alerts + +## Next Steps + +- [Developer Guide](developer.md) - Guide for your team members +- [Features](../features.md) - Explore all Bison features diff --git a/website/versioned_sidebars/version-0.0.6-sidebars.json b/website/versioned_sidebars/version-0.0.6-sidebars.json new file mode 100644 index 0000000..caea0c0 --- /dev/null +++ b/website/versioned_sidebars/version-0.0.6-sidebars.json @@ -0,0 +1,8 @@ +{ + "tutorialSidebar": [ + { + "type": "autogenerated", + "dirName": "." 
+ } + ] +} diff --git a/website/versions.json b/website/versions.json index daa9a70..d470196 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,3 +1,4 @@ [ + "0.0.6", "0.0.1" ] From 151a244f97e59a7a652cb41f77d2338ae634e7d7 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 22:20:07 +0800 Subject: [PATCH 14/44] fix bug --- .github/workflows/release.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4e4176a..0775194 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -176,10 +176,10 @@ jobs: #### Method 1: From GHCR (Recommended) \`\`\`bash # Install directly from GitHub Container Registry - helm install my-bison oci://ghcr.io/${{ github.repository_owner }}/bison/bison --version ${VERSION} + helm install my-bison oci://ghcr.io/${{ github.repository_owner }}/charts/bison --version ${VERSION} # Or pull first, then install - helm pull oci://ghcr.io/${{ github.repository_owner }}/bison/bison --version ${VERSION} + helm pull oci://ghcr.io/${{ github.repository_owner }}/charts/bison --version ${VERSION} helm install my-bison bison-${VERSION}.tgz \`\`\` @@ -203,7 +203,7 @@ jobs: ### 📚 Documentation Version-specific documentation will be available shortly at: - - 📖 [https://supermarioyl.github.io/Bison/docs/${VERSION}/](https://supermarioyl.github.io/Bison/docs/${VERSION}/) + - 📖 [https://bison.lei6393.com/docs/${VERSION}/](https://bison.lei6393.com/docs/${VERSION}/) > Note: Documentation versioning happens automatically after release. Allow a few minutes for the docs site to update. 
@@ -253,4 +253,4 @@ jobs: env: VERSION: ${{ needs.prepare.outputs.version }} run: | - helm push bison-${VERSION}.tgz oci://ghcr.io/${{ github.repository_owner }}/bison + helm push bison-${VERSION}.tgz oci://ghcr.io/${{ github.repository_owner }}/charts From f8461170a3f3791c5d0c1b48e4a5a6734d5607ce Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 28 Dec 2025 14:22:38 +0000 Subject: [PATCH 15/44] docs: add version 0.0.7 [skip ci] Auto-generated documentation version from release refs/tags/v0.0.7 - Added version 0.0.7 to versions.json - Created versioned_docs/version-0.0.7/ - Created versioned_sidebars/version-0.0.7-sidebars.json --- .../version-0.0.7/architecture.md | 418 ++++++++++++++++++ .../version-0.0.7/configuration.md | 361 +++++++++++++++ .../versioned_docs/version-0.0.7/features.md | 195 ++++++++ .../version-0.0.7/installation.md | 316 +++++++++++++ website/versioned_docs/version-0.0.7/intro.md | 167 +++++++ .../version-0.0.7/user-guides/_category_.json | 8 + .../version-0.0.7/user-guides/admin.md | 176 ++++++++ .../version-0.0.7/user-guides/developer.md | 187 ++++++++ .../version-0.0.7/user-guides/team-leader.md | 126 ++++++ .../version-0.0.7-sidebars.json | 8 + website/versions.json | 1 + 11 files changed, 1963 insertions(+) create mode 100644 website/versioned_docs/version-0.0.7/architecture.md create mode 100644 website/versioned_docs/version-0.0.7/configuration.md create mode 100644 website/versioned_docs/version-0.0.7/features.md create mode 100644 website/versioned_docs/version-0.0.7/installation.md create mode 100644 website/versioned_docs/version-0.0.7/intro.md create mode 100644 website/versioned_docs/version-0.0.7/user-guides/_category_.json create mode 100644 website/versioned_docs/version-0.0.7/user-guides/admin.md create mode 100644 website/versioned_docs/version-0.0.7/user-guides/developer.md create mode 100644 website/versioned_docs/version-0.0.7/user-guides/team-leader.md create mode 100644 
website/versioned_sidebars/version-0.0.7-sidebars.json diff --git a/website/versioned_docs/version-0.0.7/architecture.md b/website/versioned_docs/version-0.0.7/architecture.md new file mode 100644 index 0000000..dbbe50d --- /dev/null +++ b/website/versioned_docs/version-0.0.7/architecture.md @@ -0,0 +1,418 @@ +--- +sidebar_position: 5 +--- + +# Architecture + +This document provides a technical overview of Bison's architecture, designed with high cohesion and low coupling principles for maintainability and scalability. + +## System Overview + +### High-Level Architecture + +```mermaid +graph TB + subgraph PRESENT[Presentation Layer] + WEB[Web UI
<br/>React 18 + Ant Design 5] + CLI[kubectl / API Client] + end + + subgraph GATEWAY[API Gateway Layer] + GW[API Server<br/>Go + Gin Framework] + AUTH[Auth Middleware<br/>JWT + OIDC] + end + + subgraph BUSINESS[Business Logic Layer] + TS[Tenant Service<br/>Team & Project CRUD] + BS[Billing Service<br/>Cost Calculation] + BLS[Balance Service<br/>Wallet Management] + QS[Quota Service<br/>Resource Limits] + AS[Alert Service<br/>Notifications] + RS[Report Service<br/>Analytics] + end + + subgraph INTEGRATION[Integration Layer] + K8S[Kubernetes Client<br/>client-go] + OCC[OpenCost Client<br/>REST API] + PC[Prometheus Client<br/>PromQL] + end + + subgraph EXTERNAL[External Systems] + KAPI[Kubernetes API] + CAP[Capsule Controller] + OC[OpenCost] + PROM[Prometheus] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps<br/>
Persistent Storage] + end + + WEB --> GW + CLI --> GW + GW --> AUTH + AUTH --> TS & BS & BLS & QS & AS & RS + + TS --> K8S + BS --> OCC + BLS --> K8S + QS --> K8S + RS --> OCC & PC + + K8S --> KAPI + K8S --> CAP + OCC --> OC + PC --> PROM + + TS & BLS --> CM + KAPI --> CM +``` + +### Design Principles + +| Principle | Implementation | +|-----------|----------------| +| **High Cohesion** | Each service handles a single domain (billing, quota, alerts) | +| **Low Coupling** | Services communicate via well-defined interfaces | +| **Stateless API** | All state persisted in Kubernetes ConfigMaps | +| **Cloud Native** | Leverages Kubernetes primitives for HA and scaling | +| **Zero Database** | ConfigMaps eliminate external database dependencies | + +## Architecture Layers + +Bison follows a layered architecture pattern: + +### 1. Presentation Layer +- **Web UI**: React 18 + TypeScript + Ant Design 5 +- **API Client**: REST API for external integrations + +### 2. API Gateway Layer +- **API Server**: Go + Gin framework +- **Authentication**: JWT and OIDC support +- **Middleware**: Logging, recovery, CORS handling + +### 3. Business Logic Layer +- **Tenant Service**: Team and project management +- **Billing Service**: Cost calculation and aggregation +- **Balance Service**: Wallet management and auto-deduction +- **Quota Service**: Resource limit enforcement +- **Alert Service**: Multi-channel notifications +- **Report Service**: Analytics and export + +### 4. Integration Layer +- **Kubernetes Client**: client-go for K8s API interaction +- **OpenCost Client**: REST API for cost data +- **Prometheus Client**: PromQL queries for metrics + +### 5. 
Data Layer +- **ConfigMaps**: Persistent storage for balances, billing config, and metadata +- **etcd**: Backing store via Kubernetes ConfigMaps + +## Core Components + +### API Server + +The API server is the central component that handles all HTTP requests: + +**Technology Stack:** +- Go 1.24+ +- Gin web framework +- client-go for Kubernetes API + +**Key Responsibilities:** +- Serve REST API endpoints +- Authentication and authorization +- Request routing and middleware +- Background task scheduling + +**Endpoints:** +``` +/api/v1/teams - Team management +/api/v1/projects - Project management +/api/v1/billing - Billing configuration +/api/v1/balance - Balance operations +/api/v1/stats - Statistics and reports +``` + +### Web UI + +React-based single-page application: + +**Technology Stack:** +- React 18 +- TypeScript +- Vite (build tool) +- Ant Design 5 +- ECharts (visualization) +- React Query (state management) + +**Features:** +- Dashboard with real-time metrics +- Team and project management +- Billing configuration +- Balance monitoring +- Usage reports and export + +### Billing Service + +Calculates costs based on resource usage: + +**Data Flow:** +```mermaid +sequenceDiagram + participant S as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant K8S as Kubernetes + + S->>BS: Trigger billing (every 10min) + BS->>OC: Query team costs + OC-->>BS: Return usage data + BS->>BS: Calculate cost + BS->>BLS: Deduct from balance + BLS->>K8S: Update ConfigMap + BLS-->>BS: Confirm + BS->>AS: Check threshold + alt Balance low + AS->>Webhook: Send alert + end +``` + +**Billing Formula:** +``` +Total Cost = (CPU_cores × CPU_price × hours) + + (Memory_GB × Memory_price × hours) + + (GPU_count × GPU_price × hours) +``` + +### Balance Service + +Manages team wallets and auto-deduction: + +**Features:** +- Real-time balance tracking +- Auto-deduction based on usage +- Recharge operations +- Transaction 
history +- Auto-suspension when balance depleted + +**Storage:** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: bison-team-balances + namespace: bison-system +data: + ml-team: "1523.45" + data-team: "890.12" + dev-team: "2100.00" +``` + +### Tenant Service + +Manages teams (Capsule Tenants) and projects (Namespaces): + +**Features:** +- Create/delete teams +- Assign resource quotas +- Configure node pools (shared/exclusive) +- Manage team metadata + +**Capsule Integration:** +```yaml +apiVersion: capsule.clastix.io/v1beta1 +kind: Tenant +metadata: + name: ml-team +spec: + owners: + - name: team-leader + kind: User + resourceQuota: + items: + - hard: + cpu: "20" + memory: 64Gi + nvidia.com/gpu: "4" +``` + +## Data Flow + +### Team Creation Flow + +```mermaid +sequenceDiagram + participant U as Admin + participant API as API Server + participant TS as Tenant Service + participant K8S as Kubernetes + participant CAP as Capsule + + U->>API: POST /api/v1/teams + API->>TS: CreateTeam(name, quota, balance) + TS->>CAP: Create Tenant + CAP-->>TS: Tenant created + TS->>K8S: Create ConfigMap (balance) + K8S-->>TS: ConfigMap created + TS-->>API: Success + API-->>U: 201 Created +``` + +### Billing Cycle Flow + +```mermaid +sequenceDiagram + participant SCH as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant AS as Alert Service + + loop Every 10 minutes + SCH->>BS: Trigger billing calculation + BS->>OC: Query costs (last 10min) + OC-->>BS: Return usage metrics + BS->>BS: Calculate total cost + BS->>BLS: Deduct cost from balance + BLS->>BLS: Update balance + BLS-->>BS: Balance updated + BS->>AS: Check balance threshold + alt Balance < 20% + AS->>AS: Send low balance alert + end + alt Balance <= 0 + AS->>BS: Suspend team workloads + end + end +``` + +## Integration Points + +### Kubernetes Integration + +Bison integrates deeply with Kubernetes: + +- **Capsule Tenants** for multi-tenancy +- 
**ResourceQuotas** for limit enforcement +- **Namespaces** for project isolation +- **ConfigMaps** for data persistence +- **RBAC** for access control + +### OpenCost Integration + +Real-time cost tracking via OpenCost API: + +```bash +# Query team costs +GET /allocation/compute?window=10m&aggregate=namespace&filter=namespace:ml-team + +# Response +{ + "ml-team": { + "cpuCost": 0.25, + "memCost": 0.10, + "gpuCost": 4.17, + "totalCost": 4.52 + } +} +``` + +### Prometheus Integration + +Metrics collection for monitoring: + +- Resource utilization metrics +- Cost metrics +- Balance metrics +- Alert metrics + +## Deployment Architecture + +### High Availability Setup + +```mermaid +graph TB + subgraph K8S[Kubernetes Cluster] + subgraph NS1[bison-system namespace] + API1[API Server Pod 1] + API2[API Server Pod 2] + WEB1[Web UI Pod 1] + WEB2[Web UI Pod 2] + end + + subgraph NS2[opencost-system namespace] + OC[OpenCost] + end + + subgraph NS3[prometheus-system namespace] + PROM[Prometheus] + end + + LB[LoadBalancer] + end + + LB --> API1 & API2 + LB --> WEB1 & WEB2 + API1 & API2 --> OC + OC --> PROM +``` + +### Resource Requirements + +**Minimum:** +- API Server: 200m CPU, 256Mi Memory +- Web UI: 100m CPU, 128Mi Memory + +**Recommended (Production):** +- API Server: 1000m CPU, 512Mi Memory (2 replicas) +- Web UI: 500m CPU, 256Mi Memory (2 replicas) + +## Security Model + +### Authentication +- JWT token-based authentication +- OIDC/SSO integration +- Admin user management + +### Authorization +- Kubernetes RBAC integration +- Role-based access control +- Team-scoped permissions + +### Data Security +- All data encrypted at rest (etcd encryption) +- TLS for API communication +- Secret management via Kubernetes Secrets + +## Technology Stack + +### Backend +- **Language**: Go 1.24+ +- **Framework**: Gin +- **Kubernetes Client**: client-go +- **Configuration**: Viper +- **Logging**: Logrus + +### Frontend +- **Framework**: React 18 +- **Language**: TypeScript +- **UI 
Library**: Ant Design 5 +- **Build Tool**: Vite +- **State Management**: React Query +- **Charts**: ECharts + +### Infrastructure +- **Platform**: Kubernetes 1.22+ +- **Multi-Tenancy**: Capsule +- **Cost Tracking**: OpenCost +- **Metrics**: Prometheus +- **Storage**: ConfigMaps (etcd) + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison +- [Configuration](configuration.md) - Configure billing +- [User Guides](user-guides/admin.md) - Learn to use Bison +- [Features](features.md) - Explore capabilities diff --git a/website/versioned_docs/version-0.0.7/configuration.md b/website/versioned_docs/version-0.0.7/configuration.md new file mode 100644 index 0000000..539aaa3 --- /dev/null +++ b/website/versioned_docs/version-0.0.7/configuration.md @@ -0,0 +1,361 @@ +--- +sidebar_position: 6 +--- + +# Configuration + +This guide covers how to configure Bison for your specific environment and requirements. + +## Helm Chart Configuration + +Bison is configured primarily through Helm values. You can customize the installation by providing a `values.yaml` file or using `--set` flags. 
+ +### Key Configuration Parameters + +| Parameter | Description | Default | Example | +|-----------|-------------|---------|---------| +| `auth.enabled` | Enable authentication | `false` | `true` | +| `auth.admin.username` | Admin username | `admin` | `admin` | +| `auth.admin.password` | Admin password | `admin` | `changeme` | +| `apiServer.replicaCount` | API server replicas | `2` | `3` | +| `apiServer.image.repository` | API server image | `ghcr.io/supermarioyl/bison/api-server` | - | +| `apiServer.image.tag` | API server image tag | `0.0.1` | `latest` | +| `webUI.replicaCount` | Web UI replicas | `2` | `3` | +| `webUI.image.repository` | Web UI image | `ghcr.io/supermarioyl/bison/web-ui` | - | +| `webUI.image.tag` | Web UI image tag | `0.0.1` | `latest` | +| `opencost.url` | OpenCost API endpoint | `http://opencost.opencost-system.svc:9003` | Custom URL | + +### Example Custom Values + +Create a `custom-values.yaml` file: + +```yaml +# Authentication +auth: + enabled: true + admin: + username: admin + password: MySecurePassword123 + +# API Server +apiServer: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 512Mi + +# Web UI +webUI: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +# OpenCost Integration +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Node Selection (optional) +nodeSelector: + node-role.kubernetes.io/control-plane: "" + +# Tolerations (optional) +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule +``` + +Install with custom values: + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --values custom-values.yaml +``` + +## Billing Configuration + +Billing settings are configured through the Web UI or API after installation. + +### Access Billing Configuration + +1. 
**Via Web UI:** + - Navigate to **Settings** > **Billing Configuration** + - Set pricing for CPU, Memory, GPU, and other resources + - Configure currency and billing intervals + +2. **Via API:** + ```bash + curl -X POST http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50 + }, + "billingInterval": "hourly" + }' + ``` + +### Billing Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `enabled` | Enable/disable billing | `true` | +| `currency` | Currency for billing | `USD`, `CNY`, `EUR` | +| `pricing.cpu` | CPU price per core-hour | `0.05` | +| `pricing.memory` | Memory price per GB-hour | `0.01` | +| `pricing["nvidia.com/gpu"]` | GPU price per GPU-hour | `2.50` | +| `billingInterval` | Billing aggregation period | `hourly`, `daily` | +| `lowBalanceThreshold` | Warning threshold (%) | `20` | +| `suspendThreshold` | Auto-suspend threshold (%) | `5` | + +### Example Billing Configuration + +```json +{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "nvidia.com/mig-1g.5gb": 0.50, + "nvidia.com/mig-2g.10gb": 1.00 + }, + "billingInterval": "hourly", + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Team Configuration + +### Creating Teams + +Teams can be created through the Web UI or API: + +**Via Web UI:** +1. Navigate to **Teams** page +2. Click **Create Team** +3. 
Set team name, quota, and initial balance + +**Via API:** +```bash +curl -X POST http://localhost:8080/api/v1/teams \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-team", + "description": "Machine Learning Team", + "quota": { + "cpu": "20", + "memory": "64Gi", + "nvidia.com/gpu": "4" + }, + "balance": 1000.00 + }' +``` + +### Team Quotas + +Team quotas define resource limits: + +```yaml +quota: + cpu: "20" # 20 CPU cores + memory: "64Gi" # 64 GB RAM + nvidia.com/gpu: "4" # 4 GPUs + storage: "500Gi" # 500 GB storage +``` + +### Team Balance Management + +Set initial balance and configure auto-recharge: + +```json +{ + "balance": 1000.00, + "autoRecharge": { + "enabled": true, + "amount": 500.00, + "schedule": "monthly", + "threshold": 100.00 + } +} +``` + +## Alert Configuration + +Configure multi-channel alerts for low balance and quota warnings. + +### Webhook Alerts + +```json +{ + "type": "webhook", + "enabled": true, + "url": "https://your-webhook-endpoint.com/alerts", + "headers": { + "Authorization": "Bearer YOUR_TOKEN" + }, + "template": { + "title": "Bison Alert", + "message": "Team {{.TeamName}} balance is {{.Balance}}" + } +} +``` + +### DingTalk Alerts + +```json +{ + "type": "dingtalk", + "enabled": true, + "webhook": "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN", + "secret": "YOUR_SECRET" +} +``` + +### WeChat Work Alerts + +```json +{ + "type": "wechat", + "enabled": true, + "corpid": "YOUR_CORP_ID", + "corpsecret": "YOUR_CORP_SECRET", + "agentid": 1000001 +} +``` + +## OpenCost Integration + +Configure OpenCost connection: + +### Check OpenCost Connectivity + +```bash +# Test OpenCost API +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz + +# Test allocation API +curl http://localhost:9003/allocation/compute?window=1d +``` + +### Update OpenCost URL + +If OpenCost is deployed in a different namespace or with a different service name: + +```bash +helm upgrade bison 
bison/bison \ + --set opencost.url=http://my-opencost.custom-namespace.svc:9003 \ + --namespace bison-system +``` + +## Authentication & OIDC + +Enable authentication and integrate with your SSO provider: + +### Basic Authentication + +```yaml +auth: + enabled: true + admin: + username: admin + password: SecurePassword123 +``` + +### OIDC Integration + +```yaml +auth: + enabled: true + oidc: + enabled: true + issuerURL: https://your-oidc-provider.com + clientID: bison-client-id + clientSecret: your-client-secret + redirectURL: https://bison.example.com/callback +``` + +## Environment Variables + +Additional configuration can be provided via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `KUBECONFIG` | Path to kubeconfig file | In-cluster config | +| `OPENCOST_URL` | OpenCost API URL | `http://opencost.opencost-system.svc:9003` | +| `AUTH_ENABLED` | Enable authentication | `false` | +| `LOG_LEVEL` | Logging level | `info` | +| `BILLING_INTERVAL` | Billing calculation interval | `10m` | + +Set environment variables in Helm values: + +```yaml +apiServer: + env: + - name: LOG_LEVEL + value: debug + - name: BILLING_INTERVAL + value: 5m +``` + +## Advanced Configuration + +### Custom Resource Pricing + +Price any Kubernetes resource: + +```json +{ + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "amd.com/gpu": 2.00, + "ephemeral-storage": 0.001, + "custom.io/fpga": 5.00 + } +} +``` + +### Multi-Cluster Support + +Deploy Bison in each cluster with shared billing: + +```yaml +# Cluster A +apiServer: + clusterName: prod-us-west + +# Cluster B +apiServer: + clusterName: prod-us-east +``` + +## Next Steps + +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design +- [Features](features.md) - Explore all capabilities diff --git a/website/versioned_docs/version-0.0.7/features.md 
b/website/versioned_docs/version-0.0.7/features.md new file mode 100644 index 0000000..8d0c438 --- /dev/null +++ b/website/versioned_docs/version-0.0.7/features.md @@ -0,0 +1,195 @@ +--- +sidebar_position: 2 +--- + +# Features + +Bison provides a comprehensive suite of features for GPU resource management, billing, and multi-tenant isolation in Kubernetes environments. + +## See Bison in Action + +### 🎯 Real-Time Resource Dashboard + +![Bison Dashboard](/img/ui-dashboard.png) + +**What you see:** +- **Cluster Overview** - Total teams, projects, resource pools, and quotas at a glance +- **Resource Utilization** - Visual breakdown showing which teams are consuming resources +- **7-Day Cost Trends** - Historical cost data to identify spending patterns +- **Top 5 Cost Rankings** - Quickly identify heavy GPU consumers +- **Team Budget Status** - Real-time balance monitoring with color-coded alerts + +**Who benefits:** +- **Platform Administrators** get instant visibility into cluster health and usage patterns +- **Finance Teams** can track costs in real-time without waiting for monthly reports +- **Team Leaders** can compare their usage against other teams + +--- + +### 💼 Team Management & Budget Monitoring + +![Team Management](/img/ui-team.png) + +**What you see:** +- **Team List** with real-time status indicators: + - 🟢 Green balance = Healthy budget + - 🟡 Yellow balance = Approaching threshold + - 🔴 Red balance = Low balance or suspended +- **Resource Allocation** - CPU/Memory/GPU quotas per team (e.g., "cpu 0/10" means 0 used out of 10 allocated) +- **Project Count** - Number of namespaces/projects under each team +- **Quick Actions** - Edit quotas, recharge balance, or delete team with one click + +**Who benefits:** +- **Team Leaders** monitor their budget status and resource usage at a glance +- **Administrators** manage multiple teams from a single unified view +- **Finance Teams** see which teams need recharging + +--- + +### 💰 Flexible Billing Configuration + 
+![Billing Configuration](/img/ui-billing.png) + +**What you see:** +- **Per-Resource Pricing** - Set custom prices for CPU (per core-hour), Memory (per GB-hour), GPU (per GPU-hour) +- **Currency Selection** - Support for CNY, USD, EUR, and other currencies +- **Enable/Disable Toggle** - Turn billing on/off for specific resources with one click +- **Billing Rules** - Define how resources are metered (hourly, daily, etc.) +- **Alert Thresholds** - Configure when to send low-balance warnings + +**Who benefits:** +- **Finance Teams** align cloud costs with internal chargeback policies +- **Administrators** adjust pricing based on actual hardware costs +- **Budget Managers** set appropriate warning thresholds to prevent overruns + +--- + +## Core Capabilities + +### Multi-Tenant Management +✅ **Capsule-Powered Isolation** - True multi-tenancy using Kubernetes-native Capsule operator +✅ **OIDC Integration** - Enterprise SSO support for authentication +✅ **Team-Based Access Control** - Manage users, roles, and permissions per team +✅ **Shared & Exclusive Node Pools** - Flexible resource allocation strategies + +### Real-Time Billing +✅ **Usage-Based Billing** - Accurate cost tracking based on actual resource consumption +✅ **Configurable Pricing** - Set custom rates for CPU, Memory, GPU, and any Kubernetes resource +✅ **Multi-Currency Support** - CNY, USD, EUR, and more +✅ **Billing Rules Engine** - Define custom billing logic and aggregation periods + +### Dynamic Resource Quotas +✅ **Per-Team Quotas** - CPU, Memory, GPU, Storage, and custom resources +✅ **Namespace Quotas** - Project-level resource limits within teams +✅ **Auto-Enforcement** - Kubernetes-native quota enforcement +✅ **Quota Alerts** - Notifications when approaching limits + +### Team Balance & Wallet System +✅ **Prepaid Balances** - Team wallets with real-time deduction +✅ **Auto-Deduction** - Automated billing based on resource usage +✅ **Balance Thresholds** - Configurable warning and suspension 
levels +✅ **Transaction History** - Complete audit trail of all balance changes + +### Auto-Recharge +✅ **Scheduled Top-Ups** - Weekly or monthly automatic recharges +✅ **Custom Amounts** - Flexible recharge amounts per team +✅ **Recharge Notifications** - Alert teams when balance is added + +### Balance Alerts +✅ **Multi-Channel Notifications** - Webhook, DingTalk, WeChat, Email +✅ **Configurable Thresholds** - Set warning levels (e.g., 20%, 10%, 5%) +✅ **Auto-Suspension** - Automatically suspend workloads when balance depleted +✅ **Custom Templates** - Customize alert messages + +### Usage Reports +✅ **Team Analytics** - Per-team cost breakdowns and trends +✅ **Project Analytics** - Namespace-level resource consumption +✅ **Export Capabilities** - CSV, Excel, PDF reports +✅ **Historical Data** - 30/60/90-day cost analysis + +### Audit Logging +✅ **Complete Operation History** - Track all administrative actions +✅ **User Attribution** - Who did what and when +✅ **Resource Changes** - Track quota, balance, and configuration changes +✅ **Compliance Ready** - Meet internal audit requirements + +--- + +## Architecture Highlights + +Bison's architecture is designed for simplicity, scalability, and zero external dependencies. + +```mermaid +graph TB + subgraph USER_LAYER[User Layer] + UI[Web UI<br/>React + Ant Design] + API[REST API<br/>Go + Gin] + end + + subgraph CORE[Core Services] + BS[Billing Service] + TS[Tenant Service] + QS[Quota Service] + end + + subgraph K8S[Kubernetes Layer] + CA[Capsule<br/>Multi-Tenancy] + OC[OpenCost<br/>Cost Tracking] + PR[Prometheus<br/>Metrics] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps<br/>Zero Database] + end + + UI --> API + API --> BS & TS & QS + BS --> OC + TS --> CA + QS --> CA + BS & TS --> CM + OC --> PR +``` + +### Key Architectural Benefits + +- **Zero External Dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) +- **Cloud-Native** - Built on Kubernetes primitives for maximum portability +- **Scalable** - Stateless API server that can scale horizontally +- **Secure** - Kubernetes RBAC integration and optional authentication +- **Observable** - Prometheus metrics and structured logging +- **Extensible** - Plugin architecture for custom billing rules and alerts + +--- + +## Integration Points + +### OpenCost Integration +Bison leverages [OpenCost](https://www.opencost.io/) for real-time cost tracking: +- Per-pod, per-namespace, per-team cost visibility +- GPU utilization metrics +- Historical cost data and trends +- Integration with Prometheus for metric collection + +### Capsule Integration +Bison uses [Capsule](https://capsule.clastix.io/) for multi-tenancy: +- Team-based tenant isolation +- Namespace quota enforcement +- Network and security policies +- OIDC/SSO integration + +### Prometheus Integration +Metrics collection and monitoring: +- Resource utilization tracking +- Custom billing metrics +- Alert rule evaluation +- Historical data retention + +--- + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison in your cluster +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Deep dive into system design +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.7/installation.md b/website/versioned_docs/version-0.0.7/installation.md new file mode 100644 index 0000000..da89191 --- /dev/null +++ b/website/versioned_docs/version-0.0.7/installation.md @@ -0,0 +1,316 @@ +--- +sidebar_position: 3 +--- + +# Installation Guide + +This guide provides detailed instructions for installing Bison in your
Kubernetes cluster. + +## Prerequisites + +Before installing Bison, ensure you have: + +- **Kubernetes 1.22+** - A running Kubernetes cluster +- **kubectl** - Configured to access your cluster +- **Helm 3.8+** - Package manager for Kubernetes (3.8.0 or later is required to install charts from an OCI registry) +- **Capsule Operator v0.1.0+** - For multi-tenant isolation +- **OpenCost** - Deployed with Prometheus for cost tracking + +### Install Prerequisites + +If you haven't installed the required components: + +#### Install Capsule + +```bash +# Using Helm +helm repo add projectcapsule https://projectcapsule.github.io/charts +helm install capsule projectcapsule/capsule \ + --namespace capsule-system \ + --create-namespace +``` + +#### Install OpenCost + +```bash +# Using Helm +helm repo add opencost https://opencost.github.io/opencost-helm-chart +helm install opencost opencost/opencost \ + --namespace opencost-system \ + --create-namespace \ + --set prometheus.internal.serviceName=prometheus-server \ + --set prometheus.internal.namespaceName=prometheus-system +``` + +## Installation Methods + +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the modern OCI format.
+ +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +### Option A: From GHCR (Recommended) + +The simplest way to install Bison is directly from GitHub Container Registry: + +```bash +# Install specific version from GHCR +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace + +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace + +# Customize installation +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true \ + --set apiServer.image.tag=0.0.2 \ + --set webUI.image.tag=0.0.2 +``` + +**Why GHCR OCI Format?** +- ✅ No separate Helm repository maintenance needed +- ✅ Unified with Docker images in GHCR +- ✅ Faster installation (direct registry pull) +- ✅ Modern Helm 3.8+ standard practice + +### Option B: From GitHub Release + +Download a specific version from GitHub Releases: + +```bash +# Download Helm chart +VERSION=0.0.2 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install the chart +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +### Option C: From Source + +Clone and build from source: + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +## Configuration Options + +Bison can be configured using Helm values. 
Here are the key configuration options: + +### Basic Configuration + +```yaml +# values.yaml +apiServer: + image: + repository: ghcr.io/supermarioyl/bison/api-server + tag: 0.0.1 + replicas: 2 + +webUI: + image: + repository: ghcr.io/supermarioyl/bison/web-ui + tag: 0.0.1 + replicas: 2 + +# OpenCost URL +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Authentication +auth: + enabled: false +``` + +### Custom Configuration Example + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set apiServer.replicas=3 \ + --set webUI.replicas=3 \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true +``` + +## Verify Installation + +After installation, verify that all components are running: + +```bash +# Check pod status +kubectl get pods -n bison-system + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# bison-api-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +# bison-webui-xxxxxxxxx-xxxxx 1/1 Running 0 2m + +# Check services +kubectl get svc -n bison-system + +# Check logs +kubectl logs -n bison-system deployment/bison-api-server +kubectl logs -n bison-system deployment/bison-webui +``` + +## Access the Platform + +### Port Forward (Development) + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +``` + +### Ingress (Production) + +For production deployments, configure an Ingress: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: bison-ingress + namespace: bison-system + annotations: + kubernetes.io/ingress.class: nginx +spec: + rules: + - host: bison.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bison-webui + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: bison-api-server + port: + number: 8080 +``` + +Apply the Ingress: + +```bash +kubectl apply -f ingress.yaml +``` + 
+## Docker Images + +Bison images are available on GitHub Container Registry: + +```bash +# Pull images +docker pull ghcr.io/supermarioyl/bison/api-server:0.0.1 +docker pull ghcr.io/supermarioyl/bison/web-ui:0.0.1 + +# Or use latest +docker pull ghcr.io/supermarioyl/bison/api-server:latest +docker pull ghcr.io/supermarioyl/bison/web-ui:latest +``` + +**Supported Platforms:** +- `linux/amd64` +- `linux/arm64` + +## Upgrading + +To upgrade Bison to a new version: + +```bash +# Update Helm repository +helm repo update + +# Upgrade to latest version +helm upgrade bison bison/bison --namespace bison-system + +# Or upgrade to specific version +helm upgrade bison bison/bison --version 0.0.2 --namespace bison-system +``` + +## Uninstalling + +To completely remove Bison: + +```bash +# Uninstall Helm release +helm uninstall bison --namespace bison-system + +# Remove namespace (optional) +kubectl delete namespace bison-system +``` + +## Troubleshooting + +### Pod Not Starting + +Check pod logs for errors: + +```bash +kubectl logs -n bison-system deployment/bison-api-server +kubectl describe pod -n bison-system +``` + +### Cannot Connect to OpenCost + +Verify OpenCost is running and accessible: + +```bash +kubectl get svc -n opencost-system +kubectl port-forward -n opencost-system svc/opencost 9003:9003 + +# Test endpoint +curl http://localhost:9003/healthz +``` + +### Authentication Issues + +If authentication is enabled, ensure you have the correct credentials: + +```bash +# Default credentials (change in production!) 
+Username: admin +Password: admin +``` + +## Next Steps + +- [Configuration Guide](configuration.md) - Configure billing and settings +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design diff --git a/website/versioned_docs/version-0.0.7/intro.md b/website/versioned_docs/version-0.0.7/intro.md new file mode 100644 index 0000000..b3dc3aa --- /dev/null +++ b/website/versioned_docs/version-0.0.7/intro.md @@ -0,0 +1,167 @@ +--- +sidebar_position: 1 +slug: / +--- + +# Introduction to Bison + +![Bison Logo](/img/logo.png) + +**Enterprise GPU Resource Billing & Multi-Tenant Management Platform** + +Bison is a Kubernetes-based platform that provides comprehensive GPU resource management, billing, and multi-tenant isolation for organizations running shared GPU clusters. + +## The GPU Management Challenge + +Managing shared GPU clusters across multiple teams creates critical operational and financial challenges: + +**For Platform Administrators:** +- How do you fairly allocate expensive GPU resources across competing teams? +- How do you prevent resource hogging while ensuring everyone gets their fair share? +- How do you track who's using what and implement accurate chargeback? +- How do you maintain strict multi-tenant isolation without complex manual configuration? + +**For Finance & Budget Teams:** +- How do you implement automated chargeback for GPU usage without manual accounting? +- How do you prevent budget overruns before they happen? +- How do you generate accurate cost reports for internal billing? + +**For Development Teams:** +- How do you get predictable, isolated access to GPU resources? +- How do you know when you're approaching your budget limits? +- How do you avoid impacting other teams' workloads? 
+ +**Traditional Approach:** +- Manual quota configuration per namespace +- Excel-based billing calculations +- No real-time cost visibility +- Complex multi-tool setup (quota management + cost tracking + billing system) +- Frequent resource conflicts and budget surprises + +## Bison's Integrated Solution + +```mermaid +graph TB + subgraph WITHOUT["Without Bison"] + P1[❌ Manual Quota Management<br/>Per-namespace configuration] + P2[❌ Spreadsheet Billing<br/>Manual calculations & reports] + P3[❌ No Resource Isolation<br/>Teams compete for resources] + P4[❌ Budget Overruns<br/>No proactive alerts] + P5[❌ Complex Tooling<br/>Multiple systems to manage] + end + + subgraph WITH["With Bison"] + S1[✅ Automated Team Quotas<br/>Capsule-powered isolation] + S2[✅ Real-Time Billing<br/>OpenCost integration] + S3[✅ True Multi-Tenancy<br/>Shared/Exclusive modes] + S4[✅ Proactive Alerts<br/>Balance monitoring & auto-suspend] + S5[✅ Unified Platform<br/>Single pane of glass] + end + + P1 -.Transform.-> S1 + P2 -.Transform.-> S2 + P3 -.Transform.-> S3 + P4 -.Transform.-> S4 + P5 -.Transform.-> S5 + + style WITHOUT fill:#ffebee + style WITH fill:#e8f5e9 + style S1 fill:#4caf50,color:#fff + style S2 fill:#4caf50,color:#fff + style S3 fill:#4caf50,color:#fff + style S4 fill:#4caf50,color:#fff + style S5 fill:#4caf50,color:#fff +``` + +**Bison combines:** +- 🔐 **Kubernetes-native multi-tenancy** (Capsule) - True team isolation with shared or exclusive node pools +- 💰 **Real-time cost tracking** (OpenCost + Prometheus) - Per-pod, per-namespace, per-team cost visibility +- 💳 **Automated billing & budgets** - Prepaid balances, auto-deduction, low-balance alerts, and auto-suspension +- 📊 **Unified dashboard** - Single interface for admins, team leaders, and finance teams +- 🔧 **Zero external dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) + +**Result:** Deploy once, get complete GPU resource management with automated billing in under 30 minutes.
+ +## Quick Start + +### Prerequisites + +- Kubernetes 1.22+ +- kubectl configured +- Helm 3.0+ +- Capsule operator (v0.1.0+) installed +- OpenCost deployed with Prometheus + +### Installation + +Choose one of the following installation methods: + +#### Option A: Using Helm Repository (Recommended) + +```bash +# Add Bison Helm repository +helm repo add bison https://supermarioyl.github.io/Bison/charts/ +helm repo update + +# Install with default configuration +helm install bison bison/bison --namespace bison-system --create-namespace + +# Or customize installation +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=false +``` + +#### Option B: From GitHub Release + +```bash +# Download latest Helm chart +VERSION=0.0.1 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +#### Option C: From Source + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +### Access the Platform + +After installation, access Bison through: + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +# Default credentials (if auth enabled): +# Username: admin +# Password: admin (change immediately in production!) 
+``` + +## Next Steps + +- [Explore Features](features.md) - Learn about all capabilities +- [Installation Guide](installation.md) - Detailed installation instructions +- [User Guides](user-guides/admin.md) - Role-based user guides +- [Architecture](architecture.md) - Understand the system architecture +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.7/user-guides/_category_.json b/website/versioned_docs/version-0.0.7/user-guides/_category_.json new file mode 100644 index 0000000..fe79f61 --- /dev/null +++ b/website/versioned_docs/version-0.0.7/user-guides/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "User Guides", + "position": 4, + "link": { + "type": "generated-index", + "description": "Role-based guides for using Bison effectively." + } +} diff --git a/website/versioned_docs/version-0.0.7/user-guides/admin.md b/website/versioned_docs/version-0.0.7/user-guides/admin.md new file mode 100644 index 0000000..a466cbb --- /dev/null +++ b/website/versioned_docs/version-0.0.7/user-guides/admin.md @@ -0,0 +1,176 @@ +--- +sidebar_position: 1 +--- + +# Administrator Guide + +This guide is for platform administrators who deploy, configure, and manage the Bison platform. + +## Responsibilities + +As a platform administrator, you are responsible for: + +- ✅ Deploying and configuring Bison +- ✅ Creating and managing teams +- ✅ Setting global billing configuration +- ✅ Monitoring cluster-wide metrics +- ✅ Responding to alerts and recharge requests + +## Getting Started + +### 1. Deploy Bison + +Follow the [Installation Guide](../installation.md) to deploy Bison in your Kubernetes cluster. + +### 2. Configure Billing + +Set up billing rules and pricing: + +1. Access the Web UI +2. Navigate to **Settings** > **Billing Configuration** +3. Configure: + - **Currency**: USD, CNY, EUR, etc. + - **CPU Price**: Cost per core-hour + - **Memory Price**: Cost per GB-hour + - **GPU Price**: Cost per GPU-hour +4. 
Click **Save** + +### 3. Create First Team + +Create a team for your users: + +1. Navigate to **Teams** page +2. Click **Create Team** +3. Fill in: + - **Team Name**: e.g., "ml-team" + - **Description**: Team purpose + - **Resource Quota**: + - CPU: e.g., "20" cores + - Memory: e.g., "64Gi" + - GPU: e.g., "4" + - **Initial Balance**: e.g., 1000.00 +4. Click **Create** + +## Common Tasks + +### Managing Teams + +#### View All Teams + +```bash +# Via kubectl +kubectl get tenants + +# Via API +curl http://localhost:8080/api/v1/teams +``` + +#### Update Team Quota + +1. Navigate to **Teams** page +2. Click **Edit** on the team row +3. Modify quotas +4. Click **Save** + +#### Recharge Team Balance + +1. Navigate to **Teams** page +2. Click **Recharge** on the team row +3. Enter amount +4. Add notes (optional) +5. Click **Confirm** + +### Monitoring + +#### View Dashboard + +Access real-time cluster metrics: +- Total teams and projects +- Resource utilization +- Cost trends +- Top consumers +- Balance status + +#### Check Alerts + +Monitor low-balance and quota alerts: +1. Navigate to **Alerts** page +2. Review active alerts +3. 
Take action as needed + +### Billing Configuration + +#### Update Pricing + +```bash +curl -X PUT http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "pricing": { + "cpu": 0.06, + "memory": 0.012, + "nvidia.com/gpu": 3.00 + } + }' +``` + +#### Configure Alert Thresholds + +```json +{ + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Best Practices + +### Team Naming +- Use lowercase, alphanumeric characters and hyphens +- Example: `ml-team`, `data-science`, `dev-team` + +### Quota Allocation +- Start with conservative quotas +- Monitor usage for 1-2 weeks +- Adjust based on actual needs + +### Balance Management +- Set up auto-recharge for critical teams +- Monitor balance trends weekly +- Respond to low-balance alerts promptly + +### Security +- Enable authentication in production +- Use OIDC/SSO for enterprise deployments +- Regularly audit user permissions + +## Troubleshooting + +### Team Creation Failed + +Check Capsule operator logs: +```bash +kubectl logs -n capsule-system deployment/capsule-controller-manager +``` + +### Billing Not Working + +Verify OpenCost connectivity: +```bash +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz +``` + +### High Resource Usage + +Check resource consumption: +```bash +kubectl top pods -n bison-system +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Guide for team leaders +- [Developer Guide](developer.md) - Guide for developers +- [Configuration](../configuration.md) - Advanced configuration diff --git a/website/versioned_docs/version-0.0.7/user-guides/developer.md b/website/versioned_docs/version-0.0.7/user-guides/developer.md new file mode 100644 index 0000000..d14ab62 --- /dev/null +++ b/website/versioned_docs/version-0.0.7/user-guides/developer.md @@ -0,0 +1,187 @@ +--- +sidebar_position: 3 +--- + +# Developer Guide + +This guide is for developers who 
deploy workloads and consume resources within team projects. + +## Responsibilities + +As a developer, you are responsible for: + +- ✅ Deploying applications within your project +- ✅ Monitoring resource usage +- ✅ Staying within quota limits +- ✅ Optimizing resource consumption + +## Getting Started + +### 1. Get Kubeconfig + +Request kubeconfig from your team leader or administrator. + +### 2. Set Context + +```bash +# Set context to your project namespace +kubectl config set-context --current --namespace=your-project + +# Verify +kubectl config view --minify | grep namespace +``` + +### 3. Check Quota + +See your available resources: +```bash +kubectl describe quota +``` + +## Deploying Workloads + +### Basic Pod Deployment + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-training-job + namespace: your-project +spec: + containers: + - name: trainer + image: your-ml-image:latest + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" +``` + +### Using Deployments + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ml-inference + namespace: your-project +spec: + replicas: 2 + selector: + matchLabels: + app: ml-inference + template: + metadata: + labels: + app: ml-inference + spec: + containers: + - name: inference + image: your-inference-image:latest + resources: + requests: + cpu: "2" + memory: "8Gi" + nvidia.com/gpu: "1" +``` + +## Monitoring Usage + +### Check Pod Resource Usage + +```bash +# View resource consumption +kubectl top pods + +# Detailed pod information +kubectl describe pod <pod-name> +``` + +### View Logs + +```bash +# Stream logs +kubectl logs -f <pod-name> + +# Previous logs (if pod restarted) +kubectl logs --previous <pod-name> +``` + +## Best Practices + +### Resource Requests and Limits + +Always specify both requests and limits: +```yaml +resources: + requests: + cpu: "2" + memory: "8Gi" + limits: + cpu: "4" + memory: "16Gi" +``` + +### GPU Usage + +- Request GPUs only
when needed +- Use GPU for compute-intensive tasks +- Monitor GPU utilization + +### Clean Up + +Delete resources when no longer needed: +```bash +# Delete pod +kubectl delete pod <pod-name> + +# Delete deployment +kubectl delete deployment <deployment-name> + +# Clean up completed jobs +kubectl delete job --field-selector status.successful=1 +``` + +### Cost Optimization + +- Right-size your resource requests +- Use horizontal pod autoscaling +- Clean up idle resources +- Share GPUs when possible (if supported) + +## Troubleshooting + +### Pod Pending (Insufficient Quota) + +If your pod is stuck in `Pending` state: + +```bash +kubectl describe pod <pod-name> +``` + +Look for quota-related errors and reduce resource requests or ask your team leader for more quota. + +### Out of Memory (OOM) + +If pods are killed due to OOM: +1. Check memory usage patterns +2. Increase memory limits +3. Optimize application memory usage + +### GPU Not Available + +Verify GPU requests: +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable."nvidia\.com/gpu" +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Understand team management +- [Architecture](../architecture.md) - Learn about the platform diff --git a/website/versioned_docs/version-0.0.7/user-guides/team-leader.md b/website/versioned_docs/version-0.0.7/user-guides/team-leader.md new file mode 100644 index 0000000..7b6de96 --- /dev/null +++ b/website/versioned_docs/version-0.0.7/user-guides/team-leader.md @@ -0,0 +1,126 @@ +--- +sidebar_position: 2 +--- + +# Team Leader Guide + +This guide is for team leaders who manage projects, monitor budgets, and allocate resources within their team. + +## Responsibilities + +As a team leader, you are responsible for: + +- ✅ Creating and managing projects (namespaces) +- ✅ Allocating quotas to projects +- ✅ Monitoring team balance and consumption +- ✅ Requesting recharges when needed + +## Getting Started + +### 1. Access Bison + +Log in to the Web UI with your credentials.
+ +### 2. View Team Dashboard + +Your dashboard shows: +- Team balance and status +- Resource utilization +- Active projects +- Cost trends + +## Managing Projects + +### Create a Project + +1. Navigate to **Projects** page +2. Click **Create Project** +3. Fill in: + - **Project Name**: e.g., "training-ml-models" + - **Description**: Project purpose + - **Quota** (optional): + - CPU: e.g., "8" cores + - Memory: e.g., "32Gi" + - GPU: e.g., "2" +4. Click **Create** + +### List Projects + +```bash +# Via kubectl (if you have access) +kubectl get namespaces -l capsule.clastix.io/tenant=your-team + +# Via API +curl http://localhost:8080/api/v1/teams/your-team/projects +``` + +### Delete a Project + +1. Navigate to **Projects** page +2. Click **Delete** on the project row +3. Confirm deletion + +**Warning**: This will delete all resources in the project! + +## Monitoring Budget + +### Check Balance + +View your current balance: +1. Navigate to **Team** page +2. See balance in the status card + +### View Usage Trends + +Analyze spending patterns: +1. Navigate to **Reports** page +2. Select time range (7 days, 30 days, 90 days) +3. View: + - Cost breakdown by resource type + - Daily cost trends + - Per-project consumption + +### Request Recharge + +When balance is low: +1. Click **Request Recharge** button +2. Enter requested amount +3. Add justification +4. 
Submit request to administrator + +## Resource Management + +### Monitor Quota Usage + +Check how much of your quota is being used: +```bash +kubectl describe quota -n your-project +``` + +### Optimize Costs + +Tips to reduce spending: +- **Right-size resources**: Don't over-provision CPU/Memory +- **Clean up idle pods**: Delete unused workloads +- **Use spot/preemptible instances**: Where applicable +- **Monitor GPU utilization**: Ensure GPUs are fully utilized + +## Best Practices + +### Project Organization +- Create separate projects for different workloads +- Example: `ml-training`, `ml-inference`, `data-processing` + +### Quota Allocation +- Allocate quotas based on project priority +- Reserve buffer for urgent tasks + +### Cost Awareness +- Review costs weekly +- Identify and eliminate waste +- Set up cost alerts + +## Next Steps + +- [Developer Guide](developer.md) - Guide for your team members +- [Features](../features.md) - Explore all Bison features diff --git a/website/versioned_sidebars/version-0.0.7-sidebars.json b/website/versioned_sidebars/version-0.0.7-sidebars.json new file mode 100644 index 0000000..caea0c0 --- /dev/null +++ b/website/versioned_sidebars/version-0.0.7-sidebars.json @@ -0,0 +1,8 @@ +{ + "tutorialSidebar": [ + { + "type": "autogenerated", + "dirName": "." 
+ } + ] +} diff --git a/website/versions.json b/website/versions.json index d470196..297a5c4 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,4 +1,5 @@ [ + "0.0.7", "0.0.6", "0.0.1" ] From db5e3d4dee14a674ded953b34835f25732ec7b20 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 22:35:24 +0800 Subject: [PATCH 16/44] fix bug --- .github/workflows/release.yml | 19 +++++--- scripts/make-ghcr-public.md | 89 +++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 scripts/make-ghcr-public.md diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0775194..b4e8963 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -81,6 +81,12 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Set GHCR organization + id: ghcr + run: | + # Use custom GHCR organization name + echo "owner=lei6393" >> $GITHUB_OUTPUT + - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: @@ -92,7 +98,7 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.IMAGE_PREFIX }}/${{ matrix.component }} + images: ghcr.io/${{ steps.ghcr.outputs.owner }}/bison/${{ matrix.component }} tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} @@ -176,10 +182,10 @@ jobs: #### Method 1: From GHCR (Recommended) \`\`\`bash # Install directly from GitHub Container Registry - helm install my-bison oci://ghcr.io/${{ github.repository_owner }}/charts/bison --version ${VERSION} + helm install my-bison oci://ghcr.io/lei6393/charts/bison --version ${VERSION} # Or pull first, then install - helm pull oci://ghcr.io/${{ github.repository_owner }}/charts/bison --version ${VERSION} + helm pull oci://ghcr.io/lei6393/charts/bison --version ${VERSION} helm install my-bison bison-${VERSION}.tgz \`\`\` @@ -192,8 +198,8 @@ jobs: ### 🐳 Docker Images \`\`\`bash - docker pull ghcr.io/${{ 
github.repository_owner }}/bison/api-server:${VERSION} - docker pull ghcr.io/${{ github.repository_owner }}/bison/web-ui:${VERSION} + docker pull ghcr.io/lei6393/bison/api-server:${VERSION} + docker pull ghcr.io/lei6393/bison/web-ui:${VERSION} \`\`\` ### 📦 What's Changed @@ -253,4 +259,5 @@ jobs: env: VERSION: ${{ needs.prepare.outputs.version }} run: | - helm push bison-${VERSION}.tgz oci://ghcr.io/${{ github.repository_owner }}/charts + # Use custom GHCR organization + helm push bison-${VERSION}.tgz oci://ghcr.io/lei6393/charts diff --git a/scripts/make-ghcr-public.md b/scripts/make-ghcr-public.md new file mode 100644 index 0000000..7b71804 --- /dev/null +++ b/scripts/make-ghcr-public.md @@ -0,0 +1,89 @@ +# 将 GHCR Package 设为公开 + +由于你无法通过 OCI 路径拉取 Helm chart (`Error: invalid_reference: invalid repository` 或 `403 Forbidden`),这是因为 GitHub Container Registry 的包默认是私有的。 + +## 手动设置 Package 为 Public + +1. **访问 GitHub Packages**: + - 进入 https://github.com/SuperMarioYL?tab=packages + - 或者直接访问仓库首页,点击右侧的 "Packages" + +2. **找到 charts/bison package**: + - 如果存在,点击进入 `charts/bison` package + +3. **修改可见性**: + - 点击 **Package settings** (右上角齿轮图标) + - 滚动到底部找到 **Danger Zone** + - 点击 **Change visibility** + - 选择 **Public** + - 确认更改 + +## 如果找不到 charts/bison Package + +说明 Helm chart 推送到 GHCR 失败了。检查步骤: + +1. **检查 GitHub Actions 运行日志**: + ``` + https://github.com/SuperMarioYL/Bison/actions + ``` + - 找到最近的 "Release" workflow run (v0.0.7) + - 查看 "Publish to Helm Repository (GHCR)" job 的日志 + - 检查是否有错误信息 + +2. 
**常见失败原因**: + - 权限不足 (GITHUB_TOKEN 没有 `packages: write` 权限) + - OCI 路径错误 + - Helm 登录失败 + +## 临时解决方案 + +在 GHCR OCI 路径修复之前,使用 GitHub Releases: + +```bash +# 列出所有可用版本 +curl -s https://api.github.com/repos/SuperMarioYL/Bison/releases | grep tag_name + +# 下载特定版本 +VERSION=0.0.7 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# 安装 +helm install my-bison bison-${VERSION}.tgz + +# 或者创建本地 Helm 仓库 +mkdir -p ~/helm-charts +cp bison-*.tgz ~/helm-charts/ +helm repo index ~/helm-charts/ +helm repo add local-bison ~/helm-charts/ +helm install my-bison local-bison/bison --version ${VERSION} +``` + +## 验证 Package 是否存在于 GHCR + +```bash +# 使用 GitHub API 检查 +curl -H "Authorization: token YOUR_GITHUB_TOKEN" \ + https://api.github.com/users/SuperMarioYL/packages/container/charts%2Fbison/versions + +# 或者尝试拉取(如果是公开的) +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.7 +``` + +## 检查是否需要认证 + +即使设置为 public,某些情况下可能仍需要认证: + +```bash +# 登录 GHCR +echo YOUR_GITHUB_TOKEN | helm registry login ghcr.io -u SuperMarioYL --password-stdin + +# 然后拉取 +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.7 +``` + +## 下一步 + +1. 先检查 v0.0.7 的 GitHub Actions workflow 日志 +2. 确认 Helm chart 推送步骤是否成功 +3. 如果推送成功,设置 package 为 public +4. 
如果推送失败,发布新版本 v0.0.8 来测试修复后的配置 From 06ba827e06582a2fd48bd92698a9e09c359960de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 28 Dec 2025 14:36:57 +0000 Subject: [PATCH 17/44] docs: add version 0.0.8 [skip ci] Auto-generated documentation version from release refs/tags/v0.0.8 - Added version 0.0.8 to versions.json - Created versioned_docs/version-0.0.8/ - Created versioned_sidebars/version-0.0.8-sidebars.json --- .../version-0.0.8/architecture.md | 418 ++++++++++++++++++ .../version-0.0.8/configuration.md | 361 +++++++++++++++ .../versioned_docs/version-0.0.8/features.md | 195 ++++++++ .../version-0.0.8/installation.md | 316 +++++++++++++ website/versioned_docs/version-0.0.8/intro.md | 167 +++++++ .../version-0.0.8/user-guides/_category_.json | 8 + .../version-0.0.8/user-guides/admin.md | 176 ++++++++ .../version-0.0.8/user-guides/developer.md | 187 ++++++++ .../version-0.0.8/user-guides/team-leader.md | 126 ++++++ .../version-0.0.8-sidebars.json | 8 + website/versions.json | 1 + 11 files changed, 1963 insertions(+) create mode 100644 website/versioned_docs/version-0.0.8/architecture.md create mode 100644 website/versioned_docs/version-0.0.8/configuration.md create mode 100644 website/versioned_docs/version-0.0.8/features.md create mode 100644 website/versioned_docs/version-0.0.8/installation.md create mode 100644 website/versioned_docs/version-0.0.8/intro.md create mode 100644 website/versioned_docs/version-0.0.8/user-guides/_category_.json create mode 100644 website/versioned_docs/version-0.0.8/user-guides/admin.md create mode 100644 website/versioned_docs/version-0.0.8/user-guides/developer.md create mode 100644 website/versioned_docs/version-0.0.8/user-guides/team-leader.md create mode 100644 website/versioned_sidebars/version-0.0.8-sidebars.json diff --git a/website/versioned_docs/version-0.0.8/architecture.md b/website/versioned_docs/version-0.0.8/architecture.md new file mode 100644 index 0000000..dbbe50d --- /dev/null +++ 
b/website/versioned_docs/version-0.0.8/architecture.md @@ -0,0 +1,418 @@ +--- +sidebar_position: 5 +--- + +# Architecture + +This document provides a technical overview of Bison's architecture, designed with high cohesion and low coupling principles for maintainability and scalability. + +## System Overview + +### High-Level Architecture + +```mermaid +graph TB + subgraph PRESENT[Presentation Layer] + WEB[Web UI
<br/>React 18 + Ant Design 5] + CLI[kubectl / API Client] + end + + subgraph GATEWAY[API Gateway Layer] + GW[API Server<br/>Go + Gin Framework] + AUTH[Auth Middleware<br/>JWT + OIDC] + end + + subgraph BUSINESS[Business Logic Layer] + TS[Tenant Service<br/>Team & Project CRUD] + BS[Billing Service<br/>Cost Calculation] + BLS[Balance Service<br/>Wallet Management] + QS[Quota Service<br/>Resource Limits] + AS[Alert Service<br/>Notifications] + RS[Report Service<br/>Analytics] + end + + subgraph INTEGRATION[Integration Layer] + K8S[Kubernetes Client<br/>client-go] + OCC[OpenCost Client<br/>REST API] + PC[Prometheus Client<br/>PromQL] + end + + subgraph EXTERNAL[External Systems] + KAPI[Kubernetes API] + CAP[Capsule Controller] + OC[OpenCost] + PROM[Prometheus] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps<br/>
Persistent Storage] + end + + WEB --> GW + CLI --> GW + GW --> AUTH + AUTH --> TS & BS & BLS & QS & AS & RS + + TS --> K8S + BS --> OCC + BLS --> K8S + QS --> K8S + RS --> OCC & PC + + K8S --> KAPI + K8S --> CAP + OCC --> OC + PC --> PROM + + TS & BLS --> CM + KAPI --> CM +``` + +### Design Principles + +| Principle | Implementation | +|-----------|----------------| +| **High Cohesion** | Each service handles a single domain (billing, quota, alerts) | +| **Low Coupling** | Services communicate via well-defined interfaces | +| **Stateless API** | All state persisted in Kubernetes ConfigMaps | +| **Cloud Native** | Leverages Kubernetes primitives for HA and scaling | +| **Zero Database** | ConfigMaps eliminate external database dependencies | + +## Architecture Layers + +Bison follows a layered architecture pattern: + +### 1. Presentation Layer +- **Web UI**: React 18 + TypeScript + Ant Design 5 +- **API Client**: REST API for external integrations + +### 2. API Gateway Layer +- **API Server**: Go + Gin framework +- **Authentication**: JWT and OIDC support +- **Middleware**: Logging, recovery, CORS handling + +### 3. Business Logic Layer +- **Tenant Service**: Team and project management +- **Billing Service**: Cost calculation and aggregation +- **Balance Service**: Wallet management and auto-deduction +- **Quota Service**: Resource limit enforcement +- **Alert Service**: Multi-channel notifications +- **Report Service**: Analytics and export + +### 4. Integration Layer +- **Kubernetes Client**: client-go for K8s API interaction +- **OpenCost Client**: REST API for cost data +- **Prometheus Client**: PromQL queries for metrics + +### 5. 
Data Layer +- **ConfigMaps**: Persistent storage for balances, billing config, and metadata +- **etcd**: Backing store via Kubernetes ConfigMaps + +## Core Components + +### API Server + +The API server is the central component that handles all HTTP requests: + +**Technology Stack:** +- Go 1.24+ +- Gin web framework +- client-go for Kubernetes API + +**Key Responsibilities:** +- Serve REST API endpoints +- Authentication and authorization +- Request routing and middleware +- Background task scheduling + +**Endpoints:** +``` +/api/v1/teams - Team management +/api/v1/projects - Project management +/api/v1/billing - Billing configuration +/api/v1/balance - Balance operations +/api/v1/stats - Statistics and reports +``` + +### Web UI + +React-based single-page application: + +**Technology Stack:** +- React 18 +- TypeScript +- Vite (build tool) +- Ant Design 5 +- ECharts (visualization) +- React Query (state management) + +**Features:** +- Dashboard with real-time metrics +- Team and project management +- Billing configuration +- Balance monitoring +- Usage reports and export + +### Billing Service + +Calculates costs based on resource usage: + +**Data Flow:** +```mermaid +sequenceDiagram + participant S as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant K8S as Kubernetes + + S->>BS: Trigger billing (every 10min) + BS->>OC: Query team costs + OC-->>BS: Return usage data + BS->>BS: Calculate cost + BS->>BLS: Deduct from balance + BLS->>K8S: Update ConfigMap + BLS-->>BS: Confirm + BS->>AS: Check threshold + alt Balance low + AS->>Webhook: Send alert + end +``` + +**Billing Formula:** +``` +Total Cost = (CPU_cores × CPU_price × hours) + + (Memory_GB × Memory_price × hours) + + (GPU_count × GPU_price × hours) +``` + +### Balance Service + +Manages team wallets and auto-deduction: + +**Features:** +- Real-time balance tracking +- Auto-deduction based on usage +- Recharge operations +- Transaction 
history +- Auto-suspension when balance depleted + +**Storage:** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: bison-team-balances + namespace: bison-system +data: + ml-team: "1523.45" + data-team: "890.12" + dev-team: "2100.00" +``` + +### Tenant Service + +Manages teams (Capsule Tenants) and projects (Namespaces): + +**Features:** +- Create/delete teams +- Assign resource quotas +- Configure node pools (shared/exclusive) +- Manage team metadata + +**Capsule Integration:** +```yaml +apiVersion: capsule.clastix.io/v1beta1 +kind: Tenant +metadata: + name: ml-team +spec: + owners: + - name: team-leader + kind: User + resourceQuota: + items: + - hard: + cpu: "20" + memory: 64Gi + nvidia.com/gpu: "4" +``` + +## Data Flow + +### Team Creation Flow + +```mermaid +sequenceDiagram + participant U as Admin + participant API as API Server + participant TS as Tenant Service + participant K8S as Kubernetes + participant CAP as Capsule + + U->>API: POST /api/v1/teams + API->>TS: CreateTeam(name, quota, balance) + TS->>CAP: Create Tenant + CAP-->>TS: Tenant created + TS->>K8S: Create ConfigMap (balance) + K8S-->>TS: ConfigMap created + TS-->>API: Success + API-->>U: 201 Created +``` + +### Billing Cycle Flow + +```mermaid +sequenceDiagram + participant SCH as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant AS as Alert Service + + loop Every 10 minutes + SCH->>BS: Trigger billing calculation + BS->>OC: Query costs (last 10min) + OC-->>BS: Return usage metrics + BS->>BS: Calculate total cost + BS->>BLS: Deduct cost from balance + BLS->>BLS: Update balance + BLS-->>BS: Balance updated + BS->>AS: Check balance threshold + alt Balance < 20% + AS->>AS: Send low balance alert + end + alt Balance <= 0 + AS->>BS: Suspend team workloads + end + end +``` + +## Integration Points + +### Kubernetes Integration + +Bison integrates deeply with Kubernetes: + +- **Capsule Tenants** for multi-tenancy +- 
**ResourceQuotas** for limit enforcement +- **Namespaces** for project isolation +- **ConfigMaps** for data persistence +- **RBAC** for access control + +### OpenCost Integration + +Real-time cost tracking via OpenCost API: + +```bash +# Query team costs +GET /allocation/compute?window=10m&aggregate=namespace&filter=namespace:ml-team + +# Response +{ + "ml-team": { + "cpuCost": 0.25, + "memCost": 0.10, + "gpuCost": 4.17, + "totalCost": 4.52 + } +} +``` + +### Prometheus Integration + +Metrics collection for monitoring: + +- Resource utilization metrics +- Cost metrics +- Balance metrics +- Alert metrics + +## Deployment Architecture + +### High Availability Setup + +```mermaid +graph TB + subgraph K8S[Kubernetes Cluster] + subgraph NS1[bison-system namespace] + API1[API Server Pod 1] + API2[API Server Pod 2] + WEB1[Web UI Pod 1] + WEB2[Web UI Pod 2] + end + + subgraph NS2[opencost-system namespace] + OC[OpenCost] + end + + subgraph NS3[prometheus-system namespace] + PROM[Prometheus] + end + + LB[LoadBalancer] + end + + LB --> API1 & API2 + LB --> WEB1 & WEB2 + API1 & API2 --> OC + OC --> PROM +``` + +### Resource Requirements + +**Minimum:** +- API Server: 200m CPU, 256Mi Memory +- Web UI: 100m CPU, 128Mi Memory + +**Recommended (Production):** +- API Server: 1000m CPU, 512Mi Memory (2 replicas) +- Web UI: 500m CPU, 256Mi Memory (2 replicas) + +## Security Model + +### Authentication +- JWT token-based authentication +- OIDC/SSO integration +- Admin user management + +### Authorization +- Kubernetes RBAC integration +- Role-based access control +- Team-scoped permissions + +### Data Security +- All data encrypted at rest (etcd encryption) +- TLS for API communication +- Secret management via Kubernetes Secrets + +## Technology Stack + +### Backend +- **Language**: Go 1.24+ +- **Framework**: Gin +- **Kubernetes Client**: client-go +- **Configuration**: Viper +- **Logging**: Logrus + +### Frontend +- **Framework**: React 18 +- **Language**: TypeScript +- **UI 
Library**: Ant Design 5 +- **Build Tool**: Vite +- **State Management**: React Query +- **Charts**: ECharts + +### Infrastructure +- **Platform**: Kubernetes 1.22+ +- **Multi-Tenancy**: Capsule +- **Cost Tracking**: OpenCost +- **Metrics**: Prometheus +- **Storage**: ConfigMaps (etcd) + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison +- [Configuration](configuration.md) - Configure billing +- [User Guides](user-guides/admin.md) - Learn to use Bison +- [Features](features.md) - Explore capabilities diff --git a/website/versioned_docs/version-0.0.8/configuration.md b/website/versioned_docs/version-0.0.8/configuration.md new file mode 100644 index 0000000..539aaa3 --- /dev/null +++ b/website/versioned_docs/version-0.0.8/configuration.md @@ -0,0 +1,361 @@ +--- +sidebar_position: 6 +--- + +# Configuration + +This guide covers how to configure Bison for your specific environment and requirements. + +## Helm Chart Configuration + +Bison is configured primarily through Helm values. You can customize the installation by providing a `values.yaml` file or using `--set` flags. 
+ +### Key Configuration Parameters + +| Parameter | Description | Default | Example | +|-----------|-------------|---------|---------| +| `auth.enabled` | Enable authentication | `false` | `true` | +| `auth.admin.username` | Admin username | `admin` | `admin` | +| `auth.admin.password` | Admin password | `admin` | `changeme` | +| `apiServer.replicaCount` | API server replicas | `2` | `3` | +| `apiServer.image.repository` | API server image | `ghcr.io/supermarioyl/bison/api-server` | - | +| `apiServer.image.tag` | API server image tag | `0.0.1` | `latest` | +| `webUI.replicaCount` | Web UI replicas | `2` | `3` | +| `webUI.image.repository` | Web UI image | `ghcr.io/supermarioyl/bison/web-ui` | - | +| `webUI.image.tag` | Web UI image tag | `0.0.1` | `latest` | +| `opencost.url` | OpenCost API endpoint | `http://opencost.opencost-system.svc:9003` | Custom URL | + +### Example Custom Values + +Create a `custom-values.yaml` file: + +```yaml +# Authentication +auth: + enabled: true + admin: + username: admin + password: MySecurePassword123 + +# API Server +apiServer: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 512Mi + +# Web UI +webUI: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +# OpenCost Integration +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Node Selection (optional) +nodeSelector: + node-role.kubernetes.io/control-plane: "" + +# Tolerations (optional) +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule +``` + +Install with custom values: + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --values custom-values.yaml +``` + +## Billing Configuration + +Billing settings are configured through the Web UI or API after installation. + +### Access Billing Configuration + +1. 
**Via Web UI:** + - Navigate to **Settings** > **Billing Configuration** + - Set pricing for CPU, Memory, GPU, and other resources + - Configure currency and billing intervals + +2. **Via API:** + ```bash + curl -X POST http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50 + }, + "billingInterval": "hourly" + }' + ``` + +### Billing Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `enabled` | Enable/disable billing | `true` | +| `currency` | Currency for billing | `USD`, `CNY`, `EUR` | +| `pricing.cpu` | CPU price per core-hour | `0.05` | +| `pricing.memory` | Memory price per GB-hour | `0.01` | +| `pricing["nvidia.com/gpu"]` | GPU price per GPU-hour | `2.50` | +| `billingInterval` | Billing aggregation period | `hourly`, `daily` | +| `lowBalanceThreshold` | Warning threshold (%) | `20` | +| `suspendThreshold` | Auto-suspend threshold (%) | `5` | + +### Example Billing Configuration + +```json +{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "nvidia.com/mig-1g.5gb": 0.50, + "nvidia.com/mig-2g.10gb": 1.00 + }, + "billingInterval": "hourly", + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Team Configuration + +### Creating Teams + +Teams can be created through the Web UI or API: + +**Via Web UI:** +1. Navigate to **Teams** page +2. Click **Create Team** +3. 
Set team name, quota, and initial balance + +**Via API:** +```bash +curl -X POST http://localhost:8080/api/v1/teams \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-team", + "description": "Machine Learning Team", + "quota": { + "cpu": "20", + "memory": "64Gi", + "nvidia.com/gpu": "4" + }, + "balance": 1000.00 + }' +``` + +### Team Quotas + +Team quotas define resource limits: + +```yaml +quota: + cpu: "20" # 20 CPU cores + memory: "64Gi" # 64 GB RAM + nvidia.com/gpu: "4" # 4 GPUs + storage: "500Gi" # 500 GB storage +``` + +### Team Balance Management + +Set initial balance and configure auto-recharge: + +```json +{ + "balance": 1000.00, + "autoRecharge": { + "enabled": true, + "amount": 500.00, + "schedule": "monthly", + "threshold": 100.00 + } +} +``` + +## Alert Configuration + +Configure multi-channel alerts for low balance and quota warnings. + +### Webhook Alerts + +```json +{ + "type": "webhook", + "enabled": true, + "url": "https://your-webhook-endpoint.com/alerts", + "headers": { + "Authorization": "Bearer YOUR_TOKEN" + }, + "template": { + "title": "Bison Alert", + "message": "Team {{.TeamName}} balance is {{.Balance}}" + } +} +``` + +### DingTalk Alerts + +```json +{ + "type": "dingtalk", + "enabled": true, + "webhook": "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN", + "secret": "YOUR_SECRET" +} +``` + +### WeChat Work Alerts + +```json +{ + "type": "wechat", + "enabled": true, + "corpid": "YOUR_CORP_ID", + "corpsecret": "YOUR_CORP_SECRET", + "agentid": 1000001 +} +``` + +## OpenCost Integration + +Configure OpenCost connection: + +### Check OpenCost Connectivity + +```bash +# Test OpenCost API +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz + +# Test allocation API +curl http://localhost:9003/allocation/compute?window=1d +``` + +### Update OpenCost URL + +If OpenCost is deployed in a different namespace or with a different service name: + +```bash +helm upgrade bison 
bison/bison \ + --set opencost.url=http://my-opencost.custom-namespace.svc:9003 \ + --namespace bison-system +``` + +## Authentication & OIDC + +Enable authentication and integrate with your SSO provider: + +### Basic Authentication + +```yaml +auth: + enabled: true + admin: + username: admin + password: SecurePassword123 +``` + +### OIDC Integration + +```yaml +auth: + enabled: true + oidc: + enabled: true + issuerURL: https://your-oidc-provider.com + clientID: bison-client-id + clientSecret: your-client-secret + redirectURL: https://bison.example.com/callback +``` + +## Environment Variables + +Additional configuration can be provided via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `KUBECONFIG` | Path to kubeconfig file | In-cluster config | +| `OPENCOST_URL` | OpenCost API URL | `http://opencost.opencost-system.svc:9003` | +| `AUTH_ENABLED` | Enable authentication | `false` | +| `LOG_LEVEL` | Logging level | `info` | +| `BILLING_INTERVAL` | Billing calculation interval | `10m` | + +Set environment variables in Helm values: + +```yaml +apiServer: + env: + - name: LOG_LEVEL + value: debug + - name: BILLING_INTERVAL + value: 5m +``` + +## Advanced Configuration + +### Custom Resource Pricing + +Price any Kubernetes resource: + +```json +{ + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "amd.com/gpu": 2.00, + "ephemeral-storage": 0.001, + "custom.io/fpga": 5.00 + } +} +``` + +### Multi-Cluster Support + +Deploy Bison in each cluster with shared billing: + +```yaml +# Cluster A +apiServer: + clusterName: prod-us-west + +# Cluster B +apiServer: + clusterName: prod-us-east +``` + +## Next Steps + +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design +- [Features](features.md) - Explore all capabilities diff --git a/website/versioned_docs/version-0.0.8/features.md 
b/website/versioned_docs/version-0.0.8/features.md new file mode 100644 index 0000000..8d0c438 --- /dev/null +++ b/website/versioned_docs/version-0.0.8/features.md @@ -0,0 +1,195 @@ +--- +sidebar_position: 2 +--- + +# Features + +Bison provides a comprehensive suite of features for GPU resource management, billing, and multi-tenant isolation in Kubernetes environments. + +## See Bison in Action + +### 🎯 Real-Time Resource Dashboard + +![Bison Dashboard](/img/ui-dashboard.png) + +**What you see:** +- **Cluster Overview** - Total teams, projects, resource pools, and quotas at a glance +- **Resource Utilization** - Visual breakdown showing which teams are consuming resources +- **7-Day Cost Trends** - Historical cost data to identify spending patterns +- **Top 5 Cost Rankings** - Quickly identify heavy GPU consumers +- **Team Budget Status** - Real-time balance monitoring with color-coded alerts + +**Who benefits:** +- **Platform Administrators** get instant visibility into cluster health and usage patterns +- **Finance Teams** can track costs in real-time without waiting for monthly reports +- **Team Leaders** can compare their usage against other teams + +--- + +### 💼 Team Management & Budget Monitoring + +![Team Management](/img/ui-team.png) + +**What you see:** +- **Team List** with real-time status indicators: + - 🟢 Green balance = Healthy budget + - 🟡 Yellow balance = Approaching threshold + - 🔴 Red balance = Low balance or suspended +- **Resource Allocation** - CPU/Memory/GPU quotas per team (e.g., "cpu 0/10" means 0 used out of 10 allocated) +- **Project Count** - Number of namespaces/projects under each team +- **Quick Actions** - Edit quotas, recharge balance, or delete team with one click + +**Who benefits:** +- **Team Leaders** monitor their budget status and resource usage at a glance +- **Administrators** manage multiple teams from a single unified view +- **Finance Teams** see which teams need recharging + +--- + +### 💰 Flexible Billing Configuration + 
+![Billing Configuration](/img/ui-billing.png) + +**What you see:** +- **Per-Resource Pricing** - Set custom prices for CPU (per core-hour), Memory (per GB-hour), GPU (per GPU-hour) +- **Currency Selection** - Support for CNY, USD, EUR, and other currencies +- **Enable/Disable Toggle** - Turn billing on/off for specific resources with one click +- **Billing Rules** - Define how resources are metered (hourly, daily, etc.) +- **Alert Thresholds** - Configure when to send low-balance warnings + +**Who benefits:** +- **Finance Teams** align cloud costs with internal chargeback policies +- **Administrators** adjust pricing based on actual hardware costs +- **Budget Managers** set appropriate warning thresholds to prevent overruns + +--- + +## Core Capabilities + +### Multi-Tenant Management +✅ **Capsule-Powered Isolation** - True multi-tenancy using Kubernetes-native Capsule operator +✅ **OIDC Integration** - Enterprise SSO support for authentication +✅ **Team-Based Access Control** - Manage users, roles, and permissions per team +✅ **Shared & Exclusive Node Pools** - Flexible resource allocation strategies + +### Real-Time Billing +✅ **Usage-Based Billing** - Accurate cost tracking based on actual resource consumption +✅ **Configurable Pricing** - Set custom rates for CPU, Memory, GPU, and any Kubernetes resource +✅ **Multi-Currency Support** - CNY, USD, EUR, and more +✅ **Billing Rules Engine** - Define custom billing logic and aggregation periods + +### Dynamic Resource Quotas +✅ **Per-Team Quotas** - CPU, Memory, GPU, Storage, and custom resources +✅ **Namespace Quotas** - Project-level resource limits within teams +✅ **Auto-Enforcement** - Kubernetes-native quota enforcement +✅ **Quota Alerts** - Notifications when approaching limits + +### Team Balance & Wallet System +✅ **Prepaid Balances** - Team wallets with real-time deduction +✅ **Auto-Deduction** - Automated billing based on resource usage +✅ **Balance Thresholds** - Configurable warning and suspension 
levels +✅ **Transaction History** - Complete audit trail of all balance changes + +### Auto-Recharge +✅ **Scheduled Top-Ups** - Weekly or monthly automatic recharges +✅ **Custom Amounts** - Flexible recharge amounts per team +✅ **Recharge Notifications** - Alert teams when balance is added + +### Balance Alerts +✅ **Multi-Channel Notifications** - Webhook, DingTalk, WeChat, Email +✅ **Configurable Thresholds** - Set warning levels (e.g., 20%, 10%, 5%) +✅ **Auto-Suspension** - Automatically suspend workloads when balance depleted +✅ **Custom Templates** - Customize alert messages + +### Usage Reports +✅ **Team Analytics** - Per-team cost breakdowns and trends +✅ **Project Analytics** - Namespace-level resource consumption +✅ **Export Capabilities** - CSV, Excel, PDF reports +✅ **Historical Data** - 30/60/90-day cost analysis + +### Audit Logging +✅ **Complete Operation History** - Track all administrative actions +✅ **User Attribution** - Who did what and when +✅ **Resource Changes** - Track quota, balance, and configuration changes +✅ **Compliance Ready** - Meet internal audit requirements + +--- + +## Architecture Highlights + +Bison's architecture is designed for simplicity, scalability, and zero external dependencies. + +```mermaid +graph TB + subgraph USER_LAYER[User Layer] + UI[Web UI
<br/>React + Ant Design] + API[REST API<br/>Go + Gin] + end + + subgraph CORE[Core Services] + BS[Billing Service] + TS[Tenant Service] + QS[Quota Service] + end + + subgraph K8S[Kubernetes Layer] + CA[Capsule<br/>Multi-Tenancy] + OC[OpenCost<br/>Cost Tracking] + PR[Prometheus<br/>Metrics] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps<br/>
Zero Database] + end + + UI --> API + API --> BS & TS & QS + BS --> OC + TS --> CA + QS --> CA + BS & TS --> CM + OC --> PR +``` + +### Key Architectural Benefits + +- **Zero External Dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) +- **Cloud-Native** - Built on Kubernetes primitives for maximum portability +- **Scalable** - Stateless API server that can scale horizontally +- **Secure** - Kubernetes RBAC integration and optional authentication +- **Observable** - Prometheus metrics and structured logging +- **Extensible** - Plugin architecture for custom billing rules and alerts + +--- + +## Integration Points + +### OpenCost Integration +Bison leverages [OpenCost](https://www.opencost.io/) for real-time cost tracking: +- Per-pod, per-namespace, per-team cost visibility +- GPU utilization metrics +- Historical cost data and trends +- Integration with Prometheus for metric collection + +### Capsule Integration +Bison uses [Capsule](https://capsule.clastix.io/) for multi-tenancy: +- Team-based tenant isolation +- Namespace quota enforcement +- Network and security policies +- OIDC/SSO integration + +### Prometheus Integration +Metrics collection and monitoring: +- Resource utilization tracking +- Custom billing metrics +- Alert rule evaluation +- Historical data retention + +--- + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison in your cluster +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Deep dive into system design +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.8/installation.md b/website/versioned_docs/version-0.0.8/installation.md new file mode 100644 index 0000000..da89191 --- /dev/null +++ b/website/versioned_docs/version-0.0.8/installation.md @@ -0,0 +1,316 @@ +--- +sidebar_position: 3 +--- + +# Installation Guide + +This guide provides detailed instructions for installing Bison in your 
Kubernetes cluster. + +## Prerequisites + +Before installing Bison, ensure you have: + +- **Kubernetes 1.22+** - A running Kubernetes cluster +- **kubectl** - Configured to access your cluster +- **Helm 3.0+** - Package manager for Kubernetes +- **Capsule Operator v0.1.0+** - For multi-tenant isolation +- **OpenCost** - Deployed with Prometheus for cost tracking + +### Install Prerequisites + +If you haven't installed the required components: + +#### Install Capsule + +```bash +# Using Helm +helm repo add projectcapsule https://projectcapsule.github.io/charts +helm install capsule projectcapsule/capsule \ + --namespace capsule-system \ + --create-namespace +``` + +#### Install OpenCost + +```bash +# Using Helm +helm repo add opencost https://opencost.github.io/opencost-helm-chart +helm install opencost opencost/opencost \ + --namespace opencost-system \ + --create-namespace \ + --set prometheus.internal.serviceName=prometheus-server \ + --set prometheus.internal.namespaceName=prometheus-system +``` + +## Installation Methods + +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the modern OCI format. 
+ +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +### Option A: From GHCR (Recommended) + +The simplest way to install Bison is directly from GitHub Container Registry: + +```bash +# Install specific version from GHCR +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace + +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace + +# Customize installation +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true \ + --set apiServer.image.tag=0.0.2 \ + --set webUI.image.tag=0.0.2 +``` + +**Why GHCR OCI Format?** +- ✅ No separate Helm repository maintenance needed +- ✅ Unified with Docker images in GHCR +- ✅ Faster installation (direct registry pull) +- ✅ Modern Helm 3.8+ standard practice + +### Option B: From GitHub Release + +Download a specific version from GitHub Releases: + +```bash +# Download Helm chart +VERSION=0.0.2 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install the chart +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +### Option C: From Source + +Clone and build from source: + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +## Configuration Options + +Bison can be configured using Helm values. 
Here are the key configuration options: + +### Basic Configuration + +```yaml +# values.yaml +apiServer: + image: + repository: ghcr.io/supermarioyl/bison/api-server + tag: 0.0.1 + replicas: 2 + +webUI: + image: + repository: ghcr.io/supermarioyl/bison/web-ui + tag: 0.0.1 + replicas: 2 + +# OpenCost URL +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Authentication +auth: + enabled: false +``` + +### Custom Configuration Example + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set apiServer.replicas=3 \ + --set webUI.replicas=3 \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true +``` + +## Verify Installation + +After installation, verify that all components are running: + +```bash +# Check pod status +kubectl get pods -n bison-system + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# bison-api-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +# bison-webui-xxxxxxxxx-xxxxx 1/1 Running 0 2m + +# Check services +kubectl get svc -n bison-system + +# Check logs +kubectl logs -n bison-system deployment/bison-api-server +kubectl logs -n bison-system deployment/bison-webui +``` + +## Access the Platform + +### Port Forward (Development) + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +``` + +### Ingress (Production) + +For production deployments, configure an Ingress: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: bison-ingress + namespace: bison-system + annotations: + kubernetes.io/ingress.class: nginx +spec: + rules: + - host: bison.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bison-webui + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: bison-api-server + port: + number: 8080 +``` + +Apply the Ingress: + +```bash +kubectl apply -f ingress.yaml +``` + 
+## Docker Images + +Bison images are available on GitHub Container Registry: + +```bash +# Pull images +docker pull ghcr.io/supermarioyl/bison/api-server:0.0.1 +docker pull ghcr.io/supermarioyl/bison/web-ui:0.0.1 + +# Or use latest +docker pull ghcr.io/supermarioyl/bison/api-server:latest +docker pull ghcr.io/supermarioyl/bison/web-ui:latest +``` + +**Supported Platforms:** +- `linux/amd64` +- `linux/arm64` + +## Upgrading + +To upgrade Bison to a new version: + +```bash +# Update Helm repository +helm repo update + +# Upgrade to latest version +helm upgrade bison bison/bison --namespace bison-system + +# Or upgrade to specific version +helm upgrade bison bison/bison --version 0.0.2 --namespace bison-system +``` + +## Uninstalling + +To completely remove Bison: + +```bash +# Uninstall Helm release +helm uninstall bison --namespace bison-system + +# Remove namespace (optional) +kubectl delete namespace bison-system +``` + +## Troubleshooting + +### Pod Not Starting + +Check pod logs for errors: + +```bash +kubectl logs -n bison-system deployment/bison-api-server +kubectl describe pod -n bison-system +``` + +### Cannot Connect to OpenCost + +Verify OpenCost is running and accessible: + +```bash +kubectl get svc -n opencost-system +kubectl port-forward -n opencost-system svc/opencost 9003:9003 + +# Test endpoint +curl http://localhost:9003/healthz +``` + +### Authentication Issues + +If authentication is enabled, ensure you have the correct credentials: + +```bash +# Default credentials (change in production!) 
+Username: admin +Password: admin +``` + +## Next Steps + +- [Configuration Guide](configuration.md) - Configure billing and settings +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design diff --git a/website/versioned_docs/version-0.0.8/intro.md b/website/versioned_docs/version-0.0.8/intro.md new file mode 100644 index 0000000..b3dc3aa --- /dev/null +++ b/website/versioned_docs/version-0.0.8/intro.md @@ -0,0 +1,167 @@ +--- +sidebar_position: 1 +slug: / +--- + +# Introduction to Bison + +![Bison Logo](/img/logo.png) + +**Enterprise GPU Resource Billing & Multi-Tenant Management Platform** + +Bison is a Kubernetes-based platform that provides comprehensive GPU resource management, billing, and multi-tenant isolation for organizations running shared GPU clusters. + +## The GPU Management Challenge + +Managing shared GPU clusters across multiple teams creates critical operational and financial challenges: + +**For Platform Administrators:** +- How do you fairly allocate expensive GPU resources across competing teams? +- How do you prevent resource hogging while ensuring everyone gets their fair share? +- How do you track who's using what and implement accurate chargeback? +- How do you maintain strict multi-tenant isolation without complex manual configuration? + +**For Finance & Budget Teams:** +- How do you implement automated chargeback for GPU usage without manual accounting? +- How do you prevent budget overruns before they happen? +- How do you generate accurate cost reports for internal billing? + +**For Development Teams:** +- How do you get predictable, isolated access to GPU resources? +- How do you know when you're approaching your budget limits? +- How do you avoid impacting other teams' workloads? 
+ +**Traditional Approach:** +- Manual quota configuration per namespace +- Excel-based billing calculations +- No real-time cost visibility +- Complex multi-tool setup (quota management + cost tracking + billing system) +- Frequent resource conflicts and budget surprises + +## Bison's Integrated Solution + +```mermaid +graph TB + subgraph WITHOUT["Without Bison"] + P1[❌ Manual Quota Management<br/>Per-namespace configuration] + P2[❌ Spreadsheet Billing<br/>Manual calculations & reports] + P3[❌ No Resource Isolation<br/>Teams compete for resources] + P4[❌ Budget Overruns<br/>No proactive alerts] + P5[❌ Complex Tooling<br/>Multiple systems to manage] + end + + subgraph WITH["With Bison"] + S1[✅ Automated Team Quotas<br/>Capsule-powered isolation] + S2[✅ Real-Time Billing<br/>OpenCost integration] + S3[✅ True Multi-Tenancy<br/>Shared/Exclusive modes] + S4[✅ Proactive Alerts<br/>Balance monitoring & auto-suspend] + S5[✅ Unified Platform<br/>Single pane of glass] + end + + P1 -.Transform.-> S1 + P2 -.Transform.-> S2 + P3 -.Transform.-> S3 + P4 -.Transform.-> S4 + P5 -.Transform.-> S5 + + style WITHOUT fill:#ffebee + style WITH fill:#e8f5e9 + style S1 fill:#4caf50,color:#fff + style S2 fill:#4caf50,color:#fff + style S3 fill:#4caf50,color:#fff + style S4 fill:#4caf50,color:#fff + style S5 fill:#4caf50,color:#fff +``` + +**Bison combines:** +- 🔐 **Kubernetes-native multi-tenancy** (Capsule) - True team isolation with shared or exclusive node pools +- 💰 **Real-time cost tracking** (OpenCost + Prometheus) - Per-pod, per-namespace, per-team cost visibility +- 💳 **Automated billing & budgets** - Prepaid balances, auto-deduction, low-balance alerts, and auto-suspension +- 📊 **Unified dashboard** - Single interface for admins, team leaders, and finance teams +- 🔧 **Zero external dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) + +**Result:** Deploy once, get complete GPU resource management with automated billing in under 30 minutes.
+ +## Quick Start + +### Prerequisites + +- Kubernetes 1.22+ +- kubectl configured +- Helm 3.0+ +- Capsule operator (v0.1.0+) installed +- OpenCost deployed with Prometheus + +### Installation + +Choose one of the following installation methods: + +#### Option A: Using Helm Repository (Recommended) + +```bash +# Add Bison Helm repository +helm repo add bison https://supermarioyl.github.io/Bison/charts/ +helm repo update + +# Install with default configuration +helm install bison bison/bison --namespace bison-system --create-namespace + +# Or customize installation +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=false +``` + +#### Option B: From GitHub Release + +```bash +# Download latest Helm chart +VERSION=0.0.1 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +#### Option C: From Source + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +### Access the Platform + +After installation, access Bison through: + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +# Default credentials (if auth enabled): +# Username: admin +# Password: admin (change immediately in production!) 
+``` + +## Next Steps + +- [Explore Features](features.md) - Learn about all capabilities +- [Installation Guide](installation.md) - Detailed installation instructions +- [User Guides](user-guides/admin.md) - Role-based user guides +- [Architecture](architecture.md) - Understand the system architecture +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.8/user-guides/_category_.json b/website/versioned_docs/version-0.0.8/user-guides/_category_.json new file mode 100644 index 0000000..fe79f61 --- /dev/null +++ b/website/versioned_docs/version-0.0.8/user-guides/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "User Guides", + "position": 4, + "link": { + "type": "generated-index", + "description": "Role-based guides for using Bison effectively." + } +} diff --git a/website/versioned_docs/version-0.0.8/user-guides/admin.md b/website/versioned_docs/version-0.0.8/user-guides/admin.md new file mode 100644 index 0000000..a466cbb --- /dev/null +++ b/website/versioned_docs/version-0.0.8/user-guides/admin.md @@ -0,0 +1,176 @@ +--- +sidebar_position: 1 +--- + +# Administrator Guide + +This guide is for platform administrators who deploy, configure, and manage the Bison platform. + +## Responsibilities + +As a platform administrator, you are responsible for: + +- ✅ Deploying and configuring Bison +- ✅ Creating and managing teams +- ✅ Setting global billing configuration +- ✅ Monitoring cluster-wide metrics +- ✅ Responding to alerts and recharge requests + +## Getting Started + +### 1. Deploy Bison + +Follow the [Installation Guide](../installation.md) to deploy Bison in your Kubernetes cluster. + +### 2. Configure Billing + +Set up billing rules and pricing: + +1. Access the Web UI +2. Navigate to **Settings** > **Billing Configuration** +3. Configure: + - **Currency**: USD, CNY, EUR, etc. + - **CPU Price**: Cost per core-hour + - **Memory Price**: Cost per GB-hour + - **GPU Price**: Cost per GPU-hour +4. 
Click **Save** + +### 3. Create First Team + +Create a team for your users: + +1. Navigate to **Teams** page +2. Click **Create Team** +3. Fill in: + - **Team Name**: e.g., "ml-team" + - **Description**: Team purpose + - **Resource Quota**: + - CPU: e.g., "20" cores + - Memory: e.g., "64Gi" + - GPU: e.g., "4" + - **Initial Balance**: e.g., 1000.00 +4. Click **Create** + +## Common Tasks + +### Managing Teams + +#### View All Teams + +```bash +# Via kubectl +kubectl get tenants + +# Via API +curl http://localhost:8080/api/v1/teams +``` + +#### Update Team Quota + +1. Navigate to **Teams** page +2. Click **Edit** on the team row +3. Modify quotas +4. Click **Save** + +#### Recharge Team Balance + +1. Navigate to **Teams** page +2. Click **Recharge** on the team row +3. Enter amount +4. Add notes (optional) +5. Click **Confirm** + +### Monitoring + +#### View Dashboard + +Access real-time cluster metrics: +- Total teams and projects +- Resource utilization +- Cost trends +- Top consumers +- Balance status + +#### Check Alerts + +Monitor low-balance and quota alerts: +1. Navigate to **Alerts** page +2. Review active alerts +3. 
Take action as needed + +### Billing Configuration + +#### Update Pricing + +```bash +curl -X PUT http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "pricing": { + "cpu": 0.06, + "memory": 0.012, + "nvidia.com/gpu": 3.00 + } + }' +``` + +#### Configure Alert Thresholds + +```json +{ + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Best Practices + +### Team Naming +- Use lowercase, alphanumeric characters and hyphens +- Example: `ml-team`, `data-science`, `dev-team` + +### Quota Allocation +- Start with conservative quotas +- Monitor usage for 1-2 weeks +- Adjust based on actual needs + +### Balance Management +- Set up auto-recharge for critical teams +- Monitor balance trends weekly +- Respond to low-balance alerts promptly + +### Security +- Enable authentication in production +- Use OIDC/SSO for enterprise deployments +- Regularly audit user permissions + +## Troubleshooting + +### Team Creation Failed + +Check Capsule operator logs: +```bash +kubectl logs -n capsule-system deployment/capsule-controller-manager +``` + +### Billing Not Working + +Verify OpenCost connectivity: +```bash +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz +``` + +### High Resource Usage + +Check resource consumption: +```bash +kubectl top pods -n bison-system +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Guide for team leaders +- [Developer Guide](developer.md) - Guide for developers +- [Configuration](../configuration.md) - Advanced configuration diff --git a/website/versioned_docs/version-0.0.8/user-guides/developer.md b/website/versioned_docs/version-0.0.8/user-guides/developer.md new file mode 100644 index 0000000..d14ab62 --- /dev/null +++ b/website/versioned_docs/version-0.0.8/user-guides/developer.md @@ -0,0 +1,187 @@ +--- +sidebar_position: 3 +--- + +# Developer Guide + +This guide is for developers who 
deploy workloads and consume resources within team projects. + +## Responsibilities + +As a developer, you are responsible for: + +- ✅ Deploying applications within your project +- ✅ Monitoring resource usage +- ✅ Staying within quota limits +- ✅ Optimizing resource consumption + +## Getting Started + +### 1. Get Kubeconfig + +Request kubeconfig from your team leader or administrator. + +### 2. Set Context + +```bash +# Set context to your project namespace +kubectl config set-context --current --namespace=your-project + +# Verify +kubectl config view --minify | grep namespace +``` + +### 3. Check Quota + +See your available resources: +```bash +kubectl describe quota +``` + +## Deploying Workloads + +### Basic Pod Deployment + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-training-job + namespace: your-project +spec: + containers: + - name: trainer + image: your-ml-image:latest + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" +``` + +### Using Deployments + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ml-inference + namespace: your-project +spec: + replicas: 2 + selector: + matchLabels: + app: ml-inference + template: + metadata: + labels: + app: ml-inference + spec: + containers: + - name: inference + image: your-inference-image:latest + resources: + requests: + cpu: "2" + memory: "8Gi" + nvidia.com/gpu: "1" +``` + +## Monitoring Usage + +### Check Pod Resource Usage + +```bash +# View resource consumption +kubectl top pods + +# Detailed pod information +kubectl describe pod +``` + +### View Logs + +```bash +# Stream logs +kubectl logs -f + +# Previous logs (if pod restarted) +kubectl logs --previous +``` + +## Best Practices + +### Resource Requests and Limits + +Always specify both requests and limits: +```yaml +resources: + requests: + cpu: "2" + memory: "8Gi" + limits: + cpu: "4" + memory: "16Gi" +``` + +### GPU Usage + +- Request GPUs only 
when needed +- Use GPU for compute-intensive tasks +- Monitor GPU utilization + +### Clean Up + +Delete resources when no longer needed: +```bash +# Delete pod +kubectl delete pod <pod-name> + +# Delete deployment +kubectl delete deployment <deployment-name> + +# Clean up completed jobs +kubectl delete job --field-selector status.successful=1 +``` + +### Cost Optimization + +- Right-size your resource requests +- Use horizontal pod autoscaling +- Clean up idle resources +- Share GPUs when possible (if supported) + +## Troubleshooting + +### Pod Pending (Insufficient Quota) + +If your pod is stuck in `Pending` state: + +```bash +kubectl describe pod <pod-name> +``` + +Look for quota-related errors and reduce resource requests or ask your team leader for more quota. + +### Out of Memory (OOM) + +If pods are killed due to OOM: +1. Check memory usage patterns +2. Increase memory limits +3. Optimize application memory usage + +### GPU Not Available + +Verify GPU requests: +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable."nvidia\.com/gpu" +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Understand team management +- [Architecture](../architecture.md) - Learn about the platform diff --git a/website/versioned_docs/version-0.0.8/user-guides/team-leader.md b/website/versioned_docs/version-0.0.8/user-guides/team-leader.md new file mode 100644 index 0000000..7b6de96 --- /dev/null +++ b/website/versioned_docs/version-0.0.8/user-guides/team-leader.md @@ -0,0 +1,126 @@ +--- +sidebar_position: 2 +--- + +# Team Leader Guide + +This guide is for team leaders who manage projects, monitor budgets, and allocate resources within their team. + +## Responsibilities + +As a team leader, you are responsible for: + +- ✅ Creating and managing projects (namespaces) +- ✅ Allocating quotas to projects +- ✅ Monitoring team balance and consumption +- ✅ Requesting recharges when needed + +## Getting Started + +### 1. Access Bison + +Log in to the Web UI with your credentials.
+ +### 2. View Team Dashboard + +Your dashboard shows: +- Team balance and status +- Resource utilization +- Active projects +- Cost trends + +## Managing Projects + +### Create a Project + +1. Navigate to **Projects** page +2. Click **Create Project** +3. Fill in: + - **Project Name**: e.g., "training-ml-models" + - **Description**: Project purpose + - **Quota** (optional): + - CPU: e.g., "8" cores + - Memory: e.g., "32Gi" + - GPU: e.g., "2" +4. Click **Create** + +### List Projects + +```bash +# Via kubectl (if you have access) +kubectl get namespaces -l capsule.clastix.io/tenant=your-team + +# Via API +curl http://localhost:8080/api/v1/teams/your-team/projects +``` + +### Delete a Project + +1. Navigate to **Projects** page +2. Click **Delete** on the project row +3. Confirm deletion + +**Warning**: This will delete all resources in the project! + +## Monitoring Budget + +### Check Balance + +View your current balance: +1. Navigate to **Team** page +2. See balance in the status card + +### View Usage Trends + +Analyze spending patterns: +1. Navigate to **Reports** page +2. Select time range (7 days, 30 days, 90 days) +3. View: + - Cost breakdown by resource type + - Daily cost trends + - Per-project consumption + +### Request Recharge + +When balance is low: +1. Click **Request Recharge** button +2. Enter requested amount +3. Add justification +4. 
Submit request to administrator + +## Resource Management + +### Monitor Quota Usage + +Check how much of your quota is being used: +```bash +kubectl describe quota -n your-project +``` + +### Optimize Costs + +Tips to reduce spending: +- **Right-size resources**: Don't over-provision CPU/Memory +- **Clean up idle pods**: Delete unused workloads +- **Use spot/preemptible instances**: Where applicable +- **Monitor GPU utilization**: Ensure GPUs are fully utilized + +## Best Practices + +### Project Organization +- Create separate projects for different workloads +- Example: `ml-training`, `ml-inference`, `data-processing` + +### Quota Allocation +- Allocate quotas based on project priority +- Reserve buffer for urgent tasks + +### Cost Awareness +- Review costs weekly +- Identify and eliminate waste +- Set up cost alerts + +## Next Steps + +- [Developer Guide](developer.md) - Guide for your team members +- [Features](../features.md) - Explore all Bison features diff --git a/website/versioned_sidebars/version-0.0.8-sidebars.json b/website/versioned_sidebars/version-0.0.8-sidebars.json new file mode 100644 index 0000000..caea0c0 --- /dev/null +++ b/website/versioned_sidebars/version-0.0.8-sidebars.json @@ -0,0 +1,8 @@ +{ + "tutorialSidebar": [ + { + "type": "autogenerated", + "dirName": "." 
+ } + ] +} diff --git a/website/versions.json b/website/versions.json index 297a5c4..2b4b0b6 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,4 +1,5 @@ [ + "0.0.8", "0.0.7", "0.0.6", "0.0.1" From 66989a8c1b61d2c18529d21118349be5998d45a9 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 22:41:32 +0800 Subject: [PATCH 18/44] fix: use lowercase username for GHCR --- .github/workflows/release.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b4e8963..b8d6034 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -84,8 +84,8 @@ jobs: - name: Set GHCR organization id: ghcr run: | - # Use custom GHCR organization name - echo "owner=lei6393" >> $GITHUB_OUTPUT + # Convert GitHub username to lowercase for GHCR + echo "owner=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT - name: Log in to GitHub Container Registry uses: docker/login-action@v3 @@ -182,10 +182,10 @@ jobs: #### Method 1: From GHCR (Recommended) \`\`\`bash # Install directly from GitHub Container Registry - helm install my-bison oci://ghcr.io/lei6393/charts/bison --version ${VERSION} + helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version ${VERSION} # Or pull first, then install - helm pull oci://ghcr.io/lei6393/charts/bison --version ${VERSION} + helm pull oci://ghcr.io/supermarioyl/charts/bison --version ${VERSION} helm install my-bison bison-${VERSION}.tgz \`\`\` @@ -198,8 +198,8 @@ jobs: ### 🐳 Docker Images \`\`\`bash - docker pull ghcr.io/lei6393/bison/api-server:${VERSION} - docker pull ghcr.io/lei6393/bison/web-ui:${VERSION} + docker pull ghcr.io/supermarioyl/bison/api-server:${VERSION} + docker pull ghcr.io/supermarioyl/bison/web-ui:${VERSION} \`\`\` ### 📦 What's Changed @@ -259,5 +259,6 @@ jobs: env: VERSION: ${{ needs.prepare.outputs.version }} run: | - # Use custom GHCR 
organization - helm push bison-${VERSION}.tgz oci://ghcr.io/lei6393/charts + # Convert GitHub username to lowercase for GHCR + REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + helm push bison-${VERSION}.tgz oci://ghcr.io/${REPO_OWNER}/charts From caee2ce52f2f11b4405281fb72d224f221526f69 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sun, 28 Dec 2025 22:51:49 +0800 Subject: [PATCH 19/44] docs: add OCI annotations and improve Helm chart README --- deploy/charts/bison/Chart.yaml | 5 +++++ deploy/charts/bison/README.md | 30 ++++++++++++++++++------------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/deploy/charts/bison/Chart.yaml b/deploy/charts/bison/Chart.yaml index 4a36d93..1e97232 100644 --- a/deploy/charts/bison/Chart.yaml +++ b/deploy/charts/bison/Chart.yaml @@ -14,6 +14,11 @@ keywords: - cost-management maintainers: - name: Bison Team +annotations: + org.opencontainers.image.source: https://github.com/SuperMarioYL/Bison + org.opencontainers.image.description: "Bison Helm Chart - GPU资源计费与多租户管理平台" + org.opencontainers.image.documentation: https://bison.lei6393.com + org.opencontainers.image.usage: "helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version VERSION" # Dependencies are installed separately: # - Capsule: helm install capsule projectcapsule/capsule -n capsule-system --create-namespace # - OpenCost: helm install opencost opencost/opencost -n opencost --create-namespace diff --git a/deploy/charts/bison/README.md b/deploy/charts/bison/README.md index 59bc116..49c31f2 100644 --- a/deploy/charts/bison/README.md +++ b/deploy/charts/bison/README.md @@ -2,6 +2,12 @@ Kubernetes-based GPU Resource Billing and Scheduling Platform +## ⚠️ 重要提示 / Important Notice + +**这是一个 Helm Chart,请使用 `helm` 命令安装,而不是 `docker pull`!** + +**This is a Helm Chart. 
Use `helm` command to install, NOT `docker pull`!** + ## Installation **Requirements:** @@ -14,15 +20,15 @@ Install directly from GitHub Container Registry using OCI format: ```bash # Install specific version -helm install my-bison oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison --version 0.0.9 # Or pull first, then install -helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 -helm install my-bison bison-0.0.2.tgz +helm pull oci://ghcr.io/supermarioyl/charts/bison --version 0.0.9 +helm install my-bison bison-0.0.9.tgz # With custom configuration -helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ - --version 0.0.2 \ +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison \ + --version 0.0.9 \ --namespace bison-system \ --create-namespace \ --set opencost.url=http://opencost.opencost-system.svc:9003 \ @@ -31,7 +37,7 @@ helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ **Why GHCR OCI Format?** - ✅ No separate Helm repository needed -- ✅ Unified with Docker images in GHCR +- ✅ Unified storage with Docker images in GHCR - ✅ Faster installation - ✅ Modern Helm 3.8+ standard @@ -41,10 +47,10 @@ Download the chart from [GitHub Releases](https://github.com/SuperMarioYL/Bison/ ```bash # Download from release page -wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.2/bison-0.0.2.tgz +wget https://github.com/SuperMarioYL/Bison/releases/download/v0.0.9/bison-0.0.9.tgz # Install -helm install my-bison bison-0.0.2.tgz \ +helm install my-bison bison-0.0.9.tgz \ --namespace bison-system \ --create-namespace ``` @@ -75,7 +81,7 @@ See [values.yaml](./values.yaml) for all configuration options. 
### Basic Configuration ```bash -helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ +helm install my-bison oci://ghcr.io/supermarioyl/charts/bison \ --set apiServer.replicas=2 \ --set webUI.replicas=2 ``` @@ -83,11 +89,11 @@ helm install my-bison oci://ghcr.io/supermarioyl/bison/bison \ ## Uninstall ```bash -helm uninstall my-bison +helm uninstall my-bison -n bison-system ``` ## More Information -- [Project Homepage](https://supermarioyl.github.io/Bison/) -- [Documentation](https://supermarioyl.github.io/Bison/docs/) +- [Project Homepage](https://bison.lei6393.com) +- [Documentation](https://bison.lei6393.com/docs/) - [GitHub Repository](https://github.com/SuperMarioYL/Bison) From b3c547e36ac95b4c322008958d06d89659c9cdad Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Mon, 29 Dec 2025 10:01:42 +0800 Subject: [PATCH 20/44] add docs --- .github/workflows/release.yml | 63 +++ deploy/charts/bison/Chart.yaml | 4 +- web-ui/package.json | 2 +- .../current/architecture.md | 418 ++++++++++++++++++ .../current/configuration.md | 361 +++++++++++++++ .../current/features.md | 195 ++++++++ .../current/installation.md | 316 +++++++++++++ .../current/user-guides/_category_.json | 8 + .../current/user-guides/admin.md | 176 ++++++++ .../current/user-guides/developer.md | 187 ++++++++ .../current/user-guides/team-leader.md | 126 ++++++ .../docusaurus-theme-classic/footer.json | 14 +- .../docusaurus-theme-classic/navbar.json | 2 +- 13 files changed, 1861 insertions(+), 11 deletions(-) create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/architecture.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/configuration.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/features.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/installation.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/_category_.json create 
mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/admin.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/developer.md create mode 100644 website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/team-leader.md diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b8d6034..394f0a7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -229,6 +229,69 @@ jobs: prerelease: false token: ${{ secrets.GITHUB_TOKEN }} + commit-versions: + name: Commit Version Updates to Main + runs-on: ubuntu-latest + needs: [prepare, create-release] + permissions: + contents: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: main + fetch-depth: 0 + + - name: Download updated files + uses: actions/download-artifact@v4 + with: + name: updated-files + path: . + + - name: Configure Git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and push version updates + run: | + VERSION=${{ needs.prepare.outputs.version }} + + # Check if there are changes to commit + if git diff --quiet deploy/charts/bison/Chart.yaml web-ui/package.json; then + echo "⚠️ No changes to commit" + exit 0 + fi + + git add deploy/charts/bison/Chart.yaml + git add web-ui/package.json + + git commit -m "chore: bump version to ${VERSION} [skip ci] + + Auto-updated version files from release ${GITHUB_REF} + + - Updated Chart.yaml version to ${VERSION} + - Updated package.json version to ${VERSION}" + + git push origin HEAD:main + + echo "✅ Version updates committed to main branch" + + - name: Summary + run: | + VERSION=${{ needs.prepare.outputs.version }} + echo "## Version Update Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Version**: ${VERSION}" >> $GITHUB_STEP_SUMMARY + echo "**Release**: ${GITHUB_REF}" >> 
$GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "✅ **Status**: Version files committed to main branch" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Updated files**:" >> $GITHUB_STEP_SUMMARY + echo "- \`deploy/charts/bison/Chart.yaml\`" >> $GITHUB_STEP_SUMMARY + echo "- \`web-ui/package.json\`" >> $GITHUB_STEP_SUMMARY + publish-helm-repo: name: Publish to Helm Repository (GHCR) runs-on: ubuntu-latest diff --git a/deploy/charts/bison/Chart.yaml b/deploy/charts/bison/Chart.yaml index 1e97232..4bdba29 100644 --- a/deploy/charts/bison/Chart.yaml +++ b/deploy/charts/bison/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: bison description: Bison - GPU 资源计费平台,基于 Capsule 多租户 + OpenCost 成本追踪 type: application -version: 0.0.1 -appVersion: "0.0.1" +version: 0.0.10 +appVersion: "0.0.10" keywords: - gpu - billing diff --git a/web-ui/package.json b/web-ui/package.json index 1d4bdd1..3b73a5c 100644 --- a/web-ui/package.json +++ b/web-ui/package.json @@ -1,6 +1,6 @@ { "name": "bison-web-ui", - "version": "0.0.1", + "version": "0.0.10", "private": true, "scripts": { "dev": "vite", diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/architecture.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/architecture.md new file mode 100644 index 0000000..9b467ec --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/architecture.md @@ -0,0 +1,418 @@ +--- +sidebar_position: 5 +--- + +# 架构 + +本文档提供 Bison 架构的技术概览,采用高内聚、低耦合的设计原则,以保证可维护性和可扩展性。 + +## 系统概览 + +### 高层架构 + +```mermaid +graph TB + subgraph PRESENT[展示层] + WEB[Web UI
<br/>React 18 + Ant Design 5] + CLI[kubectl / API Client] + end + + subgraph GATEWAY[API 网关层] + GW[API Server<br/>Go + Gin Framework] + AUTH[Auth Middleware<br/>JWT + OIDC] + end + + subgraph BUSINESS[业务逻辑层] + TS[Tenant Service<br/>团队与项目 CRUD] + BS[Billing Service<br/>成本计算] + BLS[Balance Service<br/>钱包管理] + QS[Quota Service<br/>资源限制] + AS[Alert Service<br/>通知] + RS[Report Service<br/>分析] + end + + subgraph INTEGRATION[集成层] + K8S[Kubernetes Client<br/>client-go] + OCC[OpenCost Client<br/>REST API] + PC[Prometheus Client<br/>PromQL] + end + + subgraph EXTERNAL[外部系统] + KAPI[Kubernetes API] + CAP[Capsule Controller] + OC[OpenCost] + PROM[Prometheus] + end + + subgraph DATA[数据层] + CM[ConfigMaps<br/>
持久化存储] + end + + WEB --> GW + CLI --> GW + GW --> AUTH + AUTH --> TS & BS & BLS & QS & AS & RS + + TS --> K8S + BS --> OCC + BLS --> K8S + QS --> K8S + RS --> OCC & PC + + K8S --> KAPI + K8S --> CAP + OCC --> OC + PC --> PROM + + TS & BLS --> CM + KAPI --> CM +``` + +### 设计原则 + +| 原则 | 实现方式 | +|-----------|----------------| +| **高内聚** | 每个服务处理单一领域(计费、配额、告警) | +| **低耦合** | 服务之间通过明确定义的接口通信 | +| **无状态 API** | 所有状态持久化到 Kubernetes ConfigMaps | +| **云原生** | 利用 Kubernetes 原语实现高可用和扩展 | +| **零数据库** | ConfigMaps 消除了外部数据库依赖 | + +## 架构层次 + +Bison 遵循分层架构模式: + +### 1. 展示层 +- **Web UI**: React 18 + TypeScript + Ant Design 5 +- **API Client**: REST API 用于外部集成 + +### 2. API 网关层 +- **API Server**: Go + Gin 框架 +- **认证**: JWT 和 OIDC 支持 +- **中间件**: 日志、恢复、CORS 处理 + +### 3. 业务逻辑层 +- **Tenant Service**: 团队和项目管理 +- **Billing Service**: 成本计算和聚合 +- **Balance Service**: 钱包管理和自动扣费 +- **Quota Service**: 资源限制执行 +- **Alert Service**: 多渠道通知 +- **Report Service**: 分析和导出 + +### 4. 集成层 +- **Kubernetes Client**: client-go 用于 K8s API 交互 +- **OpenCost Client**: REST API 用于成本数据 +- **Prometheus Client**: PromQL 查询指标 + +### 5. 
数据层 +- **ConfigMaps**: 余额、计费配置和元数据的持久化存储 +- **etcd**: 通过 Kubernetes ConfigMaps 作为后端存储 + +## 核心组件 + +### API Server + +API Server 是处理所有 HTTP 请求的核心组件: + +**技术栈:** +- Go 1.24+ +- Gin web 框架 +- client-go 用于 Kubernetes API + +**关键职责:** +- 提供 REST API 端点 +- 认证和授权 +- 请求路由和中间件 +- 后台任务调度 + +**端点:** +``` +/api/v1/teams - 团队管理 +/api/v1/projects - 项目管理 +/api/v1/billing - 计费配置 +/api/v1/balance - 余额操作 +/api/v1/stats - 统计和报告 +``` + +### Web UI + +基于 React 的单页应用: + +**技术栈:** +- React 18 +- TypeScript +- Vite(构建工具) +- Ant Design 5 +- ECharts(可视化) +- React Query(状态管理) + +**功能:** +- 实时指标仪表板 +- 团队和项目管理 +- 计费配置 +- 余额监控 +- 使用报告和导出 + +### Billing Service + +基于资源使用计算成本: + +**数据流:** +```mermaid +sequenceDiagram + participant S as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant K8S as Kubernetes + + S->>BS: 触发计费(每 10 分钟) + BS->>OC: 查询团队成本 + OC-->>BS: 返回使用数据 + BS->>BS: 计算成本 + BS->>BLS: 从余额扣除 + BLS->>K8S: 更新 ConfigMap + BLS-->>BS: 确认 + BS->>AS: 检查阈值 + alt 余额不足 + AS->>Webhook: 发送告警 + end +``` + +**计费公式:** +``` +总成本 = (CPU_核数 × CPU_价格 × 小时) + + (内存_GB × 内存_价格 × 小时) + + (GPU_数量 × GPU_价格 × 小时) +``` + +### Balance Service + +管理团队钱包和自动扣费: + +**功能:** +- 实时余额追踪 +- 基于使用量自动扣费 +- 充值操作 +- 交易历史 +- 余额耗尽时自动暂停 + +**存储:** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: bison-team-balances + namespace: bison-system +data: + ml-team: "1523.45" + data-team: "890.12" + dev-team: "2100.00" +``` + +### Tenant Service + +管理团队(Capsule Tenants)和项目(Namespaces): + +**功能:** +- 创建/删除团队 +- 分配资源配额 +- 配置节点池(共享/独占) +- 管理团队元数据 + +**Capsule 集成:** +```yaml +apiVersion: capsule.clastix.io/v1beta1 +kind: Tenant +metadata: + name: ml-team +spec: + owners: + - name: team-leader + kind: User + resourceQuota: + items: + - hard: + cpu: "20" + memory: 64Gi + nvidia.com/gpu: "4" +``` + +## 数据流 + +### 团队创建流程 + +```mermaid +sequenceDiagram + participant U as Admin + participant API as API Server + participant TS as Tenant Service + participant K8S as 
Kubernetes + participant CAP as Capsule + + U->>API: POST /api/v1/teams + API->>TS: CreateTeam(name, quota, balance) + TS->>CAP: 创建 Tenant + CAP-->>TS: Tenant 已创建 + TS->>K8S: 创建 ConfigMap(余额) + K8S-->>TS: ConfigMap 已创建 + TS-->>API: 成功 + API-->>U: 201 Created +``` + +### 计费周期流程 + +```mermaid +sequenceDiagram + participant SCH as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant AS as Alert Service + + loop 每 10 分钟 + SCH->>BS: 触发计费计算 + BS->>OC: 查询成本(最近 10 分钟) + OC-->>BS: 返回使用指标 + BS->>BS: 计算总成本 + BS->>BLS: 从余额扣除成本 + BLS->>BLS: 更新余额 + BLS-->>BS: 余额已更新 + BS->>AS: 检查余额阈值 + alt 余额 < 20% + AS->>AS: 发送低余额告警 + end + alt 余额 <= 0 + AS->>BS: 暂停团队工作负载 + end + end +``` + +## 集成点 + +### Kubernetes 集成 + +Bison 与 Kubernetes 深度集成: + +- **Capsule Tenants** 用于多租户 +- **ResourceQuotas** 用于限制执行 +- **Namespaces** 用于项目隔离 +- **ConfigMaps** 用于数据持久化 +- **RBAC** 用于访问控制 + +### OpenCost 集成 + +通过 OpenCost API 实现实时成本追踪: + +```bash +# 查询团队成本 +GET /allocation/compute?window=10m&aggregate=namespace&filter=namespace:ml-team + +# 响应 +{ + "ml-team": { + "cpuCost": 0.25, + "memCost": 0.10, + "gpuCost": 4.17, + "totalCost": 4.52 + } +} +``` + +### Prometheus 集成 + +指标收集用于监控: + +- 资源利用率指标 +- 成本指标 +- 余额指标 +- 告警指标 + +## 部署架构 + +### 高可用设置 + +```mermaid +graph TB + subgraph K8S[Kubernetes 集群] + subgraph NS1[bison-system 命名空间] + API1[API Server Pod 1] + API2[API Server Pod 2] + WEB1[Web UI Pod 1] + WEB2[Web UI Pod 2] + end + + subgraph NS2[opencost-system 命名空间] + OC[OpenCost] + end + + subgraph NS3[prometheus-system 命名空间] + PROM[Prometheus] + end + + LB[LoadBalancer] + end + + LB --> API1 & API2 + LB --> WEB1 & WEB2 + API1 & API2 --> OC + OC --> PROM +``` + +### 资源要求 + +**最低配置:** +- API Server: 200m CPU,256Mi 内存 +- Web UI: 100m CPU,128Mi 内存 + +**推荐配置(生产环境):** +- API Server: 1000m CPU,512Mi 内存(2 个副本) +- Web UI: 500m CPU,256Mi 内存(2 个副本) + +## 安全模型 + +### 认证 +- 基于 JWT token 的认证 +- OIDC/SSO 集成 +- 管理员用户管理 + +### 授权 +- Kubernetes RBAC 集成 +- 
基于角色的访问控制 +- 团队范围的权限 + +### 数据安全 +- 所有数据静态加密(etcd 加密) +- API 通信使用 TLS +- 通过 Kubernetes Secrets 管理密钥 + +## 技术栈 + +### 后端 +- **语言**: Go 1.24+ +- **框架**: Gin +- **Kubernetes 客户端**: client-go +- **配置**: Viper +- **日志**: Logrus + +### 前端 +- **框架**: React 18 +- **语言**: TypeScript +- **UI 库**: Ant Design 5 +- **构建工具**: Vite +- **状态管理**: React Query +- **图表**: ECharts + +### 基础设施 +- **平台**: Kubernetes 1.22+ +- **多租户**: Capsule +- **成本追踪**: OpenCost +- **指标**: Prometheus +- **存储**: ConfigMaps(etcd) + +## 下一步 + +- [安装指南](installation.md) - 部署 Bison +- [配置](configuration.md) - 配置计费 +- [用户指南](user-guides/admin.md) - 学习使用 Bison +- [功能特性](features.md) - 探索功能 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/configuration.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/configuration.md new file mode 100644 index 0000000..e8d7e4f --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/configuration.md @@ -0,0 +1,361 @@ +--- +sidebar_position: 6 +--- + +# 配置 + +本指南介绍如何根据您的特定环境和需求配置 Bison。 + +## Helm Chart 配置 + +Bison 主要通过 Helm values 进行配置。您可以通过提供 `values.yaml` 文件或使用 `--set` 参数来自定义安装。 + +### 关键配置参数 + +| 参数 | 描述 | 默认值 | 示例 | +|-----------|-------------|---------|---------| +| `auth.enabled` | 启用认证 | `false` | `true` | +| `auth.admin.username` | 管理员用户名 | `admin` | `admin` | +| `auth.admin.password` | 管理员密码 | `admin` | `changeme` | +| `apiServer.replicaCount` | API Server 副本数 | `2` | `3` | +| `apiServer.image.repository` | API Server 镜像 | `ghcr.io/supermarioyl/bison/api-server` | - | +| `apiServer.image.tag` | API Server 镜像标签 | `0.0.1` | `latest` | +| `webUI.replicaCount` | Web UI 副本数 | `2` | `3` | +| `webUI.image.repository` | Web UI 镜像 | `ghcr.io/supermarioyl/bison/web-ui` | - | +| `webUI.image.tag` | Web UI 镜像标签 | `0.0.1` | `latest` | +| `opencost.url` | OpenCost API 端点 | `http://opencost.opencost-system.svc:9003` | 自定义 URL | + +### 自定义 Values 示例 + +创建一个 `custom-values.yaml` 文件: + +```yaml +# 认证 +auth: + enabled: true + 
admin: + username: admin + password: MySecurePassword123 + +# API Server +apiServer: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 512Mi + +# Web UI +webUI: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +# OpenCost 集成 +opencost: + url: http://opencost.opencost-system.svc:9003 + +# 节点选择(可选) +nodeSelector: + node-role.kubernetes.io/control-plane: "" + +# 容忍度(可选) +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule +``` + +使用自定义 values 安装: + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --values custom-values.yaml +``` + +## 计费配置 + +计费设置在安装后通过 Web UI 或 API 进行配置。 + +### 访问计费配置 + +1. **通过 Web UI:** + - 导航到 **设置** > **计费配置** + - 设置 CPU、内存、GPU 和其他资源的价格 + - 配置货币和计费周期 + +2. **通过 API:** + ```bash + curl -X POST http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50 + }, + "billingInterval": "hourly" + }' + ``` + +### 计费参数 + +| 参数 | 描述 | 示例 | +|-----------|-------------|---------| +| `enabled` | 启用/禁用计费 | `true` | +| `currency` | 计费货币 | `USD`, `CNY`, `EUR` | +| `pricing.cpu` | CPU 价格(每核心小时) | `0.05` | +| `pricing.memory` | 内存价格(每 GB 小时) | `0.01` | +| `pricing["nvidia.com/gpu"]` | GPU 价格(每 GPU 小时) | `2.50` | +| `billingInterval` | 计费聚合周期 | `hourly`, `daily` | +| `lowBalanceThreshold` | 警告阈值(%) | `20` | +| `suspendThreshold` | 自动暂停阈值(%) | `5` | + +### 计费配置示例 + +```json +{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "nvidia.com/mig-1g.5gb": 0.50, + "nvidia.com/mig-2g.10gb": 1.00 + }, + "billingInterval": "hourly", + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", 
"dingtalk"] +} +``` + +## 团队配置 + +### 创建团队 + +团队可以通过 Web UI 或 API 创建: + +**通过 Web UI:** +1. 导航到 **团队** 页面 +2. 点击 **创建团队** +3. 设置团队名称、配额和初始余额 + +**通过 API:** +```bash +curl -X POST http://localhost:8080/api/v1/teams \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-team", + "description": "Machine Learning Team", + "quota": { + "cpu": "20", + "memory": "64Gi", + "nvidia.com/gpu": "4" + }, + "balance": 1000.00 + }' +``` + +### 团队配额 + +团队配额定义资源限制: + +```yaml +quota: + cpu: "20" # 20 个 CPU 核心 + memory: "64Gi" # 64 GB 内存 + nvidia.com/gpu: "4" # 4 个 GPU + storage: "500Gi" # 500 GB 存储 +``` + +### 团队余额管理 + +设置初始余额并配置自动充值: + +```json +{ + "balance": 1000.00, + "autoRecharge": { + "enabled": true, + "amount": 500.00, + "schedule": "monthly", + "threshold": 100.00 + } +} +``` + +## 告警配置 + +配置多渠道告警,用于低余额和配额警告。 + +### Webhook 告警 + +```json +{ + "type": "webhook", + "enabled": true, + "url": "https://your-webhook-endpoint.com/alerts", + "headers": { + "Authorization": "Bearer YOUR_TOKEN" + }, + "template": { + "title": "Bison Alert", + "message": "Team {{.TeamName}} balance is {{.Balance}}" + } +} +``` + +### 钉钉告警 + +```json +{ + "type": "dingtalk", + "enabled": true, + "webhook": "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN", + "secret": "YOUR_SECRET" +} +``` + +### 企业微信告警 + +```json +{ + "type": "wechat", + "enabled": true, + "corpid": "YOUR_CORP_ID", + "corpsecret": "YOUR_CORP_SECRET", + "agentid": 1000001 +} +``` + +## OpenCost 集成 + +配置 OpenCost 连接: + +### 检查 OpenCost 连通性 + +```bash +# 测试 OpenCost API +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz + +# 测试 allocation API +curl http://localhost:9003/allocation/compute?window=1d +``` + +### 更新 OpenCost URL + +如果 OpenCost 部署在不同的命名空间或使用不同的服务名称: + +```bash +helm upgrade bison bison/bison \ + --set opencost.url=http://my-opencost.custom-namespace.svc:9003 \ + --namespace bison-system +``` + +## 认证与 OIDC + +启用认证并与您的 SSO 提供商集成: + +### 基本认证 + +```yaml 
+auth: + enabled: true + admin: + username: admin + password: SecurePassword123 +``` + +### OIDC 集成 + +```yaml +auth: + enabled: true + oidc: + enabled: true + issuerURL: https://your-oidc-provider.com + clientID: bison-client-id + clientSecret: your-client-secret + redirectURL: https://bison.example.com/callback +``` + +## 环境变量 + +可以通过环境变量提供其他配置: + +| 变量 | 描述 | 默认值 | +|----------|-------------|---------| +| `KUBECONFIG` | kubeconfig 文件路径 | 集群内配置 | +| `OPENCOST_URL` | OpenCost API URL | `http://opencost.opencost-system.svc:9003` | +| `AUTH_ENABLED` | 启用认证 | `false` | +| `LOG_LEVEL` | 日志级别 | `info` | +| `BILLING_INTERVAL` | 计费计算间隔 | `10m` | + +在 Helm values 中设置环境变量: + +```yaml +apiServer: + env: + - name: LOG_LEVEL + value: debug + - name: BILLING_INTERVAL + value: 5m +``` + +## 高级配置 + +### 自定义资源定价 + +为任何 Kubernetes 资源定价: + +```json +{ + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "amd.com/gpu": 2.00, + "ephemeral-storage": 0.001, + "custom.io/fpga": 5.00 + } +} +``` + +### 多集群支持 + +在每个集群中部署 Bison,共享计费: + +```yaml +# 集群 A +apiServer: + clusterName: prod-us-west + +# 集群 B +apiServer: + clusterName: prod-us-east +``` + +## 下一步 + +- [用户指南](user-guides/admin.md) - 学习如何使用 Bison +- [架构](architecture.md) - 理解系统设计 +- [功能特性](features.md) - 探索所有功能 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/features.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/features.md new file mode 100644 index 0000000..fd3be31 --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/features.md @@ -0,0 +1,195 @@ +--- +sidebar_position: 2 +--- + +# 功能特性 + +Bison 为 Kubernetes 环境中的 GPU 资源管理、计费和多租户隔离提供了全面的功能套件。 + +## Bison 实际运行 + +### 🎯 实时资源仪表板 + +![Bison Dashboard](/img/ui-dashboard.png) + +**您可以看到:** +- **集群概览** - 一目了然地查看总团队数、项目数、资源池和配额 +- **资源利用率** - 可视化展示哪些团队正在消耗资源 +- **7 天成本趋势** - 历史成本数据,用于识别支出模式 +- **Top 5 成本排名** - 快速识别 GPU 重度使用者 +- **团队预算状态** - 实时余额监控,带有颜色编码的告警 + +**谁会受益:** +- **平台管理员** 
即时了解集群健康状况和使用模式 +- **财务团队** 可以实时跟踪成本,无需等待月度报告 +- **团队负责人** 可以将自己的使用情况与其他团队进行比较 + +--- + +### 💼 团队管理与预算监控 + +![Team Management](/img/ui-team.png) + +**您可以看到:** +- **团队列表** 带有实时状态指示器: + - 🟢 绿色余额 = 预算健康 + - 🟡 黄色余额 = 接近阈值 + - 🔴 红色余额 = 余额不足或已暂停 +- **资源分配** - 每个团队的 CPU/内存/GPU 配额(例如,"cpu 0/10" 表示已分配 10 个中使用了 0 个) +- **项目计数** - 每个团队下的命名空间/项目数量 +- **快速操作** - 一键编辑配额、充值余额或删除团队 + +**谁会受益:** +- **团队负责人** 一目了然地监控他们的预算状态和资源使用情况 +- **管理员** 从单一统一视图管理多个团队 +- **财务团队** 查看哪些团队需要充值 + +--- + +### 💰 灵活的计费配置 + +![Billing Configuration](/img/ui-billing.png) + +**您可以看到:** +- **按资源定价** - 为 CPU(每核心小时)、内存(每 GB 小时)、GPU(每 GPU 小时)设置自定义价格 +- **货币选择** - 支持 CNY、USD、EUR 等货币 +- **启用/禁用开关** - 一键打开或关闭特定资源的计费 +- **计费规则** - 定义资源计量方式(每小时、每天等) +- **告警阈值** - 配置何时发送低余额警告 + +**谁会受益:** +- **财务团队** 将云成本与内部退款政策对齐 +- **管理员** 根据实际硬件成本调整定价 +- **预算管理者** 设置适当的警告阈值以防止超支 + +--- + +## 核心能力 + +### 多租户管理 +✅ **Capsule 驱动的隔离** - 使用 Kubernetes 原生 Capsule operator 实现真正的多租户 +✅ **OIDC 集成** - 企业 SSO 支持认证 +✅ **基于团队的访问控制** - 管理每个团队的用户、角色和权限 +✅ **共享与独占节点池** - 灵活的资源分配策略 + +### 实时计费 +✅ **基于使用量的计费** - 基于实际资源消耗的准确成本追踪 +✅ **可配置定价** - 为 CPU、内存、GPU 和任何 Kubernetes 资源设置自定义费率 +✅ **多货币支持** - CNY、USD、EUR 等 +✅ **计费规则引擎** - 定义自定义计费逻辑和聚合周期 + +### 动态资源配额 +✅ **团队级配额** - CPU、内存、GPU、存储和自定义资源 +✅ **命名空间配额** - 团队内的项目级资源限制 +✅ **自动执行** - Kubernetes 原生配额执行 +✅ **配额告警** - 接近限制时的通知 + +### 团队余额与钱包系统 +✅ **预付费余额** - 团队钱包实时扣费 +✅ **自动扣费** - 基于资源使用的自动计费 +✅ **余额阈值** - 可配置的警告和暂停级别 +✅ **交易历史** - 所有余额变更的完整审计跟踪 + +### 自动充值 +✅ **定时充值** - 每周或每月自动充值 +✅ **自定义金额** - 每个团队的灵活充值金额 +✅ **充值通知** - 余额增加时通知团队 + +### 余额告警 +✅ **多渠道通知** - Webhook、钉钉、微信、邮件 +✅ **可配置阈值** - 设置警告级别(例如 20%、10%、5%) +✅ **自动暂停** - 余额耗尽时自动暂停工作负载 +✅ **自定义模板** - 自定义告警消息 + +### 使用报告 +✅ **团队分析** - 团队级成本细分和趋势 +✅ **项目分析** - 命名空间级资源消耗 +✅ **导出功能** - CSV、Excel、PDF 报告 +✅ **历史数据** - 30/60/90 天成本分析 + +### 审计日志 +✅ **完整操作历史** - 跟踪所有管理操作 +✅ **用户归属** - 谁在何时做了什么 +✅ **资源变更** - 跟踪配额、余额和配置变更 +✅ **合规就绪** - 满足内部审计要求 + +--- + +## 架构亮点 + +Bison 的架构设计简单、可扩展,且无外部依赖。 + +```mermaid +graph TB + subgraph USER_LAYER[用户层] + UI[Web UI
React + Ant Design] + API[REST API
Go + Gin] + end + + subgraph CORE[核心服务] + BS[Billing Service] + TS[Tenant Service] + QS[Quota Service] + end + + subgraph K8S[Kubernetes 层] + CA[Capsule
多租户] + OC[OpenCost
成本追踪] + PR[Prometheus
指标] + end + + subgraph DATA[数据层] + CM[ConfigMaps
零数据库] + end + + UI --> API + API --> BS & TS & QS + BS --> OC + TS --> CA + QS --> CA + BS & TS --> CM + OC --> PR +``` + +### 关键架构优势 + +- **零外部依赖** - 所有数据存储在 Kubernetes ConfigMaps(由 etcd 提供底层存储) +- **云原生** - 基于 Kubernetes 原语构建,实现最大可移植性 +- **可伸缩** - 无状态 API Server 可以水平扩展 +- **安全** - Kubernetes RBAC 集成和可选认证 +- **可观测** - Prometheus 指标和结构化日志 +- **可扩展** - 用于自定义计费规则和告警的插件架构 + +--- + +## 集成点 + +### OpenCost 集成 +Bison 利用 [OpenCost](https://www.opencost.io/) 进行实时成本追踪: +- 按 pod、按命名空间、按团队的成本可见性 +- GPU 利用率指标 +- 历史成本数据和趋势 +- 与 Prometheus 集成以收集指标 + +### Capsule 集成 +Bison 使用 [Capsule](https://capsule.clastix.io/) 实现多租户: +- 基于团队的租户隔离 +- 命名空间配额执行 +- 网络和安全策略 +- OIDC/SSO 集成 + +### Prometheus 集成 +指标收集和监控: +- 资源利用率追踪 +- 自定义计费指标 +- 告警规则评估 +- 历史数据保留 + +--- + +## 下一步 + +- [安装指南](installation.md) - 在您的集群中部署 Bison +- [用户指南](user-guides/admin.md) - 学习如何使用 Bison +- [架构](architecture.md) - 深入了解系统设计 +- [配置](configuration.md) - 配置计费和设置 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/installation.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/installation.md new file mode 100644 index 0000000..55dae7f --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/installation.md @@ -0,0 +1,316 @@ +--- +sidebar_position: 3 +--- + +# 安装指南 + +本指南提供在 Kubernetes 集群中安装 Bison 的详细说明。 + +## 前置要求 + +在安装 Bison 之前,请确保您具备: + +- **Kubernetes 1.22+** - 正在运行的 Kubernetes 集群 +- **kubectl** - 已配置为访问您的集群 +- **Helm 3.0+** - Kubernetes 包管理器 +- **Capsule Operator v0.1.0+** - 用于多租户隔离 +- **OpenCost** - 已与 Prometheus 一起部署用于成本追踪 + +### 安装前置组件 + +如果您还没有安装所需的组件: + +#### 安装 Capsule + +```bash +# 使用 Helm +helm repo add projectcapsule https://projectcapsule.github.io/charts +helm install capsule projectcapsule/capsule \ + --namespace capsule-system \ + --create-namespace +``` + +#### 安装 OpenCost + +```bash +# 使用 Helm +helm repo add opencost https://opencost.github.io/opencost-helm-chart +helm install opencost opencost/opencost \ + --namespace opencost-system \ + 
--create-namespace \ + --set prometheus.internal.serviceName=prometheus-server \ + --set prometheus.internal.namespaceName=prometheus-system +``` + +## 安装方法 + +Bison Helm charts 通过 **GitHub Container Registry (GHCR)** 使用现代 OCI 格式分发。 + +**要求:** +- Helm >= 3.8.0(用于 OCI 支持) +- Kubernetes >= 1.22 + +### 方式 A:从 GHCR 安装(推荐) + +从 GitHub Container Registry 直接安装 Bison 是最简单的方法: + +```bash +# 从 GHCR 安装特定版本 +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace + +# 或先拉取 chart,然后安装 +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace + +# 自定义安装 +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true \ + --set apiServer.image.tag=0.0.2 \ + --set webUI.image.tag=0.0.2 +``` + +**为什么使用 GHCR OCI 格式?** +- ✅ 无需维护单独的 Helm 仓库 +- ✅ 在 GHCR 中与 Docker 镜像统一 +- ✅ 更快的安装速度(直接从注册表拉取) +- ✅ 现代 Helm 3.8+ 标准实践 + +### 方式 B:从 GitHub Release 安装 + +从 GitHub Releases 下载特定版本: + +```bash +# 下载 Helm chart +VERSION=0.0.2 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# 安装 chart +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +### 方式 C:从源码安装 + +克隆并从源码构建: + +```bash +# 克隆仓库 +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# 安装依赖并构建 +make install-deps +make build + +# 使用 Helm 部署 +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +## 配置选项 + +Bison 可以使用 Helm values 进行配置。以下是关键配置选项: + +### 基本配置 + +```yaml +# values.yaml +apiServer: + image: + repository: ghcr.io/supermarioyl/bison/api-server + tag: 0.0.1 + replicas: 2 + +webUI: + image: + repository: ghcr.io/supermarioyl/bison/web-ui + tag: 0.0.1 + replicas: 2 
+ +# OpenCost URL +opencost: + url: http://opencost.opencost-system.svc:9003 + +# 认证 +auth: + enabled: false +``` + +### 自定义配置示例 + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set apiServer.replicas=3 \ + --set webUI.replicas=3 \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true +``` + +## 验证安装 + +安装后,验证所有组件是否正在运行: + +```bash +# 检查 pod 状态 +kubectl get pods -n bison-system + +# 预期输出: +# NAME READY STATUS RESTARTS AGE +# bison-api-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +# bison-webui-xxxxxxxxx-xxxxx 1/1 Running 0 2m + +# 检查服务 +kubectl get svc -n bison-system + +# 检查日志 +kubectl logs -n bison-system deployment/bison-api-server +kubectl logs -n bison-system deployment/bison-webui +``` + +## 访问平台 + +### 端口转发(开发环境) + +```bash +# 端口转发 Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# 访问 http://localhost:3000 +``` + +### Ingress(生产环境) + +对于生产部署,配置 Ingress: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: bison-ingress + namespace: bison-system + annotations: + kubernetes.io/ingress.class: nginx +spec: + rules: + - host: bison.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bison-webui + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: bison-api-server + port: + number: 8080 +``` + +应用 Ingress: + +```bash +kubectl apply -f ingress.yaml +``` + +## Docker 镜像 + +Bison 镜像可在 GitHub Container Registry 上获取: + +```bash +# 拉取镜像 +docker pull ghcr.io/supermarioyl/bison/api-server:0.0.1 +docker pull ghcr.io/supermarioyl/bison/web-ui:0.0.1 + +# 或使用 latest +docker pull ghcr.io/supermarioyl/bison/api-server:latest +docker pull ghcr.io/supermarioyl/bison/web-ui:latest +``` + +**支持的平台:** +- `linux/amd64` +- `linux/arm64` + +## 升级 + +将 Bison 升级到新版本: + +```bash +# 更新 Helm 仓库 +helm repo update + +# 升级到最新版本 +helm upgrade bison bison/bison --namespace 
bison-system + +# 或升级到特定版本 +helm upgrade bison bison/bison --version 0.0.2 --namespace bison-system +``` + +## 卸载 + +完全删除 Bison: + +```bash +# 卸载 Helm release +helm uninstall bison --namespace bison-system + +# 删除命名空间(可选) +kubectl delete namespace bison-system +``` + +## 故障排查 + +### Pod 无法启动 + +检查 pod 日志以查找错误: + +```bash +kubectl logs -n bison-system deployment/bison-api-server +kubectl describe pod -n bison-system +``` + +### 无法连接到 OpenCost + +验证 OpenCost 是否正在运行且可访问: + +```bash +kubectl get svc -n opencost-system +kubectl port-forward -n opencost-system svc/opencost 9003:9003 + +# 测试端点 +curl http://localhost:9003/healthz +``` + +### 认证问题 + +如果启用了认证,请确保您有正确的凭据: + +```bash +# 默认凭据(生产环境请更改!) +用户名: admin +密码: admin +``` + +## 下一步 + +- [配置指南](configuration.md) - 配置计费和设置 +- [用户指南](user-guides/admin.md) - 学习如何使用 Bison +- [架构](architecture.md) - 理解系统设计 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/_category_.json b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/_category_.json new file mode 100644 index 0000000..3fbcf22 --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "用户指南", + "position": 4, + "link": { + "type": "generated-index", + "description": "基于角色的 Bison 使用指南。" + } +} diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/admin.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/admin.md new file mode 100644 index 0000000..28d259d --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/admin.md @@ -0,0 +1,176 @@ +--- +sidebar_position: 1 +--- + +# 管理员指南 + +本指南面向部署、配置和管理 Bison 平台的平台管理员。 + +## 职责 + +作为平台管理员,您负责: + +- ✅ 部署和配置 Bison +- ✅ 创建和管理团队 +- ✅ 设置全局计费配置 +- ✅ 监控集群范围的指标 +- ✅ 响应告警和充值请求 + +## 入门 + +### 1. 部署 Bison + +按照[安装指南](../installation.md)在您的 Kubernetes 集群中部署 Bison。 + +### 2. 配置计费 + +设置计费规则和定价: + +1. 访问 Web UI +2. 
导航到 **设置** > **计费配置** +3. 配置: + - **货币**: USD、CNY、EUR 等 + - **CPU 价格**: 每核心小时的成本 + - **内存价格**: 每 GB 小时的成本 + - **GPU 价格**: 每 GPU 小时的成本 +4. 点击 **保存** + +### 3. 创建第一个团队 + +为您的用户创建团队: + +1. 导航到 **团队** 页面 +2. 点击 **创建团队** +3. 填写: + - **团队名称**: 例如 "ml-team" + - **描述**: 团队用途 + - **资源配额**: + - CPU: 例如 "20" 核心 + - 内存: 例如 "64Gi" + - GPU: 例如 "4" + - **初始余额**: 例如 1000.00 +4. 点击 **创建** + +## 常见任务 + +### 管理团队 + +#### 查看所有团队 + +```bash +# 通过 kubectl +kubectl get tenants + +# 通过 API +curl http://localhost:8080/api/v1/teams +``` + +#### 更新团队配额 + +1. 导航到 **团队** 页面 +2. 点击团队行上的 **编辑** +3. 修改配额 +4. 点击 **保存** + +#### 充值团队余额 + +1. 导航到 **团队** 页面 +2. 点击团队行上的 **充值** +3. 输入金额 +4. 添加备注(可选) +5. 点击 **确认** + +### 监控 + +#### 查看仪表板 + +访问实时集群指标: +- 总团队数和项目数 +- 资源利用率 +- 成本趋势 +- 热门消费者 +- 余额状态 + +#### 检查告警 + +监控低余额和配额告警: +1. 导航到 **告警** 页面 +2. 查看活动告警 +3. 根据需要采取行动 + +### 计费配置 + +#### 更新定价 + +```bash +curl -X PUT http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "pricing": { + "cpu": 0.06, + "memory": 0.012, + "nvidia.com/gpu": 3.00 + } + }' +``` + +#### 配置告警阈值 + +```json +{ + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## 最佳实践 + +### 团队命名 +- 使用小写字母、数字和连字符 +- 示例:`ml-team`、`data-science`、`dev-team` + +### 配额分配 +- 从保守的配额开始 +- 监控 1-2 周的使用情况 +- 根据实际需求调整 + +### 余额管理 +- 为关键团队设置自动充值 +- 每周监控余额趋势 +- 及时响应低余额告警 + +### 安全 +- 在生产环境中启用认证 +- 使用 OIDC/SSO 进行企业部署 +- 定期审计用户权限 + +## 故障排查 + +### 团队创建失败 + +检查 Capsule operator 日志: +```bash +kubectl logs -n capsule-system deployment/capsule-controller-manager +``` + +### 计费无法工作 + +验证 OpenCost 连接性: +```bash +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz +``` + +### 高资源使用率 + +检查资源消耗: +```bash +kubectl top pods -n bison-system +``` + +## 下一步 + +- [团队负责人指南](team-leader.md) - 团队负责人指南 +- [开发者指南](developer.md) - 开发者指南 +- [配置](../configuration.md) - 高级配置 diff --git 
a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/developer.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/developer.md new file mode 100644 index 0000000..17f9915 --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/developer.md @@ -0,0 +1,187 @@ +--- +sidebar_position: 3 +--- + +# 开发者指南 + +本指南面向在团队项目中部署工作负载和消耗资源的开发者。 + +## 职责 + +作为开发者,您负责: + +- ✅ 在您的项目中部署应用程序 +- ✅ 监控资源使用情况 +- ✅ 保持在配额限制内 +- ✅ 优化资源消耗 + +## 入门 + +### 1. 获取 Kubeconfig + +向您的团队负责人或管理员请求 kubeconfig。 + +### 2. 设置上下文 + +```bash +# 将上下文设置为您的项目命名空间 +kubectl config set-context --current --namespace=your-project + +# 验证 +kubectl config view --minify | grep namespace +``` + +### 3. 检查配额 + +查看您的可用资源: +```bash +kubectl describe quota +``` + +## 部署工作负载 + +### 基本 Pod 部署 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-training-job + namespace: your-project +spec: + containers: + - name: trainer + image: your-ml-image:latest + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" +``` + +### 使用 Deployments + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ml-inference + namespace: your-project +spec: + replicas: 2 + selector: + matchLabels: + app: ml-inference + template: + metadata: + labels: + app: ml-inference + spec: + containers: + - name: inference + image: your-inference-image:latest + resources: + requests: + cpu: "2" + memory: "8Gi" + nvidia.com/gpu: "1" +``` + +## 监控使用情况 + +### 检查 Pod 资源使用情况 + +```bash +# 查看资源消耗 +kubectl top pods + +# 详细的 pod 信息 +kubectl describe pod +``` + +### 查看日志 + +```bash +# 流式查看日志 +kubectl logs -f + +# 查看之前的日志(如果 pod 重启了) +kubectl logs --previous +``` + +## 最佳实践 + +### 资源请求和限制 + +始终指定请求和限制: +```yaml +resources: + requests: + cpu: "2" + memory: "8Gi" + limits: + cpu: "4" + memory: "16Gi" +``` + +### GPU 使用 + +- 仅在需要时请求 GPU +- 将 GPU 用于计算密集型任务 +- 监控 GPU 利用率 + +### 清理 + +不再需要时删除资源: 
+```bash +# 删除 pod +kubectl delete pod + +# 删除 deployment +kubectl delete deployment + +# 清理已完成的 job +kubectl delete job --field-selector status.successful=1 +``` + +### 成本优化 + +- 正确调整资源请求的大小 +- 使用水平 pod 自动扩展 +- 清理空闲资源 +- 在可能的情况下共享 GPU(如果支持) + +## 故障排查 + +### Pod 处于 Pending 状态(配额不足) + +如果您的 pod 卡在 `Pending` 状态: + +```bash +kubectl describe pod +``` + +查找与配额相关的错误,并减少资源请求或向团队负责人申请更多配额。 + +### 内存不足 (OOM) + +如果 pod 因 OOM 被杀死: +1. 检查内存使用模式 +2. 增加内存限制 +3. 优化应用程序内存使用 + +### GPU 不可用 + +验证 GPU 请求: +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable."nvidia\.com/gpu" +``` + +## 下一步 + +- [团队负责人指南](team-leader.md) - 了解团队管理 +- [架构](../architecture.md) - 了解平台 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/team-leader.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/team-leader.md new file mode 100644 index 0000000..45fd901 --- /dev/null +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guides/team-leader.md @@ -0,0 +1,126 @@ +--- +sidebar_position: 2 +--- + +# 团队负责人指南 + +本指南面向管理项目、监控预算并在团队内分配资源的团队负责人。 + +## 职责 + +作为团队负责人,您负责: + +- ✅ 创建和管理项目(命名空间) +- ✅ 为项目分配配额 +- ✅ 监控团队余额和消耗 +- ✅ 在需要时请求充值 + +## 入门 + +### 1. 访问 Bison + +使用您的凭据登录 Web UI。 + +### 2. 查看团队仪表板 + +您的仪表板显示: +- 团队余额和状态 +- 资源利用率 +- 活动项目 +- 成本趋势 + +## 管理项目 + +### 创建项目 + +1. 导航到 **项目** 页面 +2. 点击 **创建项目** +3. 填写: + - **项目名称**: 例如 "training-ml-models" + - **描述**: 项目用途 + - **配额**(可选): + - CPU: 例如 "8" 核心 + - 内存: 例如 "32Gi" + - GPU: 例如 "2" +4. 点击 **创建** + +### 列出项目 + +```bash +# 通过 kubectl(如果您有访问权限) +kubectl get namespaces -l capsule.clastix.io/tenant=your-team + +# 通过 API +curl http://localhost:8080/api/v1/teams/your-team/projects +``` + +### 删除项目 + +1. 导航到 **项目** 页面 +2. 点击项目行上的 **删除** +3. 确认删除 + +**警告**: 这将删除项目中的所有资源! + +## 监控预算 + +### 检查余额 + +查看您的当前余额: +1. 导航到 **团队** 页面 +2. 在状态卡片中查看余额 + +### 查看使用趋势 + +分析支出模式: +1. 导航到 **报告** 页面 +2. 选择时间范围(7 天、30 天、90 天) +3. 
查看: + - 按资源类型的成本细分 + - 每日成本趋势 + - 每个项目的消耗 + +### 请求充值 + +当余额不足时: +1. 点击 **请求充值** 按钮 +2. 输入请求金额 +3. 添加理由说明 +4. 向管理员提交请求 + +## 资源管理 + +### 监控配额使用情况 + +检查您的配额使用了多少: +```bash +kubectl describe quota -n your-project +``` + +### 优化成本 + +减少支出的技巧: +- **合理调整资源大小**: 不要过度配置 CPU/内存 +- **清理空闲 pod**: 删除未使用的工作负载 +- **使用 spot/可抢占实例**: 在适用的情况下 +- **监控 GPU 利用率**: 确保 GPU 被充分利用 + +## 最佳实践 + +### 项目组织 +- 为不同的工作负载创建单独的项目 +- 示例:`ml-training`、`ml-inference`、`data-processing` + +### 配额分配 +- 根据项目优先级分配配额 +- 为紧急任务保留缓冲 + +### 成本意识 +- 每周审查成本 +- 识别并消除浪费 +- 设置成本告警 + +## 下一步 + +- [开发者指南](developer.md) - 您团队成员的指南 +- [功能特性](../features.md) - 探索所有 Bison 功能 diff --git a/website/i18n/zh-Hans/docusaurus-theme-classic/footer.json b/website/i18n/zh-Hans/docusaurus-theme-classic/footer.json index 0d82087..923115b 100644 --- a/website/i18n/zh-Hans/docusaurus-theme-classic/footer.json +++ b/website/i18n/zh-Hans/docusaurus-theme-classic/footer.json @@ -1,22 +1,22 @@ { "link.title.Docs": { - "message": "Docs", + "message": "文档", "description": "The title of the footer links column with title=Docs in the footer" }, "link.title.Resources": { - "message": "Resources", + "message": "资源", "description": "The title of the footer links column with title=Resources in the footer" }, "link.title.More": { - "message": "More", + "message": "更多", "description": "The title of the footer links column with title=More in the footer" }, "link.item.label.Getting Started": { - "message": "Getting Started", + "message": "快速开始", "description": "The label of footer link with label=Getting Started linking to /docs" }, "link.item.label.Architecture": { - "message": "Architecture", + "message": "架构", "description": "The label of footer link with label=Architecture linking to /docs/architecture" }, "link.item.label.Helm Charts": { @@ -32,11 +32,11 @@ "description": "The label of footer link with label=GitHub linking to https://github.com/SuperMarioYL/Bison" }, "link.item.label.Issues": { - "message": "Issues", + "message": "问题反馈", 
"description": "The label of footer link with label=Issues linking to https://github.com/SuperMarioYL/Bison/issues" }, "copyright": { - "message": "Copyright © 2025 Bison Project. Built with Docusaurus.", + "message": "Copyright © 2025 Bison 项目。使用 Docusaurus 构建。", "description": "The footer copyright" } } diff --git a/website/i18n/zh-Hans/docusaurus-theme-classic/navbar.json b/website/i18n/zh-Hans/docusaurus-theme-classic/navbar.json index 5de3c8a..90c372f 100644 --- a/website/i18n/zh-Hans/docusaurus-theme-classic/navbar.json +++ b/website/i18n/zh-Hans/docusaurus-theme-classic/navbar.json @@ -8,7 +8,7 @@ "description": "The alt text of navbar logo" }, "item.label.Documentation": { - "message": "Documentation", + "message": "文档", "description": "Navbar item with label Documentation" }, "item.label.GitHub": { From 370c73d52ef4643e142a856f858d83d0fdaa9502 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 29 Dec 2025 02:06:14 +0000 Subject: [PATCH 21/44] docs: add version 0.0.11 [skip ci] Auto-generated documentation version from release refs/tags/v0.0.11 - Added version 0.0.11 to versions.json - Created versioned_docs/version-0.0.11/ - Created versioned_sidebars/version-0.0.11-sidebars.json --- .../version-0.0.11/architecture.md | 418 ++++++++++++++++++ .../version-0.0.11/configuration.md | 361 +++++++++++++++ .../versioned_docs/version-0.0.11/features.md | 195 ++++++++ .../version-0.0.11/installation.md | 316 +++++++++++++ .../versioned_docs/version-0.0.11/intro.md | 167 +++++++ .../user-guides/_category_.json | 8 + .../version-0.0.11/user-guides/admin.md | 176 ++++++++ .../version-0.0.11/user-guides/developer.md | 187 ++++++++ .../version-0.0.11/user-guides/team-leader.md | 126 ++++++ .../version-0.0.11-sidebars.json | 8 + website/versions.json | 1 + 11 files changed, 1963 insertions(+) create mode 100644 website/versioned_docs/version-0.0.11/architecture.md create mode 100644 website/versioned_docs/version-0.0.11/configuration.md create mode 100644 
website/versioned_docs/version-0.0.11/features.md create mode 100644 website/versioned_docs/version-0.0.11/installation.md create mode 100644 website/versioned_docs/version-0.0.11/intro.md create mode 100644 website/versioned_docs/version-0.0.11/user-guides/_category_.json create mode 100644 website/versioned_docs/version-0.0.11/user-guides/admin.md create mode 100644 website/versioned_docs/version-0.0.11/user-guides/developer.md create mode 100644 website/versioned_docs/version-0.0.11/user-guides/team-leader.md create mode 100644 website/versioned_sidebars/version-0.0.11-sidebars.json diff --git a/website/versioned_docs/version-0.0.11/architecture.md b/website/versioned_docs/version-0.0.11/architecture.md new file mode 100644 index 0000000..dbbe50d --- /dev/null +++ b/website/versioned_docs/version-0.0.11/architecture.md @@ -0,0 +1,418 @@ +--- +sidebar_position: 5 +--- + +# Architecture + +This document provides a technical overview of Bison's architecture, designed with high cohesion and low coupling principles for maintainability and scalability. + +## System Overview + +### High-Level Architecture + +```mermaid +graph TB + subgraph PRESENT[Presentation Layer] + WEB[Web UI
React 18 + Ant Design 5] + CLI[kubectl / API Client] + end + + subgraph GATEWAY[API Gateway Layer] + GW[API Server
Go + Gin Framework] + AUTH[Auth Middleware
JWT + OIDC] + end + + subgraph BUSINESS[Business Logic Layer] + TS[Tenant Service
Team & Project CRUD] + BS[Billing Service
Cost Calculation] + BLS[Balance Service
Wallet Management] + QS[Quota Service
Resource Limits] + AS[Alert Service
Notifications] + RS[Report Service
Analytics] + end + + subgraph INTEGRATION[Integration Layer] + K8S[Kubernetes Client
client-go] + OCC[OpenCost Client
REST API] + PC[Prometheus Client
PromQL] + end + + subgraph EXTERNAL[External Systems] + KAPI[Kubernetes API] + CAP[Capsule Controller] + OC[OpenCost] + PROM[Prometheus] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps
Persistent Storage] + end + + WEB --> GW + CLI --> GW + GW --> AUTH + AUTH --> TS & BS & BLS & QS & AS & RS + + TS --> K8S + BS --> OCC + BLS --> K8S + QS --> K8S + RS --> OCC & PC + + K8S --> KAPI + K8S --> CAP + OCC --> OC + PC --> PROM + + TS & BLS --> CM + KAPI --> CM +``` + +### Design Principles + +| Principle | Implementation | +|-----------|----------------| +| **High Cohesion** | Each service handles a single domain (billing, quota, alerts) | +| **Low Coupling** | Services communicate via well-defined interfaces | +| **Stateless API** | All state persisted in Kubernetes ConfigMaps | +| **Cloud Native** | Leverages Kubernetes primitives for HA and scaling | +| **Zero Database** | ConfigMaps eliminate external database dependencies | + +## Architecture Layers + +Bison follows a layered architecture pattern: + +### 1. Presentation Layer +- **Web UI**: React 18 + TypeScript + Ant Design 5 +- **API Client**: REST API for external integrations + +### 2. API Gateway Layer +- **API Server**: Go + Gin framework +- **Authentication**: JWT and OIDC support +- **Middleware**: Logging, recovery, CORS handling + +### 3. Business Logic Layer +- **Tenant Service**: Team and project management +- **Billing Service**: Cost calculation and aggregation +- **Balance Service**: Wallet management and auto-deduction +- **Quota Service**: Resource limit enforcement +- **Alert Service**: Multi-channel notifications +- **Report Service**: Analytics and export + +### 4. Integration Layer +- **Kubernetes Client**: client-go for K8s API interaction +- **OpenCost Client**: REST API for cost data +- **Prometheus Client**: PromQL queries for metrics + +### 5. 
Data Layer +- **ConfigMaps**: Persistent storage for balances, billing config, and metadata +- **etcd**: Backing store via Kubernetes ConfigMaps + +## Core Components + +### API Server + +The API server is the central component that handles all HTTP requests: + +**Technology Stack:** +- Go 1.24+ +- Gin web framework +- client-go for Kubernetes API + +**Key Responsibilities:** +- Serve REST API endpoints +- Authentication and authorization +- Request routing and middleware +- Background task scheduling + +**Endpoints:** +``` +/api/v1/teams - Team management +/api/v1/projects - Project management +/api/v1/billing - Billing configuration +/api/v1/balance - Balance operations +/api/v1/stats - Statistics and reports +``` + +### Web UI + +React-based single-page application: + +**Technology Stack:** +- React 18 +- TypeScript +- Vite (build tool) +- Ant Design 5 +- ECharts (visualization) +- React Query (state management) + +**Features:** +- Dashboard with real-time metrics +- Team and project management +- Billing configuration +- Balance monitoring +- Usage reports and export + +### Billing Service + +Calculates costs based on resource usage: + +**Data Flow:** +```mermaid +sequenceDiagram + participant S as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant K8S as Kubernetes + + S->>BS: Trigger billing (every 10min) + BS->>OC: Query team costs + OC-->>BS: Return usage data + BS->>BS: Calculate cost + BS->>BLS: Deduct from balance + BLS->>K8S: Update ConfigMap + BLS-->>BS: Confirm + BS->>AS: Check threshold + alt Balance low + AS->>Webhook: Send alert + end +``` + +**Billing Formula:** +``` +Total Cost = (CPU_cores × CPU_price × hours) + + (Memory_GB × Memory_price × hours) + + (GPU_count × GPU_price × hours) +``` + +### Balance Service + +Manages team wallets and auto-deduction: + +**Features:** +- Real-time balance tracking +- Auto-deduction based on usage +- Recharge operations +- Transaction 
history +- Auto-suspension when balance depleted + +**Storage:** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: bison-team-balances + namespace: bison-system +data: + ml-team: "1523.45" + data-team: "890.12" + dev-team: "2100.00" +``` + +### Tenant Service + +Manages teams (Capsule Tenants) and projects (Namespaces): + +**Features:** +- Create/delete teams +- Assign resource quotas +- Configure node pools (shared/exclusive) +- Manage team metadata + +**Capsule Integration:** +```yaml +apiVersion: capsule.clastix.io/v1beta1 +kind: Tenant +metadata: + name: ml-team +spec: + owners: + - name: team-leader + kind: User + resourceQuota: + items: + - hard: + cpu: "20" + memory: 64Gi + nvidia.com/gpu: "4" +``` + +## Data Flow + +### Team Creation Flow + +```mermaid +sequenceDiagram + participant U as Admin + participant API as API Server + participant TS as Tenant Service + participant K8S as Kubernetes + participant CAP as Capsule + + U->>API: POST /api/v1/teams + API->>TS: CreateTeam(name, quota, balance) + TS->>CAP: Create Tenant + CAP-->>TS: Tenant created + TS->>K8S: Create ConfigMap (balance) + K8S-->>TS: ConfigMap created + TS-->>API: Success + API-->>U: 201 Created +``` + +### Billing Cycle Flow + +```mermaid +sequenceDiagram + participant SCH as Scheduler + participant BS as Billing Service + participant OC as OpenCost + participant BLS as Balance Service + participant AS as Alert Service + + loop Every 10 minutes + SCH->>BS: Trigger billing calculation + BS->>OC: Query costs (last 10min) + OC-->>BS: Return usage metrics + BS->>BS: Calculate total cost + BS->>BLS: Deduct cost from balance + BLS->>BLS: Update balance + BLS-->>BS: Balance updated + BS->>AS: Check balance threshold + alt Balance < 20% + AS->>AS: Send low balance alert + end + alt Balance <= 0 + AS->>BS: Suspend team workloads + end + end +``` + +## Integration Points + +### Kubernetes Integration + +Bison integrates deeply with Kubernetes: + +- **Capsule Tenants** for multi-tenancy +- 
**ResourceQuotas** for limit enforcement +- **Namespaces** for project isolation +- **ConfigMaps** for data persistence +- **RBAC** for access control + +### OpenCost Integration + +Real-time cost tracking via OpenCost API: + +```bash +# Query team costs +GET /allocation/compute?window=10m&aggregate=namespace&filter=namespace:ml-team + +# Response +{ + "ml-team": { + "cpuCost": 0.25, + "memCost": 0.10, + "gpuCost": 4.17, + "totalCost": 4.52 + } +} +``` + +### Prometheus Integration + +Metrics collection for monitoring: + +- Resource utilization metrics +- Cost metrics +- Balance metrics +- Alert metrics + +## Deployment Architecture + +### High Availability Setup + +```mermaid +graph TB + subgraph K8S[Kubernetes Cluster] + subgraph NS1[bison-system namespace] + API1[API Server Pod 1] + API2[API Server Pod 2] + WEB1[Web UI Pod 1] + WEB2[Web UI Pod 2] + end + + subgraph NS2[opencost-system namespace] + OC[OpenCost] + end + + subgraph NS3[prometheus-system namespace] + PROM[Prometheus] + end + + LB[LoadBalancer] + end + + LB --> API1 & API2 + LB --> WEB1 & WEB2 + API1 & API2 --> OC + OC --> PROM +``` + +### Resource Requirements + +**Minimum:** +- API Server: 200m CPU, 256Mi Memory +- Web UI: 100m CPU, 128Mi Memory + +**Recommended (Production):** +- API Server: 1000m CPU, 512Mi Memory (2 replicas) +- Web UI: 500m CPU, 256Mi Memory (2 replicas) + +## Security Model + +### Authentication +- JWT token-based authentication +- OIDC/SSO integration +- Admin user management + +### Authorization +- Kubernetes RBAC integration +- Role-based access control +- Team-scoped permissions + +### Data Security +- Data encrypted at rest when Kubernetes etcd encryption is configured for the cluster (covering ConfigMaps and Secrets) +- TLS for API communication +- Secret management via Kubernetes Secrets + +## Technology Stack + +### Backend +- **Language**: Go 1.24+ +- **Framework**: Gin +- **Kubernetes Client**: client-go +- **Configuration**: Viper +- **Logging**: Logrus + +### Frontend +- **Framework**: React 18 +- **Language**: TypeScript +- **UI 
Library**: Ant Design 5 +- **Build Tool**: Vite +- **State Management**: React Query +- **Charts**: ECharts + +### Infrastructure +- **Platform**: Kubernetes 1.22+ +- **Multi-Tenancy**: Capsule +- **Cost Tracking**: OpenCost +- **Metrics**: Prometheus +- **Storage**: ConfigMaps (etcd) + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison +- [Configuration](configuration.md) - Configure billing +- [User Guides](user-guides/admin.md) - Learn to use Bison +- [Features](features.md) - Explore capabilities diff --git a/website/versioned_docs/version-0.0.11/configuration.md b/website/versioned_docs/version-0.0.11/configuration.md new file mode 100644 index 0000000..539aaa3 --- /dev/null +++ b/website/versioned_docs/version-0.0.11/configuration.md @@ -0,0 +1,361 @@ +--- +sidebar_position: 6 +--- + +# Configuration + +This guide covers how to configure Bison for your specific environment and requirements. + +## Helm Chart Configuration + +Bison is configured primarily through Helm values. You can customize the installation by providing a `values.yaml` file or using `--set` flags. 
+ +### Key Configuration Parameters + +| Parameter | Description | Default | Example | +|-----------|-------------|---------|---------| +| `auth.enabled` | Enable authentication | `false` | `true` | +| `auth.admin.username` | Admin username | `admin` | `admin` | +| `auth.admin.password` | Admin password | `admin` | `changeme` | +| `apiServer.replicaCount` | API server replicas | `2` | `3` | +| `apiServer.image.repository` | API server image | `ghcr.io/supermarioyl/bison/api-server` | - | +| `apiServer.image.tag` | API server image tag | `0.0.1` | `latest` | +| `webUI.replicaCount` | Web UI replicas | `2` | `3` | +| `webUI.image.repository` | Web UI image | `ghcr.io/supermarioyl/bison/web-ui` | - | +| `webUI.image.tag` | Web UI image tag | `0.0.1` | `latest` | +| `opencost.url` | OpenCost API endpoint | `http://opencost.opencost-system.svc:9003` | Custom URL | + +### Example Custom Values + +Create a `custom-values.yaml` file: + +```yaml +# Authentication +auth: + enabled: true + admin: + username: admin + password: MySecurePassword123 + +# API Server +apiServer: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 512Mi + +# Web UI +webUI: + replicaCount: 3 + image: + tag: 0.0.1 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +# OpenCost Integration +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Node Selection (optional) +nodeSelector: + node-role.kubernetes.io/control-plane: "" + +# Tolerations (optional) +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule +``` + +Install with custom values: + +```bash +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --values custom-values.yaml +``` + +## Billing Configuration + +Billing settings are configured through the Web UI or API after installation. + +### Access Billing Configuration + +1. 
**Via Web UI:** + - Navigate to **Settings** > **Billing Configuration** + - Set pricing for CPU, Memory, GPU, and other resources + - Configure currency and billing intervals + +2. **Via API:** + ```bash + curl -X POST http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50 + }, + "billingInterval": "hourly" + }' + ``` + +### Billing Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `enabled` | Enable/disable billing | `true` | +| `currency` | Currency for billing | `USD`, `CNY`, `EUR` | +| `pricing.cpu` | CPU price per core-hour | `0.05` | +| `pricing.memory` | Memory price per GB-hour | `0.01` | +| `pricing["nvidia.com/gpu"]` | GPU price per GPU-hour | `2.50` | +| `billingInterval` | Billing aggregation period | `hourly`, `daily` | +| `lowBalanceThreshold` | Warning threshold (%) | `20` | +| `suspendThreshold` | Auto-suspend threshold (%) | `5` | + +### Example Billing Configuration + +```json +{ + "enabled": true, + "currency": "USD", + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "nvidia.com/mig-1g.5gb": 0.50, + "nvidia.com/mig-2g.10gb": 1.00 + }, + "billingInterval": "hourly", + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Team Configuration + +### Creating Teams + +Teams can be created through the Web UI or API: + +**Via Web UI:** +1. Navigate to **Teams** page +2. Click **Create Team** +3. 
Set team name, quota, and initial balance + +**Via API:** +```bash +curl -X POST http://localhost:8080/api/v1/teams \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-team", + "description": "Machine Learning Team", + "quota": { + "cpu": "20", + "memory": "64Gi", + "nvidia.com/gpu": "4" + }, + "balance": 1000.00 + }' +``` + +### Team Quotas + +Team quotas define resource limits: + +```yaml +quota: + cpu: "20" # 20 CPU cores + memory: "64Gi" # 64 GB RAM + nvidia.com/gpu: "4" # 4 GPUs + storage: "500Gi" # 500 GB storage +``` + +### Team Balance Management + +Set initial balance and configure auto-recharge: + +```json +{ + "balance": 1000.00, + "autoRecharge": { + "enabled": true, + "amount": 500.00, + "schedule": "monthly", + "threshold": 100.00 + } +} +``` + +## Alert Configuration + +Configure multi-channel alerts for low balance and quota warnings. + +### Webhook Alerts + +```json +{ + "type": "webhook", + "enabled": true, + "url": "https://your-webhook-endpoint.com/alerts", + "headers": { + "Authorization": "Bearer YOUR_TOKEN" + }, + "template": { + "title": "Bison Alert", + "message": "Team {{.TeamName}} balance is {{.Balance}}" + } +} +``` + +### DingTalk Alerts + +```json +{ + "type": "dingtalk", + "enabled": true, + "webhook": "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN", + "secret": "YOUR_SECRET" +} +``` + +### WeChat Work Alerts + +```json +{ + "type": "wechat", + "enabled": true, + "corpid": "YOUR_CORP_ID", + "corpsecret": "YOUR_CORP_SECRET", + "agentid": 1000001 +} +``` + +## OpenCost Integration + +Configure OpenCost connection: + +### Check OpenCost Connectivity + +```bash +# Test OpenCost API +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz + +# Test allocation API +curl http://localhost:9003/allocation/compute?window=1d +``` + +### Update OpenCost URL + +If OpenCost is deployed in a different namespace or with a different service name: + +```bash +helm upgrade bison 
bison/bison \ + --set opencost.url=http://my-opencost.custom-namespace.svc:9003 \ + --namespace bison-system +``` + +## Authentication & OIDC + +Enable authentication and integrate with your SSO provider: + +### Basic Authentication + +```yaml +auth: + enabled: true + admin: + username: admin + password: SecurePassword123 +``` + +### OIDC Integration + +```yaml +auth: + enabled: true + oidc: + enabled: true + issuerURL: https://your-oidc-provider.com + clientID: bison-client-id + clientSecret: your-client-secret + redirectURL: https://bison.example.com/callback +``` + +## Environment Variables + +Additional configuration can be provided via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `KUBECONFIG` | Path to kubeconfig file | In-cluster config | +| `OPENCOST_URL` | OpenCost API URL | `http://opencost.opencost-system.svc:9003` | +| `AUTH_ENABLED` | Enable authentication | `false` | +| `LOG_LEVEL` | Logging level | `info` | +| `BILLING_INTERVAL` | Billing calculation interval | `10m` | + +Set environment variables in Helm values: + +```yaml +apiServer: + env: + - name: LOG_LEVEL + value: debug + - name: BILLING_INTERVAL + value: 5m +``` + +## Advanced Configuration + +### Custom Resource Pricing + +Price any Kubernetes resource: + +```json +{ + "pricing": { + "cpu": 0.05, + "memory": 0.01, + "nvidia.com/gpu": 2.50, + "amd.com/gpu": 2.00, + "ephemeral-storage": 0.001, + "custom.io/fpga": 5.00 + } +} +``` + +### Multi-Cluster Support + +Deploy Bison in each cluster with shared billing: + +```yaml +# Cluster A +apiServer: + clusterName: prod-us-west + +# Cluster B +apiServer: + clusterName: prod-us-east +``` + +## Next Steps + +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design +- [Features](features.md) - Explore all capabilities diff --git a/website/versioned_docs/version-0.0.11/features.md 
b/website/versioned_docs/version-0.0.11/features.md new file mode 100644 index 0000000..8d0c438 --- /dev/null +++ b/website/versioned_docs/version-0.0.11/features.md @@ -0,0 +1,195 @@ +--- +sidebar_position: 2 +--- + +# Features + +Bison provides a comprehensive suite of features for GPU resource management, billing, and multi-tenant isolation in Kubernetes environments. + +## See Bison in Action + +### 🎯 Real-Time Resource Dashboard + +![Bison Dashboard](/img/ui-dashboard.png) + +**What you see:** +- **Cluster Overview** - Total teams, projects, resource pools, and quotas at a glance +- **Resource Utilization** - Visual breakdown showing which teams are consuming resources +- **7-Day Cost Trends** - Historical cost data to identify spending patterns +- **Top 5 Cost Rankings** - Quickly identify heavy GPU consumers +- **Team Budget Status** - Real-time balance monitoring with color-coded alerts + +**Who benefits:** +- **Platform Administrators** get instant visibility into cluster health and usage patterns +- **Finance Teams** can track costs in real-time without waiting for monthly reports +- **Team Leaders** can compare their usage against other teams + +--- + +### 💼 Team Management & Budget Monitoring + +![Team Management](/img/ui-team.png) + +**What you see:** +- **Team List** with real-time status indicators: + - 🟢 Green balance = Healthy budget + - 🟡 Yellow balance = Approaching threshold + - 🔴 Red balance = Low balance or suspended +- **Resource Allocation** - CPU/Memory/GPU quotas per team (e.g., "cpu 0/10" means 0 used out of 10 allocated) +- **Project Count** - Number of namespaces/projects under each team +- **Quick Actions** - Edit quotas, recharge balance, or delete team with one click + +**Who benefits:** +- **Team Leaders** monitor their budget status and resource usage at a glance +- **Administrators** manage multiple teams from a single unified view +- **Finance Teams** see which teams need recharging + +--- + +### 💰 Flexible Billing Configuration 
+ +![Billing Configuration](/img/ui-billing.png) + +**What you see:** +- **Per-Resource Pricing** - Set custom prices for CPU (per core-hour), Memory (per GB-hour), GPU (per GPU-hour) +- **Currency Selection** - Support for CNY, USD, EUR, and other currencies +- **Enable/Disable Toggle** - Turn billing on/off for specific resources with one click +- **Billing Rules** - Define how resources are metered (hourly, daily, etc.) +- **Alert Thresholds** - Configure when to send low-balance warnings + +**Who benefits:** +- **Finance Teams** align cloud costs with internal chargeback policies +- **Administrators** adjust pricing based on actual hardware costs +- **Budget Managers** set appropriate warning thresholds to prevent overruns + +--- + +## Core Capabilities + +### Multi-Tenant Management +✅ **Capsule-Powered Isolation** - True multi-tenancy using Kubernetes-native Capsule operator +✅ **OIDC Integration** - Enterprise SSO support for authentication +✅ **Team-Based Access Control** - Manage users, roles, and permissions per team +✅ **Shared & Exclusive Node Pools** - Flexible resource allocation strategies + +### Real-Time Billing +✅ **Usage-Based Billing** - Accurate cost tracking based on actual resource consumption +✅ **Configurable Pricing** - Set custom rates for CPU, Memory, GPU, and any Kubernetes resource +✅ **Multi-Currency Support** - CNY, USD, EUR, and more +✅ **Billing Rules Engine** - Define custom billing logic and aggregation periods + +### Dynamic Resource Quotas +✅ **Per-Team Quotas** - CPU, Memory, GPU, Storage, and custom resources +✅ **Namespace Quotas** - Project-level resource limits within teams +✅ **Auto-Enforcement** - Kubernetes-native quota enforcement +✅ **Quota Alerts** - Notifications when approaching limits + +### Team Balance & Wallet System +✅ **Prepaid Balances** - Team wallets with real-time deduction +✅ **Auto-Deduction** - Automated billing based on resource usage +✅ **Balance Thresholds** - Configurable warning and suspension 
levels +✅ **Transaction History** - Complete audit trail of all balance changes + +### Auto-Recharge +✅ **Scheduled Top-Ups** - Weekly or monthly automatic recharges +✅ **Custom Amounts** - Flexible recharge amounts per team +✅ **Recharge Notifications** - Alert teams when balance is added + +### Balance Alerts +✅ **Multi-Channel Notifications** - Webhook, DingTalk, WeChat, Email +✅ **Configurable Thresholds** - Set warning levels (e.g., 20%, 10%, 5%) +✅ **Auto-Suspension** - Automatically suspend workloads when balance depleted +✅ **Custom Templates** - Customize alert messages + +### Usage Reports +✅ **Team Analytics** - Per-team cost breakdowns and trends +✅ **Project Analytics** - Namespace-level resource consumption +✅ **Export Capabilities** - CSV, Excel, PDF reports +✅ **Historical Data** - 30/60/90-day cost analysis + +### Audit Logging +✅ **Complete Operation History** - Track all administrative actions +✅ **User Attribution** - Who did what and when +✅ **Resource Changes** - Track quota, balance, and configuration changes +✅ **Compliance Ready** - Meet internal audit requirements + +--- + +## Architecture Highlights + +Bison's architecture is designed for simplicity, scalability, and zero external dependencies. + +```mermaid +graph TB + subgraph USER_LAYER[User Layer] + UI[Web UI
React + Ant Design] + API[REST API<br/>Go + Gin] + end + + subgraph CORE[Core Services] + BS[Billing Service] + TS[Tenant Service] + QS[Quota Service] + end + + subgraph K8S[Kubernetes Layer] + CA[Capsule<br/>Multi-Tenancy] + OC[OpenCost<br/>Cost Tracking] + PR[Prometheus<br/>Metrics] + end + + subgraph DATA[Data Layer] + CM[ConfigMaps
Zero Database] + end + + UI --> API + API --> BS & TS & QS + BS --> OC + TS --> CA + QS --> CA + BS & TS --> CM + OC --> PR +``` + +### Key Architectural Benefits + +- **Zero External Dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) +- **Cloud-Native** - Built on Kubernetes primitives for maximum portability +- **Scalable** - Stateless API server that can scale horizontally +- **Secure** - Kubernetes RBAC integration and optional authentication +- **Observable** - Prometheus metrics and structured logging +- **Extensible** - Plugin architecture for custom billing rules and alerts + +--- + +## Integration Points + +### OpenCost Integration +Bison leverages [OpenCost](https://www.opencost.io/) for real-time cost tracking: +- Per-pod, per-namespace, per-team cost visibility +- GPU utilization metrics +- Historical cost data and trends +- Integration with Prometheus for metric collection + +### Capsule Integration +Bison uses [Capsule](https://capsule.clastix.io/) for multi-tenancy: +- Team-based tenant isolation +- Namespace quota enforcement +- Network and security policies +- OIDC/SSO integration + +### Prometheus Integration +Metrics collection and monitoring: +- Resource utilization tracking +- Custom billing metrics +- Alert rule evaluation +- Historical data retention + +--- + +## Next Steps + +- [Installation Guide](installation.md) - Deploy Bison in your cluster +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Deep dive into system design +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.11/installation.md b/website/versioned_docs/version-0.0.11/installation.md new file mode 100644 index 0000000..da89191 --- /dev/null +++ b/website/versioned_docs/version-0.0.11/installation.md @@ -0,0 +1,316 @@ +--- +sidebar_position: 3 +--- + +# Installation Guide + +This guide provides detailed instructions for installing Bison in 
your Kubernetes cluster. + +## Prerequisites + +Before installing Bison, ensure you have: + +- **Kubernetes 1.22+** - A running Kubernetes cluster +- **kubectl** - Configured to access your cluster +- **Helm 3.0+** - Package manager for Kubernetes (3.8.0+ is required for the recommended OCI install from GHCR) +- **Capsule Operator v0.1.0+** - For multi-tenant isolation +- **OpenCost** - Deployed with Prometheus for cost tracking + +### Install Prerequisites + +If you haven't installed the required components: + +#### Install Capsule + +```bash +# Using Helm +helm repo add projectcapsule https://projectcapsule.github.io/charts +helm install capsule projectcapsule/capsule \ + --namespace capsule-system \ + --create-namespace +``` + +#### Install OpenCost + +```bash +# Using Helm +helm repo add opencost https://opencost.github.io/opencost-helm-chart +helm install opencost opencost/opencost \ + --namespace opencost-system \ + --create-namespace \ + --set prometheus.internal.serviceName=prometheus-server \ + --set prometheus.internal.namespaceName=prometheus-system +``` + +## Installation Methods + +Bison Helm charts are distributed via **GitHub Container Registry (GHCR)** using the modern OCI format. 
+ +**Requirements:** +- Helm >= 3.8.0 (for OCI support) +- Kubernetes >= 1.22 + +### Option A: From GHCR (Recommended) + +The simplest way to install Bison is directly from GitHub Container Registry: + +```bash +# Install specific version from GHCR +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace + +# Or pull the chart first, then install +helm pull oci://ghcr.io/supermarioyl/bison/bison --version 0.0.2 +helm install bison bison-0.0.2.tgz \ + --namespace bison-system \ + --create-namespace + +# Customize installation +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --version 0.0.2 \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true \ + --set apiServer.image.tag=0.0.2 \ + --set webUI.image.tag=0.0.2 +``` + +**Why GHCR OCI Format?** +- ✅ No separate Helm repository maintenance needed +- ✅ Unified with Docker images in GHCR +- ✅ Faster installation (direct registry pull) +- ✅ Modern Helm 3.8+ standard practice + +### Option B: From GitHub Release + +Download a specific version from GitHub Releases: + +```bash +# Download Helm chart +VERSION=0.0.2 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install the chart +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +### Option C: From Source + +Clone and build from source: + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +## Configuration Options + +Bison can be configured using Helm values. 
Here are the key configuration options: + +### Basic Configuration + +```yaml +# values.yaml +apiServer: + image: + repository: ghcr.io/supermarioyl/bison/api-server + tag: 0.0.1 + replicas: 2 + +webUI: + image: + repository: ghcr.io/supermarioyl/bison/web-ui + tag: 0.0.1 + replicas: 2 + +# OpenCost URL +opencost: + url: http://opencost.opencost-system.svc:9003 + +# Authentication +auth: + enabled: false +``` + +### Custom Configuration Example + +```bash +helm install bison oci://ghcr.io/supermarioyl/bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set apiServer.replicas=3 \ + --set webUI.replicas=3 \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=true +``` + +## Verify Installation + +After installation, verify that all components are running: + +```bash +# Check pod status +kubectl get pods -n bison-system + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# bison-api-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +# bison-webui-xxxxxxxxx-xxxxx 1/1 Running 0 2m + +# Check services +kubectl get svc -n bison-system + +# Check logs +kubectl logs -n bison-system deployment/bison-api-server +kubectl logs -n bison-system deployment/bison-webui +``` + +## Access the Platform + +### Port Forward (Development) + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +``` + +### Ingress (Production) + +For production deployments, configure an Ingress: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: bison-ingress + namespace: bison-system + annotations: + kubernetes.io/ingress.class: nginx +spec: + rules: + - host: bison.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bison-webui + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: bison-api-server + port: + number: 8080 +``` + +Apply the Ingress: + +```bash +kubectl apply -f ingress.yaml +``` + 
+## Docker Images + +Bison images are available on GitHub Container Registry: + +```bash +# Pull images +docker pull ghcr.io/supermarioyl/bison/api-server:0.0.1 +docker pull ghcr.io/supermarioyl/bison/web-ui:0.0.1 + +# Or use latest +docker pull ghcr.io/supermarioyl/bison/api-server:latest +docker pull ghcr.io/supermarioyl/bison/web-ui:latest +``` + +**Supported Platforms:** +- `linux/amd64` +- `linux/arm64` + +## Upgrading + +To upgrade Bison to a new version: + +```bash +# Update Helm repository +helm repo update + +# Upgrade to latest version +helm upgrade bison bison/bison --namespace bison-system + +# Or upgrade to specific version +helm upgrade bison bison/bison --version 0.0.2 --namespace bison-system +``` + +## Uninstalling + +To completely remove Bison: + +```bash +# Uninstall Helm release +helm uninstall bison --namespace bison-system + +# Remove namespace (optional) +kubectl delete namespace bison-system +``` + +## Troubleshooting + +### Pod Not Starting + +Check pod logs for errors: + +```bash +kubectl logs -n bison-system deployment/bison-api-server +kubectl describe pod -n bison-system +``` + +### Cannot Connect to OpenCost + +Verify OpenCost is running and accessible: + +```bash +kubectl get svc -n opencost-system +kubectl port-forward -n opencost-system svc/opencost 9003:9003 + +# Test endpoint +curl http://localhost:9003/healthz +``` + +### Authentication Issues + +If authentication is enabled, ensure you have the correct credentials: + +```bash +# Default credentials (change in production!) 
+Username: admin +Password: admin +``` + +## Next Steps + +- [Configuration Guide](configuration.md) - Configure billing and settings +- [User Guides](user-guides/admin.md) - Learn how to use Bison +- [Architecture](architecture.md) - Understand the system design diff --git a/website/versioned_docs/version-0.0.11/intro.md b/website/versioned_docs/version-0.0.11/intro.md new file mode 100644 index 0000000..b3dc3aa --- /dev/null +++ b/website/versioned_docs/version-0.0.11/intro.md @@ -0,0 +1,167 @@ +--- +sidebar_position: 1 +slug: / +--- + +# Introduction to Bison + +![Bison Logo](/img/logo.png) + +**Enterprise GPU Resource Billing & Multi-Tenant Management Platform** + +Bison is a Kubernetes-based platform that provides comprehensive GPU resource management, billing, and multi-tenant isolation for organizations running shared GPU clusters. + +## The GPU Management Challenge + +Managing shared GPU clusters across multiple teams creates critical operational and financial challenges: + +**For Platform Administrators:** +- How do you fairly allocate expensive GPU resources across competing teams? +- How do you prevent resource hogging while ensuring everyone gets their fair share? +- How do you track who's using what and implement accurate chargeback? +- How do you maintain strict multi-tenant isolation without complex manual configuration? + +**For Finance & Budget Teams:** +- How do you implement automated chargeback for GPU usage without manual accounting? +- How do you prevent budget overruns before they happen? +- How do you generate accurate cost reports for internal billing? + +**For Development Teams:** +- How do you get predictable, isolated access to GPU resources? +- How do you know when you're approaching your budget limits? +- How do you avoid impacting other teams' workloads? 
+ +**Traditional Approach:** +- Manual quota configuration per namespace +- Excel-based billing calculations +- No real-time cost visibility +- Complex multi-tool setup (quota management + cost tracking + billing system) +- Frequent resource conflicts and budget surprises + +## Bison's Integrated Solution + +```mermaid +graph TB + subgraph WITHOUT["Without Bison"] + P1[❌ Manual Quota Management<br/>Per-namespace configuration] + P2[❌ Spreadsheet Billing<br/>Manual calculations & reports] + P3[❌ No Resource Isolation<br/>Teams compete for resources] + P4[❌ Budget Overruns<br/>No proactive alerts] + P5[❌ Complex Tooling<br/>Multiple systems to manage] + end + + subgraph WITH["With Bison"] + S1[✅ Automated Team Quotas<br/>Capsule-powered isolation] + S2[✅ Real-Time Billing<br/>OpenCost integration] + S3[✅ True Multi-Tenancy<br/>Shared/Exclusive modes] + S4[✅ Proactive Alerts<br/>Balance monitoring & auto-suspend] + S5[✅ Unified Platform<br/>Single pane of glass] + end + + P1 -.Transform.-> S1 + P2 -.Transform.-> S2 + P3 -.Transform.-> S3 + P4 -.Transform.-> S4 + P5 -.Transform.-> S5 + + style WITHOUT fill:#ffebee + style WITH fill:#e8f5e9 + style S1 fill:#4caf50,color:#fff + style S2 fill:#4caf50,color:#fff + style S3 fill:#4caf50,color:#fff + style S4 fill:#4caf50,color:#fff + style S5 fill:#4caf50,color:#fff +``` + +**Bison combines:** +- 🔐 **Kubernetes-native multi-tenancy** (Capsule) - True team isolation with shared or exclusive node pools +- 💰 **Real-time cost tracking** (OpenCost + Prometheus) - Per-pod, per-namespace, per-team cost visibility +- 💳 **Automated billing & budgets** - Prepaid balances, auto-deduction, low-balance alerts, and auto-suspension +- 📊 **Unified dashboard** - Single interface for admins, team leaders, and finance teams +- 🔧 **Zero external dependencies** - All data stored in Kubernetes ConfigMaps (etcd-backed) + +**Result:** Deploy once, get complete GPU resource management with automated billing in under 30 minutes. 
+ +## Quick Start + +### Prerequisites + +- Kubernetes 1.22+ +- kubectl configured +- Helm 3.0+ +- Capsule operator (v0.1.0+) installed +- OpenCost deployed with Prometheus + +### Installation + +Choose one of the following installation methods: + +#### Option A: Using Helm Repository (Recommended) + +```bash +# Add Bison Helm repository +helm repo add bison https://supermarioyl.github.io/Bison/charts/ +helm repo update + +# Install with default configuration +helm install bison bison/bison --namespace bison-system --create-namespace + +# Or customize installation +helm install bison bison/bison \ + --namespace bison-system \ + --create-namespace \ + --set opencost.url=http://opencost.opencost-system.svc:9003 \ + --set auth.enabled=false +``` + +#### Option B: From GitHub Release + +```bash +# Download latest Helm chart +VERSION=0.0.1 +wget https://github.com/SuperMarioYL/Bison/releases/download/v${VERSION}/bison-${VERSION}.tgz + +# Install +helm install bison bison-${VERSION}.tgz \ + --namespace bison-system \ + --create-namespace +``` + +#### Option C: From Source + +```bash +# Clone repository +git clone https://github.com/SuperMarioYL/Bison.git +cd Bison + +# Install dependencies and build +make install-deps +make build + +# Deploy using Helm +helm install bison ./deploy/charts/bison \ + --namespace bison-system \ + --create-namespace +``` + +### Access the Platform + +After installation, access Bison through: + +```bash +# Port-forward the Web UI +kubectl port-forward -n bison-system svc/bison-webui 3000:80 + +# Access at http://localhost:3000 +# Default credentials (if auth enabled): +# Username: admin +# Password: admin (change immediately in production!) 
+``` + +## Next Steps + +- [Explore Features](features.md) - Learn about all capabilities +- [Installation Guide](installation.md) - Detailed installation instructions +- [User Guides](user-guides/admin.md) - Role-based user guides +- [Architecture](architecture.md) - Understand the system architecture +- [Configuration](configuration.md) - Configure billing and settings diff --git a/website/versioned_docs/version-0.0.11/user-guides/_category_.json b/website/versioned_docs/version-0.0.11/user-guides/_category_.json new file mode 100644 index 0000000..fe79f61 --- /dev/null +++ b/website/versioned_docs/version-0.0.11/user-guides/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "User Guides", + "position": 4, + "link": { + "type": "generated-index", + "description": "Role-based guides for using Bison effectively." + } +} diff --git a/website/versioned_docs/version-0.0.11/user-guides/admin.md b/website/versioned_docs/version-0.0.11/user-guides/admin.md new file mode 100644 index 0000000..a466cbb --- /dev/null +++ b/website/versioned_docs/version-0.0.11/user-guides/admin.md @@ -0,0 +1,176 @@ +--- +sidebar_position: 1 +--- + +# Administrator Guide + +This guide is for platform administrators who deploy, configure, and manage the Bison platform. + +## Responsibilities + +As a platform administrator, you are responsible for: + +- ✅ Deploying and configuring Bison +- ✅ Creating and managing teams +- ✅ Setting global billing configuration +- ✅ Monitoring cluster-wide metrics +- ✅ Responding to alerts and recharge requests + +## Getting Started + +### 1. Deploy Bison + +Follow the [Installation Guide](../installation.md) to deploy Bison in your Kubernetes cluster. + +### 2. Configure Billing + +Set up billing rules and pricing: + +1. Access the Web UI +2. Navigate to **Settings** > **Billing Configuration** +3. Configure: + - **Currency**: USD, CNY, EUR, etc. + - **CPU Price**: Cost per core-hour + - **Memory Price**: Cost per GB-hour + - **GPU Price**: Cost per GPU-hour +4. 
Click **Save** + +### 3. Create First Team + +Create a team for your users: + +1. Navigate to **Teams** page +2. Click **Create Team** +3. Fill in: + - **Team Name**: e.g., "ml-team" + - **Description**: Team purpose + - **Resource Quota**: + - CPU: e.g., "20" cores + - Memory: e.g., "64Gi" + - GPU: e.g., "4" + - **Initial Balance**: e.g., 1000.00 +4. Click **Create** + +## Common Tasks + +### Managing Teams + +#### View All Teams + +```bash +# Via kubectl +kubectl get tenants + +# Via API +curl http://localhost:8080/api/v1/teams +``` + +#### Update Team Quota + +1. Navigate to **Teams** page +2. Click **Edit** on the team row +3. Modify quotas +4. Click **Save** + +#### Recharge Team Balance + +1. Navigate to **Teams** page +2. Click **Recharge** on the team row +3. Enter amount +4. Add notes (optional) +5. Click **Confirm** + +### Monitoring + +#### View Dashboard + +Access real-time cluster metrics: +- Total teams and projects +- Resource utilization +- Cost trends +- Top consumers +- Balance status + +#### Check Alerts + +Monitor low-balance and quota alerts: +1. Navigate to **Alerts** page +2. Review active alerts +3. 
Take action as needed + +### Billing Configuration + +#### Update Pricing + +```bash +curl -X PUT http://localhost:8080/api/v1/billing/config \ + -H "Content-Type: application/json" \ + -d '{ + "pricing": { + "cpu": 0.06, + "memory": 0.012, + "nvidia.com/gpu": 3.00 + } + }' +``` + +#### Configure Alert Thresholds + +```json +{ + "lowBalanceThreshold": 20, + "suspendThreshold": 5, + "alertChannels": ["webhook", "dingtalk"] +} +``` + +## Best Practices + +### Team Naming +- Use lowercase, alphanumeric characters and hyphens +- Example: `ml-team`, `data-science`, `dev-team` + +### Quota Allocation +- Start with conservative quotas +- Monitor usage for 1-2 weeks +- Adjust based on actual needs + +### Balance Management +- Set up auto-recharge for critical teams +- Monitor balance trends weekly +- Respond to low-balance alerts promptly + +### Security +- Enable authentication in production +- Use OIDC/SSO for enterprise deployments +- Regularly audit user permissions + +## Troubleshooting + +### Team Creation Failed + +Check Capsule operator logs: +```bash +kubectl logs -n capsule-system deployment/capsule-controller-manager +``` + +### Billing Not Working + +Verify OpenCost connectivity: +```bash +kubectl port-forward -n opencost-system svc/opencost 9003:9003 +curl http://localhost:9003/healthz +``` + +### High Resource Usage + +Check resource consumption: +```bash +kubectl top pods -n bison-system +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Guide for team leaders +- [Developer Guide](developer.md) - Guide for developers +- [Configuration](../configuration.md) - Advanced configuration diff --git a/website/versioned_docs/version-0.0.11/user-guides/developer.md b/website/versioned_docs/version-0.0.11/user-guides/developer.md new file mode 100644 index 0000000..d14ab62 --- /dev/null +++ b/website/versioned_docs/version-0.0.11/user-guides/developer.md @@ -0,0 +1,187 @@ +--- +sidebar_position: 3 +--- + +# Developer Guide + +This guide is for developers 
who deploy workloads and consume resources within team projects. + +## Responsibilities + +As a developer, you are responsible for: + +- ✅ Deploying applications within your project +- ✅ Monitoring resource usage +- ✅ Staying within quota limits +- ✅ Optimizing resource consumption + +## Getting Started + +### 1. Get Kubeconfig + +Request kubeconfig from your team leader or administrator. + +### 2. Set Context + +```bash +# Set context to your project namespace +kubectl config set-context --current --namespace=your-project + +# Verify +kubectl config view --minify | grep namespace +``` + +### 3. Check Quota + +See your available resources: +```bash +kubectl describe quota +``` + +## Deploying Workloads + +### Basic Pod Deployment + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-training-job + namespace: your-project +spec: + containers: + - name: trainer + image: your-ml-image:latest + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" +``` + +### Using Deployments + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ml-inference + namespace: your-project +spec: + replicas: 2 + selector: + matchLabels: + app: ml-inference + template: + metadata: + labels: + app: ml-inference + spec: + containers: + - name: inference + image: your-inference-image:latest + resources: + requests: + cpu: "2" + memory: "8Gi" + nvidia.com/gpu: "1" +``` + +## Monitoring Usage + +### Check Pod Resource Usage + +```bash +# View resource consumption +kubectl top pods + +# Detailed pod information +kubectl describe pod <pod-name> +``` + +### View Logs + +```bash +# Stream logs +kubectl logs -f <pod-name> + +# Previous logs (if pod restarted) +kubectl logs <pod-name> --previous +``` + +## Best Practices + +### Resource Requests and Limits + +Always specify both requests and limits: +```yaml +resources: + requests: + cpu: "2" + memory: "8Gi" + limits: + cpu: "4" + memory: "16Gi" +``` + +### GPU Usage + +- Request GPUs 
only when needed +- Use GPU for compute-intensive tasks +- Monitor GPU utilization + +### Clean Up + +Delete resources when no longer needed: +```bash +# Delete pod +kubectl delete pod <pod-name> + +# Delete deployment +kubectl delete deployment <deployment-name> + +# Clean up completed jobs +kubectl delete job --field-selector status.successful=1 +``` + +### Cost Optimization + +- Right-size your resource requests +- Use horizontal pod autoscaling +- Clean up idle resources +- Share GPUs when possible (if supported) + +## Troubleshooting + +### Pod Pending (Insufficient Quota) + +If your pod is stuck in `Pending` state: + +```bash +kubectl describe pod <pod-name> +``` + +Look for quota-related errors and reduce resource requests or ask your team leader for more quota. + +### Out of Memory (OOM) + +If pods are killed due to OOM: +1. Check memory usage patterns +2. Increase memory limits +3. Optimize application memory usage + +### GPU Not Available + +Verify GPU requests: +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable."nvidia\.com/gpu" +``` + +## Next Steps + +- [Team Leader Guide](team-leader.md) - Understand team management +- [Architecture](../architecture.md) - Learn about the platform diff --git a/website/versioned_docs/version-0.0.11/user-guides/team-leader.md b/website/versioned_docs/version-0.0.11/user-guides/team-leader.md new file mode 100644 index 0000000..7b6de96 --- /dev/null +++ b/website/versioned_docs/version-0.0.11/user-guides/team-leader.md @@ -0,0 +1,126 @@ +--- +sidebar_position: 2 +--- + +# Team Leader Guide + +This guide is for team leaders who manage projects, monitor budgets, and allocate resources within their team. + +## Responsibilities + +As a team leader, you are responsible for: + +- ✅ Creating and managing projects (namespaces) +- ✅ Allocating quotas to projects +- ✅ Monitoring team balance and consumption +- ✅ Requesting recharges when needed + +## Getting Started + +### 1. 
Access Bison + +Log in to the Web UI with your credentials. + +### 2. View Team Dashboard + +Your dashboard shows: +- Team balance and status +- Resource utilization +- Active projects +- Cost trends + +## Managing Projects + +### Create a Project + +1. Navigate to **Projects** page +2. Click **Create Project** +3. Fill in: + - **Project Name**: e.g., "training-ml-models" + - **Description**: Project purpose + - **Quota** (optional): + - CPU: e.g., "8" cores + - Memory: e.g., "32Gi" + - GPU: e.g., "2" +4. Click **Create** + +### List Projects + +```bash +# Via kubectl (if you have access) +kubectl get namespaces -l capsule.clastix.io/tenant=your-team + +# Via API +curl http://localhost:8080/api/v1/teams/your-team/projects +``` + +### Delete a Project + +1. Navigate to **Projects** page +2. Click **Delete** on the project row +3. Confirm deletion + +**Warning**: This will delete all resources in the project! + +## Monitoring Budget + +### Check Balance + +View your current balance: +1. Navigate to **Team** page +2. See balance in the status card + +### View Usage Trends + +Analyze spending patterns: +1. Navigate to **Reports** page +2. Select time range (7 days, 30 days, 90 days) +3. View: + - Cost breakdown by resource type + - Daily cost trends + - Per-project consumption + +### Request Recharge + +When balance is low: +1. Click **Request Recharge** button +2. Enter requested amount +3. Add justification +4. 
Submit request to administrator + +## Resource Management + +### Monitor Quota Usage + +Check how much of your quota is being used: +```bash +kubectl describe quota -n your-project +``` + +### Optimize Costs + +Tips to reduce spending: +- **Right-size resources**: Don't over-provision CPU/Memory +- **Clean up idle pods**: Delete unused workloads +- **Use spot/preemptible instances**: Where applicable +- **Monitor GPU utilization**: Ensure GPUs are fully utilized + +## Best Practices + +### Project Organization +- Create separate projects for different workloads +- Example: `ml-training`, `ml-inference`, `data-processing` + +### Quota Allocation +- Allocate quotas based on project priority +- Reserve buffer for urgent tasks + +### Cost Awareness +- Review costs weekly +- Identify and eliminate waste +- Set up cost alerts + +## Next Steps + +- [Developer Guide](developer.md) - Guide for your team members +- [Features](../features.md) - Explore all Bison features diff --git a/website/versioned_sidebars/version-0.0.11-sidebars.json b/website/versioned_sidebars/version-0.0.11-sidebars.json new file mode 100644 index 0000000..caea0c0 --- /dev/null +++ b/website/versioned_sidebars/version-0.0.11-sidebars.json @@ -0,0 +1,8 @@ +{ + "tutorialSidebar": [ + { + "type": "autogenerated", + "dirName": "." 
+ } + ] +} diff --git a/website/versions.json b/website/versions.json index 2b4b0b6..7f5dfc1 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,4 +1,5 @@ [ + "0.0.11", "0.0.8", "0.0.7", "0.0.6", From 3cbb1c50a9404c301ffde94fc21e568d897fb8a1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 29 Dec 2025 02:10:00 +0000 Subject: [PATCH 22/44] chore: bump version to 0.0.11 [skip ci] Auto-updated version files from release refs/tags/v0.0.11 - Updated Chart.yaml version to 0.0.11 - Updated package.json version to 0.0.11 --- deploy/charts/bison/Chart.yaml | 4 ++-- web-ui/package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/charts/bison/Chart.yaml b/deploy/charts/bison/Chart.yaml index 4bdba29..6169521 100644 --- a/deploy/charts/bison/Chart.yaml +++ b/deploy/charts/bison/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: bison description: Bison - GPU 资源计费平台,基于 Capsule 多租户 + OpenCost 成本追踪 type: application -version: 0.0.10 -appVersion: "0.0.10" +version: 0.0.11 +appVersion: "0.0.11" keywords: - gpu - billing diff --git a/web-ui/package.json b/web-ui/package.json index 3b73a5c..6d34d5f 100644 --- a/web-ui/package.json +++ b/web-ui/package.json @@ -1,6 +1,6 @@ { "name": "bison-web-ui", - "version": "0.0.10", + "version": "0.0.11", "private": true, "scripts": { "dev": "vite", From 1f0073118849d0c20906cb87f47bebc76f008550 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 31 Jan 2026 15:19:29 +0800 Subject: [PATCH 23/44] add --- website/docusaurus.config.ts | 4 ++++ website/static/robots.txt | 6 ++++++ 2 files changed, 10 insertions(+) create mode 100644 website/static/robots.txt diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts index 37ca0de..41b1c0c 100644 --- a/website/docusaurus.config.ts +++ b/website/docusaurus.config.ts @@ -64,6 +64,10 @@ const config: Config = { themeConfig: { // Replace with your project's social card image: "img/bison-social-card.png", + metadata: [ + { name: 
'google-site-verification', content: 'AKwgEpVubW0iVPeZN2RkopUMJbx4K-Yy8aWwCZq-T34' }, + { name: 'baidu-site-verification', content: 'codeva-E3148CbKk3' }, + ], colorMode: { respectPrefersColorScheme: true, }, diff --git a/website/static/robots.txt b/website/static/robots.txt new file mode 100644 index 0000000..44699ff --- /dev/null +++ b/website/static/robots.txt @@ -0,0 +1,6 @@ +User-agent: * +Allow: / + +# Sitemap +Sitemap: https://bison.lei6393.com/sitemap.xml +Sitemap: https://bison.lei6393.com/zh-Hans/sitemap.xml From 8e947972e1ac5339625a9b482d0617963cca41d7 Mon Sep 17 00:00:00 2001 From: supermario_yl Date: Sat, 7 Feb 2026 15:41:00 +0800 Subject: [PATCH 24/44] add --- Makefile | 8 +- api-server/cmd/main.go | 34 +- api-server/internal/config/config.go | 13 +- .../internal/handler/config_transfer.go | 116 +++ api-server/internal/handler/onboarding.go | 277 ++++++ api-server/internal/handler/settings.go | 11 +- .../service/config_transfer_service.go | 664 +++++++++++++ .../internal/service/init_script_service.go | 924 ++++++++++++++++++ .../internal/service/onboarding_service.go | 760 ++++++++++++++ .../internal/service/settings_service.go | 213 +++- api-server/internal/ssh/executor.go | 370 +++++++ deploy/charts/bison/templates/NOTES.txt | 24 +- deploy/charts/bison/templates/_helpers.tpl | 12 +- .../deployment.yaml => api-deployment.yaml} | 5 +- .../bison/templates/api-server/ingress.yaml | 40 - deploy/charts/bison/templates/ingress.yaml | 34 + .../templates/{api-server => }/rbac.yaml | 0 .../auth-secret.yaml => secret.yaml} | 1 - .../templates/{api-server => }/service.yaml | 22 +- .../deployment.yaml => web-deployment.yaml} | 3 +- .../bison/templates/web-ui/ingress.yaml | 40 - .../bison/templates/web-ui/service.yaml | 21 - deploy/charts/bison/values.yaml | 54 +- web-ui/src/App.tsx | 27 +- web-ui/src/components/NodeOnboardingModal.tsx | 183 ++++ .../components/OnboardingProgressDrawer.tsx | 295 ++++++ web-ui/src/layouts/BasicLayout.tsx | 52 +- 
web-ui/src/pages/Cluster/ClusterNodes.tsx | 42 +- web-ui/src/pages/Cluster/NodeDetail.tsx | 281 ++++-- web-ui/src/pages/Dashboard/index.tsx | 95 +- web-ui/src/pages/Settings/ConfigTransfer.tsx | 484 +++++++++ .../src/pages/Settings/ControlPlaneConfig.tsx | 234 +++++ web-ui/src/pages/Settings/GeneralSettings.tsx | 36 +- web-ui/src/pages/Settings/NodeInitConfig.tsx | 504 ++++++++++ web-ui/src/pages/Settings/index.tsx | 55 +- web-ui/src/services/api.ts | 199 +++- 36 files changed, 5769 insertions(+), 364 deletions(-) create mode 100644 api-server/internal/handler/config_transfer.go create mode 100644 api-server/internal/handler/onboarding.go create mode 100644 api-server/internal/service/config_transfer_service.go create mode 100644 api-server/internal/service/init_script_service.go create mode 100644 api-server/internal/service/onboarding_service.go create mode 100644 api-server/internal/ssh/executor.go rename deploy/charts/bison/templates/{api-server/deployment.yaml => api-deployment.yaml} (91%) delete mode 100644 deploy/charts/bison/templates/api-server/ingress.yaml create mode 100644 deploy/charts/bison/templates/ingress.yaml rename deploy/charts/bison/templates/{api-server => }/rbac.yaml (100%) rename deploy/charts/bison/templates/{api-server/auth-secret.yaml => secret.yaml} (99%) rename deploy/charts/bison/templates/{api-server => }/service.yaml (50%) rename deploy/charts/bison/templates/{web-ui/deployment.yaml => web-deployment.yaml} (93%) delete mode 100644 deploy/charts/bison/templates/web-ui/ingress.yaml delete mode 100644 deploy/charts/bison/templates/web-ui/service.yaml create mode 100644 web-ui/src/components/NodeOnboardingModal.tsx create mode 100644 web-ui/src/components/OnboardingProgressDrawer.tsx create mode 100644 web-ui/src/pages/Settings/ConfigTransfer.tsx create mode 100644 web-ui/src/pages/Settings/ControlPlaneConfig.tsx create mode 100644 web-ui/src/pages/Settings/NodeInitConfig.tsx diff --git a/Makefile b/Makefile index 9453481..62ef8b6 100644 
--- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # 基于 Capsule + OpenCost 架构 # ==================== 配置 ==================== -REGISTRY ?= docker.io +REGISTRY ?= ghcr.io/supermarioyl REPO ?= bison VERSION ?= latest HELM_RELEASE ?= bison @@ -304,9 +304,8 @@ deploy: ## 部署 Bison helm upgrade --install $(HELM_RELEASE) ./deploy/charts/bison \ --namespace $(NAMESPACE) \ --create-namespace \ - --set apiServer.image.repository=$(REGISTRY)/$(REPO)/api-server \ + --set global.imageRegistry=$(REGISTRY) \ --set apiServer.image.tag=$(VERSION) \ - --set webUI.image.repository=$(REGISTRY)/$(REPO)/web-ui \ --set webUI.image.tag=$(VERSION) .PHONY: deploy-with-auth @@ -316,9 +315,8 @@ deploy-with-auth: ## 部署 Bison (启用认证) --create-namespace \ --set auth.enabled=true \ --set auth.admin.password=$$(openssl rand -base64 12) \ - --set apiServer.image.repository=$(REGISTRY)/$(REPO)/api-server \ + --set global.imageRegistry=$(REGISTRY) \ --set apiServer.image.tag=$(VERSION) \ - --set webUI.image.repository=$(REGISTRY)/$(REPO)/web-ui \ --set webUI.image.tag=$(VERSION) .PHONY: undeploy diff --git a/api-server/cmd/main.go b/api-server/cmd/main.go index 307caf1..bb5772f 100644 --- a/api-server/cmd/main.go +++ b/api-server/cmd/main.go @@ -71,6 +71,9 @@ func main() { reportSvc := service.NewReportService(opencostClient, tenantSvc, projectSvc, billingSvc) nodeSvc := service.NewNodeService(k8sClient) workloadSvc := service.NewWorkloadService(k8sClient) + initScriptSvc := service.NewInitScriptService(k8sClient) + onboardingSvc := service.NewOnboardingService(k8sClient, nodeSvc, initScriptSvc) + configTransferSvc := service.NewConfigTransferService(billingSvc, alertSvc, resourceConfigSvc, initScriptSvc) // Initialize scheduler sched := scheduler.NewScheduler(billingSvc, balanceSvc, alertSvc) @@ -106,6 +109,8 @@ func main() { statusHandler := handler.NewStatusHandler(statusSvc) nodeHandler := handler.NewNodeHandler(nodeSvc) workloadHandler := handler.NewWorkloadHandler(workloadSvc, projectSvc) + 
onboardingHandler := handler.NewOnboardingHandler(onboardingSvc, initScriptSvc) + configTransferHandler := handler.NewConfigTransferHandler(configTransferSvc) // Setup Gin router if cfg.Mode == "release" { @@ -135,7 +140,9 @@ func main() { // Feature flags (public) api.GET("/features", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{ - "costEnabled": costSvc.IsEnabled(), + "costEnabled": costSvc.IsEnabled(), + "capsuleEnabled": cfg.CapsuleEnabled, + "prometheusEnabled": cfg.PrometheusURL != "", }) }) @@ -235,6 +242,12 @@ func main() { protected.POST("/nodes/:name/assign", nodeHandler.AssignNodeToTeam) protected.POST("/nodes/:name/release", nodeHandler.ReleaseNode) + // Node onboarding + protected.POST("/nodes/onboard", onboardingHandler.StartOnboarding) + protected.GET("/nodes/onboard", onboardingHandler.ListOnboardingJobs) + protected.GET("/nodes/onboard/:jobId", onboardingHandler.GetOnboardingJob) + protected.DELETE("/nodes/onboard/:jobId", onboardingHandler.CancelOnboardingJob) + // System settings protected.GET("/settings", settingsHandler.GetSettings) protected.GET("/settings/billing", billingHandler.GetBillingConfig) @@ -243,6 +256,25 @@ func main() { protected.PUT("/settings/alerts", alertHandler.UpdateAlertConfig) protected.POST("/settings/alerts/test", alertHandler.TestChannel) + // Control plane settings + protected.GET("/settings/control-plane", onboardingHandler.GetControlPlaneConfig) + protected.PUT("/settings/control-plane", onboardingHandler.UpdateControlPlaneConfig) + protected.POST("/settings/control-plane/test", onboardingHandler.TestControlPlaneConnection) + + // Init scripts settings + protected.GET("/settings/init-scripts", onboardingHandler.ListInitScripts) + protected.POST("/settings/init-scripts", onboardingHandler.CreateInitScript) + protected.GET("/settings/init-scripts/:id", onboardingHandler.GetInitScript) + protected.PUT("/settings/init-scripts/:id", onboardingHandler.UpdateInitScript) + protected.DELETE("/settings/init-scripts/:id", 
onboardingHandler.DeleteInitScript) + protected.PUT("/settings/init-scripts/:id/toggle", onboardingHandler.ToggleInitScript) + protected.PUT("/settings/init-scripts/reorder", onboardingHandler.ReorderInitScripts) + + // Configuration import/export + protected.GET("/settings/export", configTransferHandler.ExportConfig) + protected.POST("/settings/import/preview", configTransferHandler.PreviewImport) + protected.POST("/settings/import/apply", configTransferHandler.ApplyImport) + // Node metrics (from Prometheus) protected.GET("/metrics/node/:name", settingsHandler.GetNodeMetrics) diff --git a/api-server/internal/config/config.go b/api-server/internal/config/config.go index 437324d..8d4f555 100644 --- a/api-server/internal/config/config.go +++ b/api-server/internal/config/config.go @@ -21,6 +21,9 @@ type Config struct { // External services OpenCostURL string PrometheusURL string + + // Feature toggles + CapsuleEnabled bool } // Load reads configuration from environment variables @@ -32,8 +35,9 @@ func Load() (*Config, error) { AdminUsername: "admin", AdminPassword: "admin", JWTSecret: "bison-secret-key-change-in-production", - OpenCostURL: "", - PrometheusURL: "", + OpenCostURL: "", + PrometheusURL: "", + CapsuleEnabled: true, } if port := os.Getenv("PORT"); port != "" { @@ -70,5 +74,10 @@ func Load() (*Config, error) { cfg.PrometheusURL = prometheusURL } + // Feature toggles + if capsuleEnabled := os.Getenv("CAPSULE_ENABLED"); capsuleEnabled == "false" { + cfg.CapsuleEnabled = false + } + return cfg, nil } diff --git a/api-server/internal/handler/config_transfer.go b/api-server/internal/handler/config_transfer.go new file mode 100644 index 0000000..c101a72 --- /dev/null +++ b/api-server/internal/handler/config_transfer.go @@ -0,0 +1,116 @@ +package handler + +import ( + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + "github.com/gin-gonic/gin" + + "github.com/bison/api-server/internal/service" + "github.com/bison/api-server/pkg/logger" +) + +// 
ConfigTransferHandler handles configuration import/export requests +type ConfigTransferHandler struct { + configTransferSvc *service.ConfigTransferService +} + +// NewConfigTransferHandler creates a new ConfigTransferHandler +func NewConfigTransferHandler(svc *service.ConfigTransferService) *ConfigTransferHandler { + return &ConfigTransferHandler{ + configTransferSvc: svc, + } +} + +// ExportConfig exports configuration as a JSON file download +func (h *ConfigTransferHandler) ExportConfig(c *gin.Context) { + sectionsParam := c.DefaultQuery("sections", strings.Join(service.AllSections, ",")) + includeSensitive := c.DefaultQuery("includeSensitive", "false") == "true" + + sections := strings.Split(sectionsParam, ",") + for i := range sections { + sections[i] = strings.TrimSpace(sections[i]) + } + + operator := "admin" + if username, exists := c.Get("username"); exists { + operator = username.(string) + } + + config, err := h.configTransferSvc.Export(c.Request.Context(), sections, includeSensitive, operator) + if err != nil { + logger.Error("Failed to export config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + data, err := json.MarshalIndent(config, "", " ") + if err != nil { + logger.Error("Failed to marshal export config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "序列化配置失败"}) + return + } + + filename := fmt.Sprintf("bison-config-%s.json", time.Now().Format("20060102-150405")) + c.Header("Content-Type", "application/json") + c.Header("Content-Disposition", fmt.Sprintf("attachment; filename=%s", filename)) + c.Data(http.StatusOK, "application/json", data) +} + +// PreviewImport validates and previews an import configuration +func (h *ConfigTransferHandler) PreviewImport(c *gin.Context) { + var config service.ExportConfig + if err := c.ShouldBindJSON(&config); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "无效的 JSON 格式: " + err.Error()}) + return + } + + if config.Version 
== "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "缺少 version 字段"}) + return + } + if config.Sections == nil || len(config.Sections) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "缺少 sections 字段"}) + return + } + + result, err := h.configTransferSvc.Preview(c.Request.Context(), &config) + if err != nil { + logger.Error("Failed to preview import", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, result) +} + +// ApplyImport applies the imported configuration +func (h *ConfigTransferHandler) ApplyImport(c *gin.Context) { + var req service.ImportRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "无效的请求格式: " + err.Error()}) + return + } + + if len(req.Sections) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "请选择至少一个配置模块"}) + return + } + + if req.Config.Version == "" || req.Config.Sections == nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "无效的配置数据"}) + return + } + + result, err := h.configTransferSvc.Apply(c.Request.Context(), &req) + if err != nil { + logger.Error("Failed to apply import", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, result) +} diff --git a/api-server/internal/handler/onboarding.go b/api-server/internal/handler/onboarding.go new file mode 100644 index 0000000..ef19ef8 --- /dev/null +++ b/api-server/internal/handler/onboarding.go @@ -0,0 +1,277 @@ +package handler + +import ( + "net/http" + + "github.com/gin-gonic/gin" + + "github.com/bison/api-server/internal/service" + "github.com/bison/api-server/pkg/logger" +) + +// OnboardingHandler handles node onboarding requests +type OnboardingHandler struct { + onboardingSvc *service.OnboardingService + initScriptSvc *service.InitScriptService +} + +// NewOnboardingHandler creates a new OnboardingHandler +func NewOnboardingHandler(onboardingSvc *service.OnboardingService, 
initScriptSvc *service.InitScriptService) *OnboardingHandler { + return &OnboardingHandler{ + onboardingSvc: onboardingSvc, + initScriptSvc: initScriptSvc, + } +} + +// StartOnboarding starts a new node onboarding job +// POST /api/v1/nodes/onboard +func (h *OnboardingHandler) StartOnboarding(c *gin.Context) { + var req service.OnboardingRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + job, err := h.onboardingSvc.StartOnboarding(c.Request.Context(), &req) + if err != nil { + logger.Error("Failed to start onboarding", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusAccepted, job) +} + +// GetOnboardingJob returns a specific onboarding job +// GET /api/v1/nodes/onboard/:jobId +func (h *OnboardingHandler) GetOnboardingJob(c *gin.Context) { + jobID := c.Param("jobId") + + job, err := h.onboardingSvc.GetJob(c.Request.Context(), jobID) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, job) +} + +// ListOnboardingJobs returns all onboarding jobs +// GET /api/v1/nodes/onboard +func (h *OnboardingHandler) ListOnboardingJobs(c *gin.Context) { + jobs, err := h.onboardingSvc.ListJobs(c.Request.Context()) + if err != nil { + logger.Error("Failed to list onboarding jobs", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"items": jobs}) +} + +// CancelOnboardingJob cancels a running onboarding job +// DELETE /api/v1/nodes/onboard/:jobId +func (h *OnboardingHandler) CancelOnboardingJob(c *gin.Context) { + jobID := c.Param("jobId") + + err := h.onboardingSvc.CancelJob(c.Request.Context(), jobID) + if err != nil { + logger.Error("Failed to cancel onboarding job", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, 
gin.H{"message": "Job cancelled"}) +} + +// GetControlPlaneConfig returns the control plane configuration +// GET /api/v1/settings/control-plane +func (h *OnboardingHandler) GetControlPlaneConfig(c *gin.Context) { + config, err := h.initScriptSvc.GetControlPlaneConfig(c.Request.Context()) + if err != nil { + logger.Error("Failed to get control plane config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + // Mask sensitive data + response := gin.H{ + "host": config.Host, + "sshPort": config.SSHPort, + "sshUser": config.SSHUser, + "authMethod": config.AuthMethod, + "hasPassword": config.Password != "", + "hasPrivateKey": config.PrivateKey != "", + } + + c.JSON(http.StatusOK, response) +} + +// UpdateControlPlaneConfig updates the control plane configuration +// PUT /api/v1/settings/control-plane +func (h *OnboardingHandler) UpdateControlPlaneConfig(c *gin.Context) { + var config service.ControlPlaneConfig + if err := c.ShouldBindJSON(&config); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + // Get existing config to preserve credentials if not provided + existing, _ := h.initScriptSvc.GetControlPlaneConfig(c.Request.Context()) + if existing != nil { + if config.Password == "" && existing.Password != "" { + config.Password = existing.Password + } + if config.PrivateKey == "" && existing.PrivateKey != "" { + config.PrivateKey = existing.PrivateKey + } + } + + err := h.initScriptSvc.SaveControlPlaneConfig(c.Request.Context(), &config) + if err != nil { + logger.Error("Failed to save control plane config", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Control plane configuration saved"}) +} + +// TestControlPlaneConnection tests the control plane SSH connection +// POST /api/v1/settings/control-plane/test +func (h *OnboardingHandler) TestControlPlaneConnection(c *gin.Context) { + 
err := h.onboardingSvc.TestControlPlaneConnection(c.Request.Context()) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Connection successful"}) +} + +// ListInitScripts returns all init script groups +// GET /api/v1/settings/init-scripts +func (h *OnboardingHandler) ListInitScripts(c *gin.Context) { + groups, err := h.initScriptSvc.GetAllScriptGroups(c.Request.Context()) + if err != nil { + logger.Error("Failed to list init scripts", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"items": groups}) +} + +// GetInitScript returns a specific init script group +// GET /api/v1/settings/init-scripts/:id +func (h *OnboardingHandler) GetInitScript(c *gin.Context) { + id := c.Param("id") + + group, err := h.initScriptSvc.GetScriptGroup(c.Request.Context(), id) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, group) +} + +// CreateInitScript creates a new init script group +// POST /api/v1/settings/init-scripts +func (h *OnboardingHandler) CreateInitScript(c *gin.Context) { + var group service.ScriptGroup + if err := c.ShouldBindJSON(&group); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := h.initScriptSvc.CreateScriptGroup(c.Request.Context(), &group) + if err != nil { + logger.Error("Failed to create init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusCreated, group) +} + +// UpdateInitScript updates an init script group +// PUT /api/v1/settings/init-scripts/:id +func (h *OnboardingHandler) UpdateInitScript(c *gin.Context) { + id := c.Param("id") + + var group service.ScriptGroup + if err := c.ShouldBindJSON(&group); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := 
h.initScriptSvc.UpdateScriptGroup(c.Request.Context(), id, &group) + if err != nil { + logger.Error("Failed to update init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, group) +} + +// DeleteInitScript deletes an init script group +// DELETE /api/v1/settings/init-scripts/:id +func (h *OnboardingHandler) DeleteInitScript(c *gin.Context) { + id := c.Param("id") + + err := h.initScriptSvc.DeleteScriptGroup(c.Request.Context(), id) + if err != nil { + logger.Error("Failed to delete init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Script group deleted"}) +} + +// ToggleInitScript enables or disables an init script group +// PUT /api/v1/settings/init-scripts/:id/toggle +func (h *OnboardingHandler) ToggleInitScript(c *gin.Context) { + id := c.Param("id") + + var req struct { + Enabled bool `json:"enabled"` + } + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := h.initScriptSvc.ToggleScriptGroup(c.Request.Context(), id, req.Enabled) + if err != nil { + logger.Error("Failed to toggle init script", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Script group toggled"}) +} + +// ReorderInitScripts updates the order of init script groups +// PUT /api/v1/settings/init-scripts/reorder +func (h *OnboardingHandler) ReorderInitScripts(c *gin.Context) { + var req struct { + IDs []string `json:"ids"` + } + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + err := h.initScriptSvc.ReorderScriptGroups(c.Request.Context(), req.IDs) + if err != nil { + logger.Error("Failed to reorder init scripts", "error", err) + c.JSON(http.StatusInternalServerError, 
gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": "Script groups reordered"}) +} + diff --git a/api-server/internal/handler/settings.go b/api-server/internal/handler/settings.go index ccdf5cb..7419710 100644 --- a/api-server/internal/handler/settings.go +++ b/api-server/internal/handler/settings.go @@ -32,8 +32,17 @@ func (h *SettingsHandler) GetSettings(c *gin.Context) { func (h *SettingsHandler) GetNodeMetrics(c *gin.Context) { nodeName := c.Param("name") hours, _ := strconv.Atoi(c.DefaultQuery("hours", "24")) + hasGpu := c.DefaultQuery("hasGpu", "false") == "true" + hasNpu := c.DefaultQuery("hasNpu", "false") == "true" + + req := service.NodeMetricsRequest{ + NodeName: nodeName, + Hours: hours, + HasGpu: hasGpu, + HasNpu: hasNpu, + } - metrics, err := h.settingsSvc.GetNodeMetrics(c.Request.Context(), nodeName, hours) + metrics, err := h.settingsSvc.GetNodeMetrics(c.Request.Context(), req) if err != nil { logger.Error("Failed to get node metrics", "node", nodeName, "error", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) diff --git a/api-server/internal/service/config_transfer_service.go b/api-server/internal/service/config_transfer_service.go new file mode 100644 index 0000000..7127f90 --- /dev/null +++ b/api-server/internal/service/config_transfer_service.go @@ -0,0 +1,664 @@ +package service + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/bison/api-server/pkg/logger" +) + +const ( + ExportVersion = "1.0" + RedactedValue = "***REDACTED***" + SectionBilling = "billing" + SectionAlerts = "alerts" + SectionResources = "resources" + SectionCP = "controlPlane" + SectionScripts = "initScripts" +) + +var AllSections = []string{SectionBilling, SectionAlerts, SectionResources, SectionCP, SectionScripts} + +// ExportConfig represents the full export file structure +type ExportConfig struct { + Version string `json:"version"` + ExportedAt time.Time `json:"exportedAt"` + ExportedBy 
string `json:"exportedBy"` + Sections map[string]json.RawMessage `json:"sections"` +} + +// SectionPreview holds diff info for one config section +type SectionPreview struct { + Present bool `json:"present"` + Valid bool `json:"valid"` + HasSensitiveData bool `json:"hasSensitiveData"` + Changes map[string]*FieldChange `json:"changes,omitempty"` + Summary *ResourceSummary `json:"summary,omitempty"` + Warnings []string `json:"warnings,omitempty"` + Errors []string `json:"errors,omitempty"` +} + +// FieldChange represents a single field change +type FieldChange struct { + Current interface{} `json:"current"` + Imported interface{} `json:"imported"` +} + +// ResourceSummary for array-based configs +type ResourceSummary struct { + Added []string `json:"added,omitempty"` + Modified []string `json:"modified,omitempty"` + Removed []string `json:"removed,omitempty"` + Unchanged []string `json:"unchanged,omitempty"` +} + +// ImportPreviewResult holds the preview/diff analysis +type ImportPreviewResult struct { + Valid bool `json:"valid"` + Version string `json:"version"` + ExportedAt string `json:"exportedAt,omitempty"` + Sections map[string]*SectionPreview `json:"sections"` + Errors []string `json:"errors"` + Warnings []string `json:"warnings"` +} + +// ImportRequest holds the import apply request +type ImportRequest struct { + Config ExportConfig `json:"config"` + Sections []string `json:"sections"` + PreserveSensitive bool `json:"preserveSensitive"` +} + +// ImportResult holds the import apply result +type ImportResult struct { + Message string `json:"message"` + Applied []string `json:"applied"` + Skipped []string `json:"skipped"` + Warnings []string `json:"warnings"` +} + +// ConfigTransferService handles configuration export and import +type ConfigTransferService struct { + billingSvc *BillingService + alertSvc *AlertService + resourceConfigSvc *ResourceConfigService + initScriptSvc *InitScriptService +} + +// NewConfigTransferService creates a new 
ConfigTransferService +func NewConfigTransferService( + billingSvc *BillingService, + alertSvc *AlertService, + resourceConfigSvc *ResourceConfigService, + initScriptSvc *InitScriptService, +) *ConfigTransferService { + return &ConfigTransferService{ + billingSvc: billingSvc, + alertSvc: alertSvc, + resourceConfigSvc: resourceConfigSvc, + initScriptSvc: initScriptSvc, + } +} + +// Export exports selected configuration sections +func (s *ConfigTransferService) Export(ctx context.Context, sections []string, includeSensitive bool, operator string) (*ExportConfig, error) { + logger.Info("Exporting configuration", "sections", sections, "includeSensitive", includeSensitive, "operator", operator) + + sectionSet := make(map[string]bool) + for _, sec := range sections { + sectionSet[sec] = true + } + + result := &ExportConfig{ + Version: ExportVersion, + ExportedAt: time.Now(), + ExportedBy: operator, + Sections: make(map[string]json.RawMessage), + } + + if sectionSet[SectionBilling] { + config, err := s.billingSvc.GetConfig(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export billing config: %w", err) + } + data, _ := json.Marshal(config) + result.Sections[SectionBilling] = data + } + + if sectionSet[SectionAlerts] { + config, err := s.alertSvc.GetConfig(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export alert config: %w", err) + } + if !includeSensitive { + s.redactAlertChannels(config) + } + data, _ := json.Marshal(config) + result.Sections[SectionAlerts] = data + } + + if sectionSet[SectionResources] { + configs, err := s.resourceConfigSvc.GetResourceConfigs(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export resource configs: %w", err) + } + data, _ := json.Marshal(configs) + result.Sections[SectionResources] = data + } + + if sectionSet[SectionCP] { + config, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export control plane config: %w", err) + } + if !includeSensitive 
{ + if config.Password != "" { + config.Password = RedactedValue + } + if config.PrivateKey != "" { + config.PrivateKey = RedactedValue + } + } + data, _ := json.Marshal(config) + result.Sections[SectionCP] = data + } + + if sectionSet[SectionScripts] { + groups, err := s.initScriptSvc.GetAllScriptGroups(ctx) + if err != nil { + return nil, fmt.Errorf("failed to export init scripts: %w", err) + } + data, _ := json.Marshal(groups) + result.Sections[SectionScripts] = data + } + + return result, nil +} + +// redactAlertChannels masks sensitive webhook URLs in alert channels +func (s *ConfigTransferService) redactAlertChannels(config *AlertConfig) { + sensitiveKeys := map[string]bool{ + "url": true, + "webhook": true, + "smtp": true, + } + for i := range config.Channels { + for key := range config.Channels[i].Config { + if sensitiveKeys[key] { + val := config.Channels[i].Config[key] + if len(val) > 20 { + config.Channels[i].Config[key] = val[:10] + "***" + val[len(val)-5:] + } else if val != "" { + config.Channels[i].Config[key] = RedactedValue + } + } + } + } +} + +// Preview validates and previews an import configuration +func (s *ConfigTransferService) Preview(ctx context.Context, config *ExportConfig) (*ImportPreviewResult, error) { + logger.Info("Previewing configuration import") + + result := &ImportPreviewResult{ + Valid: true, + Version: config.Version, + Sections: make(map[string]*SectionPreview), + Errors: []string{}, + Warnings: []string{}, + } + + if config.Version != ExportVersion { + result.Valid = false + result.Errors = append(result.Errors, fmt.Sprintf("不支持的版本: %s (期望 %s)", config.Version, ExportVersion)) + return result, nil + } + + if !config.ExportedAt.IsZero() { + result.ExportedAt = config.ExportedAt.Format(time.RFC3339) + } + + for section, raw := range config.Sections { + switch section { + case SectionBilling: + preview := s.previewBilling(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case 
SectionAlerts: + preview := s.previewAlerts(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionResources: + preview := s.previewResources(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionCP: + preview := s.previewControlPlane(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + case SectionScripts: + preview := s.previewInitScripts(ctx, raw) + result.Sections[section] = preview + if !preview.Valid { + result.Valid = false + } + default: + result.Warnings = append(result.Warnings, fmt.Sprintf("未知的配置模块: %s (将被忽略)", section)) + } + } + + return result, nil +} + +func (s *ConfigTransferService) previewBilling(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported BillingConfig + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "计费配置格式无效: "+err.Error()) + return preview + } + + if imported.Interval <= 0 || imported.Interval > 24 { + preview.Errors = append(preview.Errors, "计费间隔必须在 1-24 小时之间") + preview.Valid = false + } + if imported.Currency == "" { + preview.Errors = append(preview.Errors, "货币代码不能为空") + preview.Valid = false + } + + current, err := s.billingSvc.GetConfig(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前计费配置进行对比") + return preview + } + + preview.Changes = make(map[string]*FieldChange) + if current.Enabled != imported.Enabled { + preview.Changes["enabled"] = &FieldChange{Current: current.Enabled, Imported: imported.Enabled} + } + if current.Interval != imported.Interval { + preview.Changes["interval"] = &FieldChange{Current: current.Interval, Imported: imported.Interval} + } + if current.Currency != imported.Currency { + preview.Changes["currency"] = &FieldChange{Current: current.Currency, Imported: 
imported.Currency} + } + if current.CurrencySymbol != imported.CurrencySymbol { + preview.Changes["currencySymbol"] = &FieldChange{Current: current.CurrencySymbol, Imported: imported.CurrencySymbol} + } + if current.GracePeriodValue != imported.GracePeriodValue { + preview.Changes["gracePeriodValue"] = &FieldChange{Current: current.GracePeriodValue, Imported: imported.GracePeriodValue} + } + if current.GracePeriodUnit != imported.GracePeriodUnit { + preview.Changes["gracePeriodUnit"] = &FieldChange{Current: current.GracePeriodUnit, Imported: imported.GracePeriodUnit} + } + + return preview +} + +func (s *ConfigTransferService) previewAlerts(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported AlertConfig + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "告警配置格式无效: "+err.Error()) + return preview + } + + if imported.BalanceThreshold < 0 { + preview.Errors = append(preview.Errors, "告警阈值不能为负数") + preview.Valid = false + } + + for _, ch := range imported.Channels { + if ch.ID == "" || ch.Type == "" || ch.Name == "" { + preview.Errors = append(preview.Errors, fmt.Sprintf("告警通道 '%s' 缺少必填字段 (id/type/name)", ch.Name)) + preview.Valid = false + } + for _, val := range ch.Config { + if val == RedactedValue { + preview.HasSensitiveData = true + preview.Warnings = append(preview.Warnings, "告警通道包含已脱敏的敏感数据,导入时将保留当前值") + break + } + } + } + + current, err := s.alertSvc.GetConfig(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前告警配置进行对比") + return preview + } + + preview.Changes = make(map[string]*FieldChange) + if current.BalanceThreshold != imported.BalanceThreshold { + preview.Changes["balanceThreshold"] = &FieldChange{Current: current.BalanceThreshold, Imported: imported.BalanceThreshold} + } + if len(current.Channels) != len(imported.Channels) { + preview.Changes["channels"] = &FieldChange{ + 
Current: fmt.Sprintf("%d 个通道", len(current.Channels)), + Imported: fmt.Sprintf("%d 个通道", len(imported.Channels)), + } + } + + return preview +} + +func (s *ConfigTransferService) previewResources(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported []ResourceDefinition + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "资源配置格式无效: "+err.Error()) + return preview + } + + for _, r := range imported { + if r.Name == "" { + preview.Errors = append(preview.Errors, "资源名称不能为空") + preview.Valid = false + } + if r.Divisor <= 0 { + preview.Errors = append(preview.Errors, fmt.Sprintf("资源 '%s' 的 divisor 必须大于 0", r.Name)) + preview.Valid = false + } + } + + current, err := s.resourceConfigSvc.GetResourceConfigs(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前资源配置进行对比") + return preview + } + + currentMap := make(map[string]ResourceDefinition) + for _, r := range current { + currentMap[r.Name] = r + } + importedMap := make(map[string]ResourceDefinition) + for _, r := range imported { + importedMap[r.Name] = r + } + + summary := &ResourceSummary{} + for _, r := range imported { + if _, exists := currentMap[r.Name]; exists { + curR := currentMap[r.Name] + if curR.DisplayName != r.DisplayName || curR.Unit != r.Unit || curR.Divisor != r.Divisor || + curR.Category != r.Category || curR.Enabled != r.Enabled || curR.Price != r.Price || + curR.SortOrder != r.SortOrder || curR.ShowInQuota != r.ShowInQuota { + summary.Modified = append(summary.Modified, r.Name) + } else { + summary.Unchanged = append(summary.Unchanged, r.Name) + } + } else { + summary.Added = append(summary.Added, r.Name) + } + } + for _, r := range current { + if _, exists := importedMap[r.Name]; !exists { + summary.Removed = append(summary.Removed, r.Name) + } + } + + if len(summary.Removed) > 0 { + preview.Warnings = append(preview.Warnings, 
fmt.Sprintf("以下资源将被移除: %v", summary.Removed)) + } + + preview.Summary = summary + return preview +} + +func (s *ConfigTransferService) previewControlPlane(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := &SectionPreview{Present: true, Valid: true} + + var imported ControlPlaneConfig + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "控制面配置格式无效: "+err.Error()) + return preview + } + + if imported.SSHPort < 1 || imported.SSHPort > 65535 { + preview.Errors = append(preview.Errors, "SSH 端口必须在 1-65535 之间") + preview.Valid = false + } + if imported.AuthMethod != "" && imported.AuthMethod != "password" && imported.AuthMethod != "privateKey" { + preview.Errors = append(preview.Errors, "认证方式必须为 password 或 privateKey") + preview.Valid = false + } + + if imported.Password == RedactedValue || imported.PrivateKey == RedactedValue { + preview.HasSensitiveData = true + preview.Warnings = append(preview.Warnings, "敏感数据 (密码/私钥) 已被排除,导入时将保留当前值") + } + + current, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前控制面配置进行对比") + return preview + } + + preview.Changes = make(map[string]*FieldChange) + if current.Host != imported.Host { + preview.Changes["host"] = &FieldChange{Current: current.Host, Imported: imported.Host} + } + if current.SSHPort != imported.SSHPort { + preview.Changes["sshPort"] = &FieldChange{Current: current.SSHPort, Imported: imported.SSHPort} + } + if current.SSHUser != imported.SSHUser { + preview.Changes["sshUser"] = &FieldChange{Current: current.SSHUser, Imported: imported.SSHUser} + } + if current.AuthMethod != imported.AuthMethod { + preview.Changes["authMethod"] = &FieldChange{Current: current.AuthMethod, Imported: imported.AuthMethod} + } + + return preview +} + +func (s *ConfigTransferService) previewInitScripts(ctx context.Context, raw json.RawMessage) *SectionPreview { + preview := 
&SectionPreview{Present: true, Valid: true} + + var imported []ScriptGroup + if err := json.Unmarshal(raw, &imported); err != nil { + preview.Valid = false + preview.Errors = append(preview.Errors, "初始化脚本配置格式无效: "+err.Error()) + return preview + } + + for _, g := range imported { + if g.ID == "" || g.Name == "" { + preview.Errors = append(preview.Errors, fmt.Sprintf("脚本组 '%s' 缺少必填字段 (id/name)", g.Name)) + preview.Valid = false + } + if g.Phase != PhasePreJoin && g.Phase != PhasePostJoin { + preview.Errors = append(preview.Errors, fmt.Sprintf("脚本组 '%s' 的 phase 必须为 pre-join 或 post-join", g.Name)) + preview.Valid = false + } + } + + current, err := s.initScriptSvc.GetAllScriptGroups(ctx) + if err != nil { + preview.Warnings = append(preview.Warnings, "无法获取当前初始化脚本进行对比") + return preview + } + + currentMap := make(map[string]ScriptGroup) + for _, g := range current { + currentMap[g.ID] = g + } + + summary := &ResourceSummary{} + for _, g := range imported { + if _, exists := currentMap[g.ID]; exists { + summary.Modified = append(summary.Modified, g.Name) + } else { + summary.Added = append(summary.Added, g.Name) + } + } + importedMap := make(map[string]bool) + for _, g := range imported { + importedMap[g.ID] = true + } + for _, g := range current { + if !importedMap[g.ID] { + summary.Removed = append(summary.Removed, g.Name) + } + } + + builtinOverwrite := 0 + for _, g := range imported { + if cur, exists := currentMap[g.ID]; exists && cur.Builtin { + builtinOverwrite++ + } + } + if builtinOverwrite > 0 { + preview.Warnings = append(preview.Warnings, fmt.Sprintf("将覆盖 %d 个内置脚本组", builtinOverwrite)) + } + + preview.Summary = summary + return preview +} + +// Apply applies the imported configuration +func (s *ConfigTransferService) Apply(ctx context.Context, req *ImportRequest) (*ImportResult, error) { + logger.Info("Applying imported configuration", "sections", req.Sections) + + result := &ImportResult{ + Applied: []string{}, + Skipped: []string{}, + Warnings: []string{}, 
+ } + + sectionSet := make(map[string]bool) + for _, sec := range req.Sections { + sectionSet[sec] = true + } + + for _, section := range AllSections { + raw, exists := req.Config.Sections[section] + if !exists || !sectionSet[section] { + if sectionSet[section] { + result.Skipped = append(result.Skipped, section) + } + continue + } + + var err error + switch section { + case SectionBilling: + err = s.applyBilling(ctx, raw) + case SectionAlerts: + err = s.applyAlerts(ctx, raw, req.PreserveSensitive) + case SectionResources: + err = s.applyResources(ctx, raw) + case SectionCP: + err = s.applyControlPlane(ctx, raw, req.PreserveSensitive) + case SectionScripts: + err = s.applyInitScripts(ctx, raw) + } + + if err != nil { + result.Warnings = append(result.Warnings, fmt.Sprintf("%s 导入失败: %s", section, err.Error())) + result.Skipped = append(result.Skipped, section) + } else { + result.Applied = append(result.Applied, section) + } + } + + if len(result.Applied) > 0 { + result.Message = fmt.Sprintf("成功导入 %d 个配置模块", len(result.Applied)) + } else { + result.Message = "未成功导入任何配置模块" + } + + return result, nil +} + +func (s *ConfigTransferService) applyBilling(ctx context.Context, raw json.RawMessage) error { + var config BillingConfig + if err := json.Unmarshal(raw, &config); err != nil { + return fmt.Errorf("解析计费配置失败: %w", err) + } + return s.billingSvc.SetConfig(ctx, &config) +} + +func (s *ConfigTransferService) applyAlerts(ctx context.Context, raw json.RawMessage, preserveSensitive bool) error { + var config AlertConfig + if err := json.Unmarshal(raw, &config); err != nil { + return fmt.Errorf("解析告警配置失败: %w", err) + } + + if preserveSensitive { + current, err := s.alertSvc.GetConfig(ctx) + if err == nil { + currentChannelMap := make(map[string]NotifyChannel) + for _, ch := range current.Channels { + currentChannelMap[ch.ID] = ch + } + for i, ch := range config.Channels { + if curCh, exists := currentChannelMap[ch.ID]; exists { + for key, val := range ch.Config { + if val 
== RedactedValue || (len(val) > 8 && val[len(val)-3:] == "***") { + if curVal, ok := curCh.Config[key]; ok { + config.Channels[i].Config[key] = curVal + } + } + } + } + } + } + } + + return s.alertSvc.SetConfig(ctx, &config) +} + +func (s *ConfigTransferService) applyResources(ctx context.Context, raw json.RawMessage) error { + var configs []ResourceDefinition + if err := json.Unmarshal(raw, &configs); err != nil { + return fmt.Errorf("解析资源配置失败: %w", err) + } + return s.resourceConfigSvc.SaveResourceConfigs(ctx, configs) +} + +func (s *ConfigTransferService) applyControlPlane(ctx context.Context, raw json.RawMessage, preserveSensitive bool) error { + var config ControlPlaneConfig + if err := json.Unmarshal(raw, &config); err != nil { + return fmt.Errorf("解析控制面配置失败: %w", err) + } + + if preserveSensitive { + current, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err == nil { + if config.Password == RedactedValue { + config.Password = current.Password + } + if config.PrivateKey == RedactedValue { + config.PrivateKey = current.PrivateKey + } + } + } + + return s.initScriptSvc.SaveControlPlaneConfig(ctx, &config) +} + +func (s *ConfigTransferService) applyInitScripts(ctx context.Context, raw json.RawMessage) error { + var groups []ScriptGroup + if err := json.Unmarshal(raw, &groups); err != nil { + return fmt.Errorf("解析初始化脚本配置失败: %w", err) + } + return s.initScriptSvc.SaveAllScriptGroups(ctx, groups) +} diff --git a/api-server/internal/service/init_script_service.go b/api-server/internal/service/init_script_service.go new file mode 100644 index 0000000..1444b90 --- /dev/null +++ b/api-server/internal/service/init_script_service.go @@ -0,0 +1,924 @@ +package service + +import ( + "context" + "encoding/json" + "fmt" + "sort" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/bison/api-server/internal/k8s" + "github.com/bison/api-server/pkg/logger" +) + +const 
( + InitScriptsConfigMap = "bison-init-scripts" + ControlPlaneConfigConfigMap = "bison-control-plane-config" +) + +// ScriptPhase represents when a script should be executed +type ScriptPhase string + +const ( + PhasePreJoin ScriptPhase = "pre-join" + PhasePostJoin ScriptPhase = "post-join" +) + +// Script represents a platform-specific script implementation +type Script struct { + ID string `json:"id"` + OS string `json:"os"` // "ubuntu", "centos", "debian", "*" (wildcard) + Arch string `json:"arch"` // "amd64", "arm64", "*" (wildcard) + Content string `json:"content"` // Shell script content +} + +// ScriptGroup represents a group of scripts for a specific functionality +type ScriptGroup struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + Phase ScriptPhase `json:"phase"` + Enabled bool `json:"enabled"` + Order int `json:"order"` + Builtin bool `json:"builtin"` + Scripts []Script `json:"scripts"` +} + +// InitScriptsConfig holds all script groups +type InitScriptsConfig struct { + Groups []ScriptGroup `json:"groups"` +} + +// NodePlatform represents the detected platform of a node +type NodePlatform struct { + OS string `json:"os"` + Version string `json:"version"` + Arch string `json:"arch"` +} + +// ControlPlaneConfig holds the control plane SSH configuration +type ControlPlaneConfig struct { + Host string `json:"host"` + SSHPort int `json:"sshPort"` + SSHUser string `json:"sshUser"` + AuthMethod string `json:"authMethod"` // "password" or "privateKey" + Password string `json:"password,omitempty"` + PrivateKey string `json:"privateKey,omitempty"` +} + +// InitScriptService handles initialization script operations +type InitScriptService struct { + k8sClient *k8s.Client +} + +// NewInitScriptService creates a new InitScriptService +func NewInitScriptService(k8sClient *k8s.Client) *InitScriptService { + return &InitScriptService{ + k8sClient: k8sClient, + } +} + +// GetAllScriptGroups returns all script groups 
+func (s *InitScriptService) GetAllScriptGroups(ctx context.Context) ([]ScriptGroup, error) { + logger.Debug("Getting all script groups") + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return nil, err + } + + // Sort by order + sort.Slice(config.Groups, func(i, j int) bool { + return config.Groups[i].Order < config.Groups[j].Order + }) + + return config.Groups, nil +} + +// GetScriptGroup returns a specific script group by ID +func (s *InitScriptService) GetScriptGroup(ctx context.Context, id string) (*ScriptGroup, error) { + logger.Debug("Getting script group", "id", id) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return nil, err + } + + for _, group := range config.Groups { + if group.ID == id { + return &group, nil + } + } + + return nil, fmt.Errorf("script group not found: %s", id) +} + +// CreateScriptGroup creates a new script group +func (s *InitScriptService) CreateScriptGroup(ctx context.Context, group *ScriptGroup) error { + logger.Info("Creating script group", "name", group.Name) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + // Generate ID if not provided + if group.ID == "" { + group.ID = fmt.Sprintf("custom-%d", time.Now().UnixNano()) + } + + // Check for duplicate ID + for _, existing := range config.Groups { + if existing.ID == group.ID { + return fmt.Errorf("script group with ID %s already exists", group.ID) + } + } + + // Set order to last + if group.Order == 0 { + maxOrder := 0 + for _, g := range config.Groups { + if g.Order > maxOrder { + maxOrder = g.Order + } + } + group.Order = maxOrder + 1 + } + + // Custom scripts are not builtin + group.Builtin = false + + config.Groups = append(config.Groups, *group) + + return s.saveInitScriptsConfig(ctx, config) +} + +// UpdateScriptGroup updates an existing script group +func (s *InitScriptService) UpdateScriptGroup(ctx context.Context, id string, group *ScriptGroup) error { + logger.Info("Updating script group", "id", 
id) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + found := false + for i, existing := range config.Groups { + if existing.ID == id { + // Preserve builtin status and ID + group.ID = id + group.Builtin = existing.Builtin + config.Groups[i] = *group + found = true + break + } + } + + if !found { + return fmt.Errorf("script group not found: %s", id) + } + + return s.saveInitScriptsConfig(ctx, config) +} + +// DeleteScriptGroup deletes a script group (only custom scripts can be deleted) +func (s *InitScriptService) DeleteScriptGroup(ctx context.Context, id string) error { + logger.Info("Deleting script group", "id", id) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + newGroups := make([]ScriptGroup, 0, len(config.Groups)) + deleted := false + + for _, group := range config.Groups { + if group.ID == id { + if group.Builtin { + return fmt.Errorf("cannot delete builtin script group: %s", id) + } + deleted = true + continue + } + newGroups = append(newGroups, group) + } + + if !deleted { + return fmt.Errorf("script group not found: %s", id) + } + + config.Groups = newGroups + return s.saveInitScriptsConfig(ctx, config) +} + +// ToggleScriptGroup enables or disables a script group +func (s *InitScriptService) ToggleScriptGroup(ctx context.Context, id string, enabled bool) error { + logger.Info("Toggling script group", "id", id, "enabled", enabled) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + found := false + for i, group := range config.Groups { + if group.ID == id { + config.Groups[i].Enabled = enabled + found = true + break + } + } + + if !found { + return fmt.Errorf("script group not found: %s", id) + } + + return s.saveInitScriptsConfig(ctx, config) +} + +// ReorderScriptGroups updates the order of script groups +func (s *InitScriptService) ReorderScriptGroups(ctx context.Context, ids []string) error { + logger.Info("Reordering script groups", "ids", 
ids) + + config, err := s.getInitScriptsConfig(ctx) + if err != nil { + return err + } + + // Create a map of current groups + groupMap := make(map[string]*ScriptGroup) + for i := range config.Groups { + groupMap[config.Groups[i].ID] = &config.Groups[i] + } + + // Update orders based on the provided order + for i, id := range ids { + if group, ok := groupMap[id]; ok { + group.Order = i + 1 + } + } + + return s.saveInitScriptsConfig(ctx, config) +} + +// GetMatchingScript returns the best matching script for a given platform +func (s *InitScriptService) GetMatchingScript(group *ScriptGroup, platform NodePlatform) *Script { + if len(group.Scripts) == 0 { + return nil + } + + // Priority: exact match > OS match with wildcard arch > wildcard OS with arch match > all wildcards + var exactMatch, osMatch, archMatch, wildcardMatch *Script + + for i := range group.Scripts { + script := &group.Scripts[i] + osMatches := script.OS == platform.OS || script.OS == "*" + archMatches := script.Arch == platform.Arch || script.Arch == "*" + + if !osMatches || !archMatches { + continue + } + + if script.OS == platform.OS && script.Arch == platform.Arch { + exactMatch = script + break // Best match found + } else if script.OS == platform.OS && script.Arch == "*" { + osMatch = script + } else if script.OS == "*" && script.Arch == platform.Arch { + archMatch = script + } else if script.OS == "*" && script.Arch == "*" { + wildcardMatch = script + } + } + + // Return by priority + if exactMatch != nil { + return exactMatch + } + if osMatch != nil { + return osMatch + } + if archMatch != nil { + return archMatch + } + return wildcardMatch +} + +// GetScriptsForPhase returns all enabled scripts for a specific phase, matched to the platform +func (s *InitScriptService) GetScriptsForPhase(ctx context.Context, phase ScriptPhase, platform NodePlatform) ([]struct { + Group ScriptGroup + Script Script +}, error) { + groups, err := s.GetAllScriptGroups(ctx) + if err != nil { + return nil, err + } + 
+ var result []struct { + Group ScriptGroup + Script Script + } + + for _, group := range groups { + if group.Phase != phase || !group.Enabled { + continue + } + + script := s.GetMatchingScript(&group, platform) + if script != nil { + result = append(result, struct { + Group ScriptGroup + Script Script + }{ + Group: group, + Script: *script, + }) + } + } + + return result, nil +} + +// GetControlPlaneConfig returns the control plane SSH configuration +func (s *InitScriptService) GetControlPlaneConfig(ctx context.Context) (*ControlPlaneConfig, error) { + logger.Debug("Getting control plane config") + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, ControlPlaneConfigConfigMap) + if err != nil { + if errors.IsNotFound(err) { + return &ControlPlaneConfig{ + SSHPort: 22, + SSHUser: "root", + }, nil + } + return nil, fmt.Errorf("failed to get control plane config: %w", err) + } + + data, ok := cm.Data["config"] + if !ok { + return &ControlPlaneConfig{ + SSHPort: 22, + SSHUser: "root", + }, nil + } + + var config ControlPlaneConfig + if err := json.Unmarshal([]byte(data), &config); err != nil { + return nil, fmt.Errorf("failed to parse control plane config: %w", err) + } + + return &config, nil +} + +// SaveControlPlaneConfig saves the control plane SSH configuration +func (s *InitScriptService) SaveControlPlaneConfig(ctx context.Context, config *ControlPlaneConfig) error { + logger.Info("Saving control plane config", "host", config.Host) + + // Set defaults + if config.SSHPort == 0 { + config.SSHPort = 22 + } + if config.SSHUser == "" { + config.SSHUser = "root" + } + + data, err := json.Marshal(config) + if err != nil { + return fmt.Errorf("failed to marshal control plane config: %w", err) + } + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, ControlPlaneConfigConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Create new ConfigMap + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: ControlPlaneConfigConfigMap, + 
Namespace: BisonNamespace, + }, + Data: map[string]string{ + "config": string(data), + }, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return fmt.Errorf("failed to get control plane config: %w", err) + } + + // Update existing ConfigMap + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data["config"] = string(data) + + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) +} + +// SaveAllScriptGroups replaces all script groups at once (used by config import) +func (s *InitScriptService) SaveAllScriptGroups(ctx context.Context, groups []ScriptGroup) error { + logger.Info("Saving all script groups", "count", len(groups)) + config := &InitScriptsConfig{Groups: groups} + return s.saveInitScriptsConfig(ctx, config) +} + +// getInitScriptsConfig returns the init scripts configuration, initializing with defaults if not found +func (s *InitScriptService) getInitScriptsConfig(ctx context.Context) (*InitScriptsConfig, error) { + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, InitScriptsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Initialize with default builtin scripts + config := s.getDefaultInitScriptsConfig() + if err := s.saveInitScriptsConfig(ctx, config); err != nil { + return nil, err + } + return config, nil + } + return nil, fmt.Errorf("failed to get init scripts config: %w", err) + } + + data, ok := cm.Data["config"] + if !ok { + config := s.getDefaultInitScriptsConfig() + if err := s.saveInitScriptsConfig(ctx, config); err != nil { + return nil, err + } + return config, nil + } + + var config InitScriptsConfig + if err := json.Unmarshal([]byte(data), &config); err != nil { + return nil, fmt.Errorf("failed to parse init scripts config: %w", err) + } + + return &config, nil +} + +// saveInitScriptsConfig saves the init scripts configuration +func (s *InitScriptService) saveInitScriptsConfig(ctx context.Context, config *InitScriptsConfig) error { + data, err := json.Marshal(config) + if 
err != nil { + return fmt.Errorf("failed to marshal init scripts config: %w", err) + } + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, InitScriptsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Create new ConfigMap + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: InitScriptsConfigMap, + Namespace: BisonNamespace, + }, + Data: map[string]string{ + "config": string(data), + }, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return fmt.Errorf("failed to get init scripts config: %w", err) + } + + // Update existing ConfigMap + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data["config"] = string(data) + + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) +} + +// getDefaultInitScriptsConfig returns the default builtin script groups +func (s *InitScriptService) getDefaultInitScriptsConfig() *InitScriptsConfig { + return &InitScriptsConfig{ + Groups: []ScriptGroup{ + { + ID: "disable-swap", + Name: "禁用 Swap", + Description: "禁用 Swap 分区(Kubernetes 要求)", + Phase: PhasePreJoin, + Enabled: true, + Order: 1, + Builtin: true, + Scripts: []Script{ + { + ID: "disable-swap-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling swap..." +swapoff -a || true +sed -i '/swap/d' /etc/fstab || true +echo "Swap disabled successfully" +`, + }, + }, + }, + { + ID: "configure-kernel", + Name: "配置内核参数", + Description: "配置 Kubernetes 所需的内核参数", + Phase: PhasePreJoin, + Enabled: true, + Order: 2, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-kernel-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring kernel parameters..." 
+ +# Load required modules +modprobe br_netfilter || true +modprobe overlay || true + +# Ensure modules load on boot +cat > /etc/modules-load.d/k8s.conf << EOF +br_netfilter +overlay +EOF + +# Configure sysctl +cat > /etc/sysctl.d/k8s.conf << EOF +net.bridge.bridge-nf-call-iptables = 1 +net.bridge.bridge-nf-call-ip6tables = 1 +net.ipv4.ip_forward = 1 +EOF + +sysctl --system +echo "Kernel parameters configured successfully" +`, + }, + }, + }, + { + ID: "disable-firewall", + Name: "禁用防火墙", + Description: "禁用节点防火墙(firewalld/ufw)", + Phase: PhasePreJoin, + Enabled: false, + Order: 3, + Builtin: true, + Scripts: []Script{ + { + ID: "disable-firewall-debian", + OS: "ubuntu", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if command -v ufw &> /dev/null; then + ufw disable || true +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-debian2", + OS: "debian", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if command -v ufw &> /dev/null; then + ufw disable || true +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-rhel", + OS: "centos", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if systemctl is-active --quiet firewalld 2>/dev/null; then + systemctl stop firewalld + systemctl disable firewalld +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-rhel2", + OS: "rhel", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." +if systemctl is-active --quiet firewalld 2>/dev/null; then + systemctl stop firewalld + systemctl disable firewalld +fi +echo "Firewall disabled successfully" +`, + }, + { + ID: "disable-firewall-openeuler", + OS: "openEuler", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Disabling firewall..." 
+if systemctl is-active --quiet firewalld 2>/dev/null; then + systemctl stop firewalld + systemctl disable firewalld +fi +echo "Firewall disabled successfully" +`, + }, + }, + }, + { + ID: "configure-selinux", + Name: "配置 SELinux", + Description: "设置 SELinux 为 Permissive 模式(仅 RHEL/CentOS/openEuler)", + Phase: PhasePreJoin, + Enabled: false, + Order: 4, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-selinux-centos", + OS: "centos", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring SELinux to permissive mode..." +if command -v setenforce &> /dev/null; then + setenforce 0 || true + if [ -f /etc/selinux/config ]; then + sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config + fi +fi +echo "SELinux configured successfully" +`, + }, + { + ID: "configure-selinux-rhel", + OS: "rhel", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring SELinux to permissive mode..." +if command -v setenforce &> /dev/null; then + setenforce 0 || true + if [ -f /etc/selinux/config ]; then + sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config + fi +fi +echo "SELinux configured successfully" +`, + }, + { + ID: "configure-selinux-openeuler", + OS: "openEuler", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Configuring SELinux to permissive mode..." +if command -v setenforce &> /dev/null; then + setenforce 0 || true + if [ -f /etc/selinux/config ]; then + sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config + fi +fi +echo "SELinux configured successfully" +`, + }, + }, + }, + { + ID: "configure-timezone", + Name: "配置时区和 NTP", + Description: "设置系统时区并启用 NTP 时间同步", + Phase: PhasePreJoin, + Enabled: false, + Order: 5, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-timezone-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +TIMEZONE="${TIMEZONE:-Asia/Shanghai}" + +echo "Configuring timezone to $TIMEZONE..." 
+timedatectl set-timezone $TIMEZONE || true + +echo "Enabling and starting NTP service..." +if systemctl list-unit-files | grep -q chronyd; then + systemctl enable chronyd || true + systemctl start chronyd || true +elif systemctl list-unit-files | grep -q ntpd; then + systemctl enable ntpd || true + systemctl start ntpd || true +elif systemctl list-unit-files | grep -q systemd-timesyncd; then + systemctl enable systemd-timesyncd || true + systemctl start systemd-timesyncd || true +fi + +echo "Timezone and NTP configured successfully" +`, + }, + }, + }, + { + ID: "configure-registry", + Name: "配置私有镜像仓库", + Description: "配置 containerd 使用私有镜像仓库(支持 HTTP)", + Phase: PhasePreJoin, + Enabled: false, + Order: 6, + Builtin: true, + Scripts: []Script{ + { + ID: "configure-registry-ubuntu", + OS: "ubuntu", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, + }, + { + ID: "configure-registry-debian", + OS: "debian", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo 
"Private registry configured successfully" +`, + }, + { + ID: "configure-registry-centos", + OS: "centos", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, + }, + { + ID: "configure-registry-rhel", + OS: "rhel", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, + }, + { + ID: "configure-registry-openeuler", + OS: "openEuler", + Arch: "*", + Content: `#!/bin/bash +set -e +REGISTRY_URL="${REGISTRY_URL:-registry.example.com:5000}" + +echo "Configuring private registry: $REGISTRY_URL" + +# Create registry config directory +mkdir -p /etc/containerd/certs.d/${REGISTRY_URL} + +# Configure registry mirror +cat > /etc/containerd/certs.d/${REGISTRY_URL}/hosts.toml << EOF +server = "http://${REGISTRY_URL}" + +[host."http://${REGISTRY_URL}"] + capabilities = ["pull", "resolve", "push"] + skip_verify = true +EOF + +# Restart containerd +systemctl restart containerd +echo "Private registry configured successfully" +`, 
+ }, + }, + }, + { + ID: "add-node-labels", + Name: "添加节点标签", + Description: "为节点添加 Worker 角色标签", + Phase: PhasePostJoin, + Enabled: false, + Order: 7, + Builtin: true, + Scripts: []Script{ + { + ID: "add-node-labels-universal", + OS: "*", + Arch: "*", + Content: `#!/bin/bash +set -e +echo "Adding worker label to node ${NODE_NAME}..." + +# Wait for node to be registered +sleep 5 + +# Add worker role label +kubectl label node ${NODE_NAME} node-role.kubernetes.io/worker= --overwrite || true + +echo "Node label added successfully" +`, + }, + }, + }, + }, + } +} + +// ReplaceVariables replaces variables in the script content +func ReplaceVariables(content string, vars map[string]string) string { + result := content + for key, value := range vars { + placeholder := "${" + key + "}" + result = strings.ReplaceAll(result, placeholder, value) + } + return result +} diff --git a/api-server/internal/service/onboarding_service.go b/api-server/internal/service/onboarding_service.go new file mode 100644 index 0000000..ccf983b --- /dev/null +++ b/api-server/internal/service/onboarding_service.go @@ -0,0 +1,760 @@ +package service + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/bison/api-server/internal/k8s" + "github.com/bison/api-server/internal/ssh" + "github.com/bison/api-server/pkg/logger" +) + +// Ensure metav1 is used +var _ = metav1.Now + +const ( + OnboardingJobsConfigMap = "bison-onboarding-jobs" +) + +// OnboardingJobStatus represents the status of an onboarding job +type OnboardingJobStatus string + +const ( + JobStatusPending OnboardingJobStatus = "pending" + JobStatusRunning OnboardingJobStatus = "running" + JobStatusSuccess OnboardingJobStatus = "success" + JobStatusFailed OnboardingJobStatus = "failed" + JobStatusCancelled OnboardingJobStatus = "cancelled" +) + +// SubStepStatus represents the status of a 
sub-step +type SubStepStatus string + +const ( + SubStepPending SubStepStatus = "pending" + SubStepRunning SubStepStatus = "running" + SubStepSuccess SubStepStatus = "success" + SubStepFailed SubStepStatus = "failed" + SubStepSkipped SubStepStatus = "skipped" +) + +// SubStep represents a sub-step within a main step +type SubStep struct { + Name string `json:"name"` + Status SubStepStatus `json:"status"` + Error string `json:"error,omitempty"` +} + +// OnboardingJob represents a node onboarding job +type OnboardingJob struct { + ID string `json:"id"` + NodeIP string `json:"nodeIP"` + NodeName string `json:"nodeName,omitempty"` + Platform NodePlatform `json:"platform"` + Status OnboardingJobStatus `json:"status"` + CurrentStep int `json:"currentStep"` + TotalSteps int `json:"totalSteps"` + StepMessage string `json:"stepMessage"` + SubSteps []SubStep `json:"subSteps,omitempty"` + ErrorMessage string `json:"errorMessage,omitempty"` + CreatedAt time.Time `json:"createdAt"` + UpdatedAt time.Time `json:"updatedAt"` + CompletedAt *time.Time `json:"completedAt,omitempty"` +} + +// OnboardingRequest represents a request to onboard a new node +type OnboardingRequest struct { + NodeIP string `json:"nodeIP" binding:"required"` + SSHPort int `json:"sshPort"` + SSHUsername string `json:"sshUsername" binding:"required"` + AuthMethod string `json:"authMethod" binding:"required,oneof=password privateKey"` + Password string `json:"password"` + PrivateKey string `json:"privateKey"` +} + +// OnboardingService handles node onboarding operations +type OnboardingService struct { + k8sClient *k8s.Client + nodeSvc *NodeService + initScriptSvc *InitScriptService + runningJobs map[string]context.CancelFunc + runningJobsMu sync.RWMutex +} + +// NewOnboardingService creates a new OnboardingService +func NewOnboardingService(k8sClient *k8s.Client, nodeSvc *NodeService, initScriptSvc *InitScriptService) *OnboardingService { + return &OnboardingService{ + k8sClient: k8sClient, + nodeSvc: nodeSvc, 
+ initScriptSvc: initScriptSvc, + runningJobs: make(map[string]context.CancelFunc), + } +} + +// StartOnboarding starts a new node onboarding job +func (s *OnboardingService) StartOnboarding(ctx context.Context, req *OnboardingRequest) (*OnboardingJob, error) { + logger.Info("Starting node onboarding", "nodeIP", req.NodeIP) + + // Set defaults + if req.SSHPort == 0 { + req.SSHPort = 22 + } + + // Validate authentication + if req.AuthMethod == "password" && req.Password == "" { + return nil, fmt.Errorf("password is required for password authentication") + } + if req.AuthMethod == "privateKey" && req.PrivateKey == "" { + return nil, fmt.Errorf("private key is required for private key authentication") + } + + // Check if there's already a running job for this IP + jobs, err := s.ListJobs(ctx) + if err != nil { + return nil, err + } + for _, job := range jobs { + if job.NodeIP == req.NodeIP && (job.Status == JobStatusPending || job.Status == JobStatusRunning) { + return nil, fmt.Errorf("there is already a running onboarding job for this IP: %s", job.ID) + } + } + + // Create job + job := &OnboardingJob{ + ID: fmt.Sprintf("job-%d", time.Now().UnixNano()), + NodeIP: req.NodeIP, + Status: JobStatusPending, + CurrentStep: 0, + TotalSteps: 9, + StepMessage: "Job created, waiting to start", + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } + + // Save job + if err := s.saveJob(ctx, job); err != nil { + return nil, err + } + + // Start async execution + jobCtx, cancel := context.WithCancel(context.Background()) + s.runningJobsMu.Lock() + s.runningJobs[job.ID] = cancel + s.runningJobsMu.Unlock() + + go s.executeOnboarding(jobCtx, job, req) + + return job, nil +} + +// GetJob returns a specific job by ID +func (s *OnboardingService) GetJob(ctx context.Context, jobID string) (*OnboardingJob, error) { + jobs, err := s.getJobsMap(ctx) + if err != nil { + return nil, err + } + + jobData, ok := jobs[jobID] + if !ok { + return nil, fmt.Errorf("job not found: %s", jobID) + } + + 
var job OnboardingJob + if err := json.Unmarshal([]byte(jobData), &job); err != nil { + return nil, fmt.Errorf("failed to parse job data: %w", err) + } + + return &job, nil +} + +// ListJobs returns all onboarding jobs +func (s *OnboardingService) ListJobs(ctx context.Context) ([]*OnboardingJob, error) { + jobs, err := s.getJobsMap(ctx) + if err != nil { + return nil, err + } + + result := make([]*OnboardingJob, 0, len(jobs)) + for _, jobData := range jobs { + var job OnboardingJob + if err := json.Unmarshal([]byte(jobData), &job); err != nil { + continue + } + result = append(result, &job) + } + + return result, nil +} + +// CancelJob cancels a running job +func (s *OnboardingService) CancelJob(ctx context.Context, jobID string) error { + logger.Info("Cancelling onboarding job", "jobID", jobID) + + job, err := s.GetJob(ctx, jobID) + if err != nil { + return err + } + + if job.Status != JobStatusPending && job.Status != JobStatusRunning { + return fmt.Errorf("job is not running: %s", job.Status) + } + + // Cancel the job context + s.runningJobsMu.Lock() + if cancel, ok := s.runningJobs[jobID]; ok { + cancel() + delete(s.runningJobs, jobID) + } + s.runningJobsMu.Unlock() + + // Update job status + job.Status = JobStatusCancelled + job.StepMessage = "Job cancelled by user" + job.UpdatedAt = time.Now() + now := time.Now() + job.CompletedAt = &now + + return s.saveJob(ctx, job) +} + +// executeOnboarding executes the onboarding process +func (s *OnboardingService) executeOnboarding(ctx context.Context, job *OnboardingJob, req *OnboardingRequest) { + defer func() { + s.runningJobsMu.Lock() + delete(s.runningJobs, job.ID) + s.runningJobsMu.Unlock() + }() + + // Update job status to running + job.Status = JobStatusRunning + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Create SSH executor for target node + sshConfig := &ssh.Config{ + Host: req.NodeIP, + Port: req.SSHPort, + Username: req.SSHUsername, + AuthMethod: ssh.AuthMethod(req.AuthMethod), 
+ Password: req.Password, + PrivateKey: req.PrivateKey, + Timeout: 30 * time.Second, + } + executor := ssh.NewExecutor(sshConfig) + defer executor.Close() + + // Step 1: Connection test + if err := s.stepConnectionTest(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 2: Platform detection + if err := s.stepPlatformDetection(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 3: Environment check + if err := s.stepEnvironmentCheck(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 4: Pre-join scripts + if err := s.stepPreJoinScripts(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 5: Get join token + joinCommand, err := s.stepGetJoinToken(ctx, job) + if err != nil { + s.failJob(job, err) + return + } + + // Step 6: Execute kubeadm join + if err := s.stepKubeadmJoin(ctx, job, executor, joinCommand); err != nil { + s.failJob(job, err) + return + } + + // Step 7: Post-join scripts + if err := s.stepPostJoinScripts(ctx, job, executor); err != nil { + s.failJob(job, err) + return + } + + // Step 8: Wait for node ready + if err := s.stepWaitForNodeReady(ctx, job); err != nil { + s.failJob(job, err) + return + } + + // Step 9: Enable node + if err := s.stepEnableNode(ctx, job); err != nil { + s.failJob(job, err) + return + } + + // Mark job as successful + job.Status = JobStatusSuccess + job.StepMessage = "Node onboarding completed successfully" + job.UpdatedAt = time.Now() + now := time.Now() + job.CompletedAt = &now + s.saveJob(context.Background(), job) + + logger.Info("Node onboarding completed successfully", "nodeIP", job.NodeIP, "nodeName", job.NodeName) +} + +func (s *OnboardingService) stepConnectionTest(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 1 + job.StepMessage = "Testing SSH connection..." 
+ job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + if err := executor.TestConnection(ctx); err != nil { + return fmt.Errorf("SSH connection test failed: %w", err) + } + + return nil +} + +func (s *OnboardingService) stepPlatformDetection(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 2 + job.StepMessage = "Detecting node platform..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + info, err := executor.GetHostInfo(ctx) + if err != nil { + return fmt.Errorf("failed to detect platform: %w", err) + } + + job.Platform = NodePlatform{ + OS: info["os"], + Version: info["version"], + Arch: info["arch"], + } + + if info["hostname"] != "" { + job.NodeName = info["hostname"] + } + + job.StepMessage = fmt.Sprintf("Detected: %s %s (%s)", job.Platform.OS, job.Platform.Version, job.Platform.Arch) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + return nil +} + +func (s *OnboardingService) stepEnvironmentCheck(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 3 + job.StepMessage = "Checking environment..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Check if kubeadm is installed + if !executor.CheckCommand(ctx, "kubeadm") { + return fmt.Errorf("kubeadm is not installed on the target node") + } + + // Check if kubelet is installed + if !executor.CheckCommand(ctx, "kubelet") { + return fmt.Errorf("kubelet is not installed on the target node") + } + + return nil +} + +func (s *OnboardingService) stepPreJoinScripts(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 4 + job.StepMessage = "Executing pre-join scripts..." 
+ job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Get init scripts for pre-join phase + scripts, err := s.initScriptSvc.GetScriptsForPhase(ctx, PhasePreJoin, job.Platform) + if err != nil { + return fmt.Errorf("failed to get pre-join scripts: %w", err) + } + + if len(scripts) == 0 { + job.StepMessage = "No pre-join scripts to execute" + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + return nil + } + + // Initialize sub-steps + job.SubSteps = make([]SubStep, len(scripts)) + for i, script := range scripts { + job.SubSteps[i] = SubStep{ + Name: script.Group.Name, + Status: SubStepPending, + } + } + s.saveJob(context.Background(), job) + + // Get variables for script replacement + cpConfig, _ := s.initScriptSvc.GetControlPlaneConfig(ctx) + controlPlaneIP := "" + if cpConfig != nil { + controlPlaneIP = cpConfig.Host + } + vars := map[string]string{ + "NODE_IP": job.NodeIP, + "NODE_NAME": job.NodeName, + "CONTROL_PLANE_IP": controlPlaneIP, + } + + // Execute scripts + for stepIdx, script := range scripts { + job.SubSteps[stepIdx].Status = SubStepRunning + job.StepMessage = fmt.Sprintf("Executing: %s", script.Group.Name) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Replace variables in script content + content := ReplaceVariables(script.Script.Content, vars) + + // Execute script + result := executor.ExecuteScript(ctx, content) + if result.Error != nil || result.ExitCode != 0 { + job.SubSteps[stepIdx].Status = SubStepFailed + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + job.SubSteps[stepIdx].Error = errMsg + s.saveJob(context.Background(), job) + return fmt.Errorf("pre-join script '%s' failed: %s", script.Group.Name, errMsg) + } + + job.SubSteps[stepIdx].Status = SubStepSuccess + s.saveJob(context.Background(), job) + } + + job.SubSteps = nil // Clear sub-steps after completion + return nil +} + +func (s *OnboardingService) stepGetJoinToken(ctx 
context.Context, job *OnboardingJob) (string, error) { + job.CurrentStep = 5 + job.StepMessage = "Getting join token from control plane..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Get control plane config + cpConfig, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + return "", fmt.Errorf("failed to get control plane config: %w", err) + } + + if cpConfig.Host == "" { + return "", fmt.Errorf("control plane host is not configured") + } + + // Create SSH executor for control plane + cpSSHConfig := &ssh.Config{ + Host: cpConfig.Host, + Port: cpConfig.SSHPort, + Username: cpConfig.SSHUser, + AuthMethod: ssh.AuthMethod(cpConfig.AuthMethod), + Password: cpConfig.Password, + PrivateKey: cpConfig.PrivateKey, + Timeout: 30 * time.Second, + } + cpExecutor := ssh.NewExecutor(cpSSHConfig) + defer cpExecutor.Close() + + if err := cpExecutor.Connect(ctx); err != nil { + return "", fmt.Errorf("failed to connect to control plane: %w", err) + } + + // Generate join command + result := cpExecutor.Execute(ctx, "kubeadm token create --print-join-command") + if result.Error != nil || result.ExitCode != 0 { + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + return "", fmt.Errorf("failed to generate join command: %s", errMsg) + } + + joinCommand := result.Stdout + if joinCommand == "" { + return "", fmt.Errorf("empty join command returned") + } + + return joinCommand, nil +} + +func (s *OnboardingService) stepKubeadmJoin(ctx context.Context, job *OnboardingJob, executor *ssh.Executor, joinCommand string) error { + job.CurrentStep = 6 + job.StepMessage = "Executing kubeadm join..." 
+ job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Execute kubeadm join with a longer timeout + joinCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + + result := executor.Execute(joinCtx, joinCommand) + if result.Error != nil || result.ExitCode != 0 { + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + return fmt.Errorf("kubeadm join failed: %s", errMsg) + } + + return nil +} + +func (s *OnboardingService) stepPostJoinScripts(ctx context.Context, job *OnboardingJob, executor *ssh.Executor) error { + job.CurrentStep = 7 + job.StepMessage = "Executing post-join scripts..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Get init scripts for post-join phase + scripts, err := s.initScriptSvc.GetScriptsForPhase(ctx, PhasePostJoin, job.Platform) + if err != nil { + return fmt.Errorf("failed to get post-join scripts: %w", err) + } + + if len(scripts) == 0 { + job.StepMessage = "No post-join scripts to execute" + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + return nil + } + + // Initialize sub-steps + job.SubSteps = make([]SubStep, len(scripts)) + for i, script := range scripts { + job.SubSteps[i] = SubStep{ + Name: script.Group.Name, + Status: SubStepPending, + } + } + s.saveJob(context.Background(), job) + + // Get variables for script replacement + cpConfig, _ := s.initScriptSvc.GetControlPlaneConfig(ctx) + controlPlaneIP := "" + if cpConfig != nil { + controlPlaneIP = cpConfig.Host + } + vars := map[string]string{ + "NODE_IP": job.NodeIP, + "NODE_NAME": job.NodeName, + "CONTROL_PLANE_IP": controlPlaneIP, + } + + // Execute scripts + for stepIdx, script := range scripts { + job.SubSteps[stepIdx].Status = SubStepRunning + job.StepMessage = fmt.Sprintf("Executing: %s", script.Group.Name) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Replace variables in script content + content := 
ReplaceVariables(script.Script.Content, vars) + + // Execute script + result := executor.ExecuteScript(ctx, content) + if result.Error != nil || result.ExitCode != 0 { + job.SubSteps[stepIdx].Status = SubStepFailed + errMsg := result.Stderr + if result.Error != nil { + errMsg = result.Error.Error() + } + job.SubSteps[stepIdx].Error = errMsg + s.saveJob(context.Background(), job) + return fmt.Errorf("post-join script '%s' failed: %s", script.Group.Name, errMsg) + } + + job.SubSteps[stepIdx].Status = SubStepSuccess + s.saveJob(context.Background(), job) + } + + job.SubSteps = nil // Clear sub-steps after completion + return nil +} + +func (s *OnboardingService) stepWaitForNodeReady(ctx context.Context, job *OnboardingJob) error { + job.CurrentStep = 8 + job.StepMessage = "Waiting for node to be ready..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + // Wait for node to appear and become ready + timeout := time.After(5 * time.Minute) + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timeout: + return fmt.Errorf("timeout waiting for node to be ready") + case <-ticker.C: + // Try to find the node + nodes, err := s.k8sClient.ListNodes(ctx) + if err != nil { + continue + } + + for _, node := range nodes.Items { + // Match by IP or hostname + nodeIP := "" + for _, addr := range node.Status.Addresses { + if addr.Type == corev1.NodeInternalIP { + nodeIP = addr.Address + break + } + } + + if nodeIP == job.NodeIP || node.Name == job.NodeName { + job.NodeName = node.Name + + // Check if node is ready + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + job.StepMessage = fmt.Sprintf("Node %s is ready", node.Name) + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + return nil + } + } + } + } + } + } +} + +func (s *OnboardingService) stepEnableNode(ctx context.Context, job 
*OnboardingJob) error { + job.CurrentStep = 9 + job.StepMessage = "Enabling node in Bison..." + job.UpdatedAt = time.Now() + s.saveJob(context.Background(), job) + + if job.NodeName == "" { + return fmt.Errorf("node name is not set") + } + + // Enable node in Bison (add to shared pool) + if err := s.nodeSvc.EnableNode(ctx, job.NodeName); err != nil { + return fmt.Errorf("failed to enable node: %w", err) + } + + return nil +} + +func (s *OnboardingService) failJob(job *OnboardingJob, err error) { + job.Status = JobStatusFailed + job.ErrorMessage = err.Error() + job.UpdatedAt = time.Now() + now := time.Now() + job.CompletedAt = &now + s.saveJob(context.Background(), job) + + logger.Error("Node onboarding failed", "nodeIP", job.NodeIP, "error", err) +} + +func (s *OnboardingService) saveJob(ctx context.Context, job *OnboardingJob) error { + data, err := json.Marshal(job) + if err != nil { + return fmt.Errorf("failed to marshal job: %w", err) + } + + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, OnboardingJobsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + // Create new ConfigMap + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: OnboardingJobsConfigMap, + Namespace: BisonNamespace, + }, + Data: map[string]string{ + job.ID: string(data), + }, + } + return s.k8sClient.CreateConfigMap(ctx, BisonNamespace, cm) + } + return fmt.Errorf("failed to get jobs config: %w", err) + } + + // Update existing ConfigMap + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data[job.ID] = string(data) + + return s.k8sClient.UpdateConfigMap(ctx, BisonNamespace, cm) +} + +func (s *OnboardingService) getJobsMap(ctx context.Context) (map[string]string, error) { + cm, err := s.k8sClient.GetConfigMap(ctx, BisonNamespace, OnboardingJobsConfigMap) + if err != nil { + if errors.IsNotFound(err) { + return make(map[string]string), nil + } + return nil, fmt.Errorf("failed to get jobs config: %w", err) + } + + if cm.Data == nil { + return 
make(map[string]string), nil + } + + return cm.Data, nil +} + +// TestControlPlaneConnection tests the SSH connection to the control plane +func (s *OnboardingService) TestControlPlaneConnection(ctx context.Context) error { + cpConfig, err := s.initScriptSvc.GetControlPlaneConfig(ctx) + if err != nil { + return fmt.Errorf("failed to get control plane config: %w", err) + } + + if cpConfig.Host == "" { + return fmt.Errorf("control plane host is not configured") + } + + sshConfig := &ssh.Config{ + Host: cpConfig.Host, + Port: cpConfig.SSHPort, + Username: cpConfig.SSHUser, + AuthMethod: ssh.AuthMethod(cpConfig.AuthMethod), + Password: cpConfig.Password, + PrivateKey: cpConfig.PrivateKey, + Timeout: 30 * time.Second, + } + + executor := ssh.NewExecutor(sshConfig) + defer executor.Close() + + if err := executor.TestConnection(ctx); err != nil { + return fmt.Errorf("SSH connection test failed: %w", err) + } + + // Also verify kubeadm is available + if !executor.CheckCommand(ctx, "kubeadm") { + return fmt.Errorf("kubeadm is not available on the control plane") + } + + return nil +} diff --git a/api-server/internal/service/settings_service.go b/api-server/internal/service/settings_service.go index 0a561f4..d9c506d 100644 --- a/api-server/internal/service/settings_service.go +++ b/api-server/internal/service/settings_service.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "net/http" + "net/url" "time" ) @@ -48,28 +49,66 @@ type PrometheusMetric struct { Value float64 `json:"value"` } +// LabeledMetricSeries represents a Prometheus metric series with labels +type LabeledMetricSeries struct { + Labels map[string]string `json:"labels"` + Metrics []PrometheusMetric `json:"metrics"` +} + // NodeMetrics represents metrics for a node type NodeMetrics struct { CPUUsage []PrometheusMetric `json:"cpuUsage"` MemoryUsage []PrometheusMetric `json:"memoryUsage"` + // Network IO + NetworkReceive []PrometheusMetric `json:"networkReceive,omitempty"` + NetworkTransmit []PrometheusMetric 
`json:"networkTransmit,omitempty"` + // RDMA IO + RdmaReceive []PrometheusMetric `json:"rdmaReceive,omitempty"` + RdmaTransmit []PrometheusMetric `json:"rdmaTransmit,omitempty"` + // GPU (NVIDIA DCGM) + GpuUtilization []PrometheusMetric `json:"gpuUtilization,omitempty"` + GpuMemoryUtil []PrometheusMetric `json:"gpuMemoryUtil,omitempty"` + GpuPerDevice []LabeledMetricSeries `json:"gpuPerDevice,omitempty"` + // NPU (Huawei Ascend) + NpuUtilization []PrometheusMetric `json:"npuUtilization,omitempty"` + NpuMemoryUtil []PrometheusMetric `json:"npuMemoryUtil,omitempty"` + NpuTemperature []PrometheusMetric `json:"npuTemperature,omitempty"` } -// QueryPrometheus queries Prometheus API -func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]PrometheusMetric, error) { +// NodeMetricsRequest holds parameters for querying node metrics +type NodeMetricsRequest struct { + NodeName string + Hours int + HasGpu bool + HasNpu bool +} + +// prometheusResponse is the JSON structure returned by Prometheus query_range API +type prometheusResponse struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []struct { + Metric map[string]string `json:"metric"` + Values [][]interface{} `json:"values"` + } `json:"result"` + } `json:"data"` +} + +// queryPrometheusRaw executes a Prometheus range query and returns the raw response +func (s *SettingsService) queryPrometheusRaw(ctx context.Context, query string, start, end time.Time, step time.Duration) (*prometheusResponse, error) { if s.prometheusURL == "" { return nil, fmt.Errorf("prometheus URL not configured") } - // Build query URL - url := fmt.Sprintf("%s/api/v1/query_range?query=%s&start=%d&end=%d&step=%d", - s.prometheusURL, - query, - start.Unix(), - end.Unix(), - int(step.Seconds()), - ) + params := url.Values{} + params.Set("query", query) + params.Set("start", fmt.Sprintf("%d", start.Unix())) + params.Set("end", 
fmt.Sprintf("%d", end.Unix())) + params.Set("step", fmt.Sprintf("%d", int(step.Seconds()))) + fullURL := fmt.Sprintf("%s/api/v1/query_range?%s", s.prometheusURL, params.Encode()) - req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + req, err := http.NewRequestWithContext(ctx, "GET", fullURL, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } @@ -86,17 +125,7 @@ func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, sta return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body)) } - var result struct { - Status string `json:"status"` - Data struct { - ResultType string `json:"resultType"` - Result []struct { - Metric map[string]string `json:"metric"` - Values [][]interface{} `json:"values"` - } `json:"result"` - } `json:"data"` - } - + var result prometheusResponse if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { return nil, fmt.Errorf("failed to decode response: %w", err) } @@ -105,51 +134,127 @@ func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, sta return nil, fmt.Errorf("prometheus query failed") } + return &result, nil +} + +// parseMetricValues extracts PrometheusMetric slice from raw Prometheus values +func parseMetricValues(values [][]interface{}) []PrometheusMetric { var metrics []PrometheusMetric - if len(result.Data.Result) > 0 { - for _, v := range result.Data.Result[0].Values { - if len(v) >= 2 { - ts, _ := v[0].(float64) - val := 0.0 - switch vv := v[1].(type) { - case string: - fmt.Sscanf(vv, "%f", &val) - case float64: - val = vv - } - metrics = append(metrics, PrometheusMetric{ - Timestamp: ts, - Value: val, - }) + for _, v := range values { + if len(v) >= 2 { + ts, _ := v[0].(float64) + val := 0.0 + switch vv := v[1].(type) { + case string: + fmt.Sscanf(vv, "%f", &val) + case float64: + val = vv } + metrics = append(metrics, PrometheusMetric{ + Timestamp: ts, + Value: val, + }) } } + return metrics +} + +// 
QueryPrometheus queries Prometheus API and returns the first result series +func (s *SettingsService) QueryPrometheus(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]PrometheusMetric, error) { + result, err := s.queryPrometheusRaw(ctx, query, start, end, step) + if err != nil { + return nil, err + } - return metrics, nil + if len(result.Data.Result) > 0 { + return parseMetricValues(result.Data.Result[0].Values), nil + } + + return nil, nil +} + +// QueryPrometheusMultiSeries queries Prometheus API and returns all result series with labels +func (s *SettingsService) QueryPrometheusMultiSeries(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]LabeledMetricSeries, error) { + result, err := s.queryPrometheusRaw(ctx, query, start, end, step) + if err != nil { + return nil, err + } + + var series []LabeledMetricSeries + for _, r := range result.Data.Result { + series = append(series, LabeledMetricSeries{ + Labels: r.Metric, + Metrics: parseMetricValues(r.Values), + }) + } + + return series, nil } // GetNodeMetrics returns metrics for a specific node -func (s *SettingsService) GetNodeMetrics(ctx context.Context, nodeName string, hours int) (*NodeMetrics, error) { +func (s *SettingsService) GetNodeMetrics(ctx context.Context, req NodeMetricsRequest) (*NodeMetrics, error) { end := time.Now() - start := end.Add(-time.Duration(hours) * time.Hour) + start := end.Add(-time.Duration(req.Hours) * time.Hour) step := time.Minute * 5 + node := req.NodeName - // Query CPU usage - cpuQuery := fmt.Sprintf(`100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle", instance=~"%s.*"}[5m])) * 100)`, nodeName) - cpuMetrics, err := s.QueryPrometheus(ctx, cpuQuery, start, end, step) - if err != nil { - cpuMetrics = nil // Non-fatal, continue + result := &NodeMetrics{} + + // --- Always query: CPU, Memory, Network, RDMA --- + + // CPU usage (%) + cpuQuery := fmt.Sprintf(`100 - (avg by(instance) 
(rate(node_cpu_seconds_total{mode="idle", instance=~"%s.*"}[5m])) * 100)`, node) + result.CPUUsage, _ = s.QueryPrometheus(ctx, cpuQuery, start, end, step) + + // Memory usage (%) + memQuery := fmt.Sprintf(`(1 - (node_memory_MemAvailable_bytes{instance=~"%s.*"} / node_memory_MemTotal_bytes{instance=~"%s.*"})) * 100`, node, node) + result.MemoryUsage, _ = s.QueryPrometheus(ctx, memQuery, start, end, step) + + // Network receive (bytes/sec, excluding virtual interfaces) + netRecvQuery := fmt.Sprintf(`sum(rate(node_network_receive_bytes_total{instance=~"%s.*",device!~"lo|docker.*|veth.*|br.*|cni.*|flannel.*|cali.*|tunl.*|kube.*|virbr.*"}[5m]))`, node) + result.NetworkReceive, _ = s.QueryPrometheus(ctx, netRecvQuery, start, end, step) + + // Network transmit (bytes/sec) + netTransQuery := fmt.Sprintf(`sum(rate(node_network_transmit_bytes_total{instance=~"%s.*",device!~"lo|docker.*|veth.*|br.*|cni.*|flannel.*|cali.*|tunl.*|kube.*|virbr.*"}[5m]))`, node) + result.NetworkTransmit, _ = s.QueryPrometheus(ctx, netTransQuery, start, end, step) + + // RDMA receive (bytes/sec, InfiniBand via node_exporter) + rdmaRecvQuery := fmt.Sprintf(`sum(rate(node_infiniband_port_data_received_bytes_total{instance=~"%s.*"}[5m]))`, node) + result.RdmaReceive, _ = s.QueryPrometheus(ctx, rdmaRecvQuery, start, end, step) + + // RDMA transmit (bytes/sec) + rdmaTransQuery := fmt.Sprintf(`sum(rate(node_infiniband_port_data_transmitted_bytes_total{instance=~"%s.*"}[5m]))`, node) + result.RdmaTransmit, _ = s.QueryPrometheus(ctx, rdmaTransQuery, start, end, step) + + // --- Conditional: GPU (DCGM) --- + if req.HasGpu { + // Average GPU SM utilization (%) + gpuUtilQuery := fmt.Sprintf(`avg(DCGM_FI_DEV_GPU_UTIL{Hostname="%s"} or DCGM_FI_DEV_GPU_UTIL{instance=~"%s.*"})`, node, node) + result.GpuUtilization, _ = s.QueryPrometheus(ctx, gpuUtilQuery, start, end, step) + + // Average GPU memory utilization (%) + gpuMemQuery := fmt.Sprintf(`avg(DCGM_FI_DEV_MEM_COPY_UTIL{Hostname="%s"} or 
DCGM_FI_DEV_MEM_COPY_UTIL{instance=~"%s.*"})`, node, node) + result.GpuMemoryUtil, _ = s.QueryPrometheus(ctx, gpuMemQuery, start, end, step) + + // Per-GPU SM utilization (multi-series) + gpuPerDeviceQuery := fmt.Sprintf(`DCGM_FI_DEV_GPU_UTIL{Hostname="%s"} or DCGM_FI_DEV_GPU_UTIL{instance=~"%s.*"}`, node, node) + result.GpuPerDevice, _ = s.QueryPrometheusMultiSeries(ctx, gpuPerDeviceQuery, start, end, step) } - // Query memory usage - memQuery := fmt.Sprintf(`(1 - (node_memory_MemAvailable_bytes{instance=~"%s.*"} / node_memory_MemTotal_bytes{instance=~"%s.*"})) * 100`, nodeName, nodeName) - memMetrics, err := s.QueryPrometheus(ctx, memQuery, start, end, step) - if err != nil { - memMetrics = nil // Non-fatal, continue + // --- Conditional: NPU (Huawei Ascend) --- + if req.HasNpu { + // NPU utilization (%) + npuUtilQuery := fmt.Sprintf(`avg(npu_chip_info_utilization{id=~"%s.*"})`, node) + result.NpuUtilization, _ = s.QueryPrometheus(ctx, npuUtilQuery, start, end, step) + + // NPU HBM usage (%) + npuMemQuery := fmt.Sprintf(`avg(npu_chip_info_hbm_usage{id=~"%s.*"})`, node) + result.NpuMemoryUtil, _ = s.QueryPrometheus(ctx, npuMemQuery, start, end, step) + + // NPU temperature (°C) + npuTempQuery := fmt.Sprintf(`avg(npu_chip_info_temperature{id=~"%s.*"})`, node) + result.NpuTemperature, _ = s.QueryPrometheus(ctx, npuTempQuery, start, end, step) } - return &NodeMetrics{ - CPUUsage: cpuMetrics, - MemoryUsage: memMetrics, - }, nil + return result, nil } diff --git a/api-server/internal/ssh/executor.go b/api-server/internal/ssh/executor.go new file mode 100644 index 0000000..e1feb9e --- /dev/null +++ b/api-server/internal/ssh/executor.go @@ -0,0 +1,370 @@ +package ssh + +import ( + "bytes" + "context" + "fmt" + "io" + "net" + "strings" + "sync" + "time" + + "golang.org/x/crypto/ssh" +) + +// AuthMethod represents the SSH authentication method +type AuthMethod string + +const ( + AuthMethodPassword AuthMethod = "password" + AuthMethodPrivateKey AuthMethod = "privateKey" +) 
+ +// Config holds SSH connection configuration +type Config struct { + Host string + Port int + Username string + AuthMethod AuthMethod + Password string + PrivateKey string // PEM encoded private key content + Timeout time.Duration +} + +// CommandResult holds the result of a remote command execution +type CommandResult struct { + Stdout string + Stderr string + ExitCode int + Error error +} + +// Executor handles SSH connections and remote command execution +type Executor struct { + config *Config + client *ssh.Client + mu sync.Mutex +} + +// NewExecutor creates a new SSH executor with the given configuration +func NewExecutor(config *Config) *Executor { + if config.Port == 0 { + config.Port = 22 + } + if config.Timeout == 0 { + config.Timeout = 30 * time.Second + } + return &Executor{ + config: config, + } +} + +// Connect establishes an SSH connection to the remote host +func (e *Executor) Connect(ctx context.Context) error { + e.mu.Lock() + defer e.mu.Unlock() + + if e.client != nil { + return nil // Already connected + } + + var authMethods []ssh.AuthMethod + + switch e.config.AuthMethod { + case AuthMethodPassword: + if e.config.Password == "" { + return fmt.Errorf("password is required for password authentication") + } + authMethods = append(authMethods, ssh.Password(e.config.Password)) + + case AuthMethodPrivateKey: + if e.config.PrivateKey == "" { + return fmt.Errorf("private key is required for private key authentication") + } + signer, err := ssh.ParsePrivateKey([]byte(e.config.PrivateKey)) + if err != nil { + return fmt.Errorf("failed to parse private key: %w", err) + } + authMethods = append(authMethods, ssh.PublicKeys(signer)) + + default: + return fmt.Errorf("unsupported authentication method: %s", e.config.AuthMethod) + } + + sshConfig := &ssh.ClientConfig{ + User: e.config.Username, + Auth: authMethods, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), // TODO: Consider using known_hosts in production + Timeout: e.config.Timeout, + } + + addr := 
fmt.Sprintf("%s:%d", e.config.Host, e.config.Port) + + // Use context for connection timeout + var client *ssh.Client + var err error + + done := make(chan struct{}) + go func() { + client, err = ssh.Dial("tcp", addr, sshConfig) + close(done) + }() + + select { + case <-ctx.Done(): + return ctx.Err() + case <-done: + if err != nil { + return fmt.Errorf("failed to connect to %s: %w", addr, err) + } + } + + e.client = client + return nil +} + +// Execute runs a command on the remote host and returns the result +func (e *Executor) Execute(ctx context.Context, command string) *CommandResult { + e.mu.Lock() + if e.client == nil { + e.mu.Unlock() + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("not connected"), + } + } + client := e.client + e.mu.Unlock() + + session, err := client.NewSession() + if err != nil { + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("failed to create session: %w", err), + } + } + defer session.Close() + + var stdout, stderr bytes.Buffer + session.Stdout = &stdout + session.Stderr = &stderr + + // Run command with context cancellation support + done := make(chan error, 1) + go func() { + done <- session.Run(command) + }() + + select { + case <-ctx.Done(): + // Try to close the session to stop the command + session.Close() + return &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: -1, + Error: ctx.Err(), + } + case err := <-done: + result := &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: 0, + } + + if err != nil { + if exitErr, ok := err.(*ssh.ExitError); ok { + result.ExitCode = exitErr.ExitStatus() + } else { + result.ExitCode = -1 + result.Error = err + } + } + + return result + } +} + +// ExecuteWithTimeout runs a command with a specific timeout +func (e *Executor) ExecuteWithTimeout(command string, timeout time.Duration) *CommandResult { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + return e.Execute(ctx, 
command) +} + +// ExecuteScript executes a shell script on the remote host +// The script content is passed via stdin to avoid escaping issues +func (e *Executor) ExecuteScript(ctx context.Context, script string) *CommandResult { + e.mu.Lock() + if e.client == nil { + e.mu.Unlock() + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("not connected"), + } + } + client := e.client + e.mu.Unlock() + + session, err := client.NewSession() + if err != nil { + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("failed to create session: %w", err), + } + } + defer session.Close() + + var stdout, stderr bytes.Buffer + session.Stdout = &stdout + session.Stderr = &stderr + + // Pass script via stdin + stdin, err := session.StdinPipe() + if err != nil { + return &CommandResult{ + ExitCode: -1, + Error: fmt.Errorf("failed to create stdin pipe: %w", err), + } + } + + done := make(chan error, 1) + go func() { + done <- session.Run("bash -s") + }() + + // Write script to stdin + go func() { + defer stdin.Close() + io.WriteString(stdin, script) + }() + + select { + case <-ctx.Done(): + session.Close() + return &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: -1, + Error: ctx.Err(), + } + case err := <-done: + result := &CommandResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: 0, + } + + if err != nil { + if exitErr, ok := err.(*ssh.ExitError); ok { + result.ExitCode = exitErr.ExitStatus() + } else { + result.ExitCode = -1 + result.Error = err + } + } + + return result + } +} + +// TestConnection tests if the SSH connection can be established +func (e *Executor) TestConnection(ctx context.Context) error { + if err := e.Connect(ctx); err != nil { + return err + } + + // Run a simple command to verify the connection works + result := e.Execute(ctx, "echo ok") + if result.Error != nil { + return result.Error + } + if result.ExitCode != 0 { + return fmt.Errorf("connection test failed: %s", result.Stderr) + } + if 
strings.TrimSpace(result.Stdout) != "ok" { + return fmt.Errorf("unexpected response: %s", result.Stdout) + } + + return nil +} + +// Close closes the SSH connection +func (e *Executor) Close() error { + e.mu.Lock() + defer e.mu.Unlock() + + if e.client != nil { + err := e.client.Close() + e.client = nil + return err + } + return nil +} + +// IsConnected returns true if there is an active SSH connection +func (e *Executor) IsConnected() bool { + e.mu.Lock() + defer e.mu.Unlock() + return e.client != nil +} + +// GetHostInfo retrieves basic host information (OS, architecture, etc.) +func (e *Executor) GetHostInfo(ctx context.Context) (map[string]string, error) { + info := make(map[string]string) + + // Get OS information + osResult := e.Execute(ctx, "cat /etc/os-release 2>/dev/null | grep -E '^(ID|VERSION_ID)=' | cut -d'=' -f2 | tr -d '\"'") + if osResult.Error == nil && osResult.ExitCode == 0 { + lines := strings.Split(strings.TrimSpace(osResult.Stdout), "\n") + if len(lines) >= 1 { + info["os"] = strings.TrimSpace(lines[0]) + } + if len(lines) >= 2 { + info["version"] = strings.TrimSpace(lines[1]) + } + } + + // Get architecture + archResult := e.Execute(ctx, "uname -m") + if archResult.Error == nil && archResult.ExitCode == 0 { + arch := strings.TrimSpace(archResult.Stdout) + // Normalize architecture names + switch arch { + case "x86_64": + arch = "amd64" + case "aarch64": + arch = "arm64" + } + info["arch"] = arch + } + + // Get hostname + hostnameResult := e.Execute(ctx, "hostname") + if hostnameResult.Error == nil && hostnameResult.ExitCode == 0 { + info["hostname"] = strings.TrimSpace(hostnameResult.Stdout) + } + + return info, nil +} + +// CheckCommand checks if a command exists on the remote host +func (e *Executor) CheckCommand(ctx context.Context, command string) bool { + result := e.Execute(ctx, fmt.Sprintf("command -v %s", command)) + return result.Error == nil && result.ExitCode == 0 +} + +// DialFunc returns a function that can be used as a proxy 
dialer +func (e *Executor) DialFunc() func(network, addr string) (net.Conn, error) { + return func(network, addr string) (net.Conn, error) { + e.mu.Lock() + client := e.client + e.mu.Unlock() + + if client == nil { + return nil, fmt.Errorf("not connected") + } + return client.Dial(network, addr) + } +} diff --git a/deploy/charts/bison/templates/NOTES.txt b/deploy/charts/bison/templates/NOTES.txt index 7ad6d8b..7806fe9 100644 --- a/deploy/charts/bison/templates/NOTES.txt +++ b/deploy/charts/bison/templates/NOTES.txt @@ -9,27 +9,21 @@ Namespace: {{ .Release.Namespace }} 数据存储在 Kubernetes ConfigMaps 中,无需外部数据库 === 访问方式 === -{{- if .Values.apiServer.enabled }} -{{- if .Values.apiServer.ingress.enabled }} - -API Server: http://{{ .Values.apiServer.ingress.host }}/api/v1 -{{- else }} +{{- if and .Values.webUI.enabled .Values.ingress.enabled }} -API Server: - kubectl port-forward svc/{{ include "bison.apiServer.fullname" . }} 8080:{{ .Values.apiServer.service.port }} -n {{ .Release.Namespace }} - 访问: http://localhost:8080/api/v1 -{{- end }} -{{- end }} -{{- if .Values.webUI.enabled }} -{{- if .Values.webUI.ingress.enabled }} - -Web UI: http://{{ .Values.webUI.ingress.host }} -{{- else }} +Bison: http://{{ .Values.ingress.host }} + Web UI nginx 会自动代理 /api 请求到 API Server +{{- else if .Values.webUI.enabled }} Web UI: kubectl port-forward svc/{{ include "bison.webUI.fullname" . }} 3000:{{ .Values.webUI.service.port }} -n {{ .Release.Namespace }} 访问: http://localhost:3000 {{- end }} +{{- if .Values.apiServer.enabled }} + +API Server (调试): + kubectl port-forward svc/{{ include "bison.apiServer.fullname" . 
}} 8080:{{ .Values.apiServer.service.port }} -n {{ .Release.Namespace }} + 访问: http://localhost:8080/api/v1 {{- end }} === 认证 === diff --git a/deploy/charts/bison/templates/_helpers.tpl b/deploy/charts/bison/templates/_helpers.tpl index 1247b30..553da2d 100644 --- a/deploy/charts/bison/templates/_helpers.tpl +++ b/deploy/charts/bison/templates/_helpers.tpl @@ -76,14 +76,12 @@ Web UI full name {{- end }} {{/* -Get image registry +Build full image reference: global.imageRegistry/repository:tag +Usage: include "bison.image" (dict "imageConfig" .Values.apiServer.image "global" .Values.global "appVersion" .Chart.AppVersion) */}} -{{- define "bison.imageRegistry" -}} -{{- if .Values.global.imageRegistry }} -{{- printf "%s/" .Values.global.imageRegistry }} -{{- else }} -{{- "" }} -{{- end }} +{{- define "bison.image" -}} +{{- $tag := .imageConfig.tag | default .appVersion -}} +{{- printf "%s/%s:%s" .global.imageRegistry .imageConfig.repository $tag -}} {{- end }} {{/* diff --git a/deploy/charts/bison/templates/api-server/deployment.yaml b/deploy/charts/bison/templates/api-deployment.yaml similarity index 91% rename from deploy/charts/bison/templates/api-server/deployment.yaml rename to deploy/charts/bison/templates/api-deployment.yaml index b141063..31ecc56 100644 --- a/deploy/charts/bison/templates/api-server/deployment.yaml +++ b/deploy/charts/bison/templates/api-deployment.yaml @@ -30,7 +30,7 @@ spec: - name: api securityContext: {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ include "bison.imageRegistry" . }}{{ .Values.apiServer.image.repository }}:{{ .Values.apiServer.image.tag }}" + image: "{{ include "bison.image" (dict "imageConfig" .Values.apiServer.image "global" .Values.global "appVersion" .Chart.AppVersion) }}" imagePullPolicy: {{ .Values.apiServer.image.pullPolicy }} ports: - name: http @@ -56,6 +56,9 @@ spec: name: {{ if .Values.auth.jwt.existingSecret }}{{ .Values.auth.jwt.existingSecret }}{{ else }}{{ include "bison.authSecretName" . 
}}{{ end }} key: jwt-secret {{- end }} + # Capsule integration + - name: CAPSULE_ENABLED + value: {{ .Values.dependencies.capsule.enabled | quote }} # OpenCost integration {{- if .Values.dependencies.opencost.enabled }} - name: OPENCOST_URL diff --git a/deploy/charts/bison/templates/api-server/ingress.yaml b/deploy/charts/bison/templates/api-server/ingress.yaml deleted file mode 100644 index e979851..0000000 --- a/deploy/charts/bison/templates/api-server/ingress.yaml +++ /dev/null @@ -1,40 +0,0 @@ -{{- if and .Values.apiServer.enabled .Values.apiServer.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "bison.apiServer.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . | nindent 4 }} - app.kubernetes.io/component: api-server - {{- with .Values.apiServer.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if .Values.apiServer.ingress.className }} - ingressClassName: {{ .Values.apiServer.ingress.className }} - {{- end }} - {{- if .Values.apiServer.ingress.tls }} - tls: - {{- range .Values.apiServer.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - - host: {{ .Values.apiServer.ingress.host | quote }} - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: {{ include "bison.apiServer.fullname" . }} - port: - number: {{ .Values.apiServer.service.port }} -{{- end }} - diff --git a/deploy/charts/bison/templates/ingress.yaml b/deploy/charts/bison/templates/ingress.yaml new file mode 100644 index 0000000..9184908 --- /dev/null +++ b/deploy/charts/bison/templates/ingress.yaml @@ -0,0 +1,34 @@ +{{- if and .Values.webUI.enabled .Values.ingress.enabled }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "bison.fullname" . 
}} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls.enabled }} + tls: + - hosts: + - {{ .Values.ingress.host | quote }} + secretName: {{ .Values.ingress.tls.secretName | default (printf "%s-tls" (include "bison.fullname" .)) }} + {{- end }} + rules: + - host: {{ .Values.ingress.host | quote }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ include "bison.webUI.fullname" . }} + port: + number: {{ .Values.webUI.service.port }} +{{- end }} diff --git a/deploy/charts/bison/templates/api-server/rbac.yaml b/deploy/charts/bison/templates/rbac.yaml similarity index 100% rename from deploy/charts/bison/templates/api-server/rbac.yaml rename to deploy/charts/bison/templates/rbac.yaml diff --git a/deploy/charts/bison/templates/api-server/auth-secret.yaml b/deploy/charts/bison/templates/secret.yaml similarity index 99% rename from deploy/charts/bison/templates/api-server/auth-secret.yaml rename to deploy/charts/bison/templates/secret.yaml index 3bf3300..7bfe456 100644 --- a/deploy/charts/bison/templates/api-server/auth-secret.yaml +++ b/deploy/charts/bison/templates/secret.yaml @@ -21,4 +21,3 @@ data: {{- end }} {{- end }} {{- end }} - diff --git a/deploy/charts/bison/templates/api-server/service.yaml b/deploy/charts/bison/templates/service.yaml similarity index 50% rename from deploy/charts/bison/templates/api-server/service.yaml rename to deploy/charts/bison/templates/service.yaml index d0c9e18..ab2bcbc 100644 --- a/deploy/charts/bison/templates/api-server/service.yaml +++ b/deploy/charts/bison/templates/service.yaml @@ -18,4 +18,24 @@ spec: {{- include "bison.selectorLabels" . 
| nindent 4 }} app.kubernetes.io/component: api-server {{- end }} - +--- +{{- if .Values.webUI.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "bison.webUI.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "bison.labels" . | nindent 4 }} + app.kubernetes.io/component: web-ui +spec: + type: {{ .Values.webUI.service.type }} + ports: + - port: {{ .Values.webUI.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "bison.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: web-ui +{{- end }} diff --git a/deploy/charts/bison/templates/web-ui/deployment.yaml b/deploy/charts/bison/templates/web-deployment.yaml similarity index 93% rename from deploy/charts/bison/templates/web-ui/deployment.yaml rename to deploy/charts/bison/templates/web-deployment.yaml index a637f65..3cfb358 100644 --- a/deploy/charts/bison/templates/web-ui/deployment.yaml +++ b/deploy/charts/bison/templates/web-deployment.yaml @@ -29,7 +29,7 @@ spec: - name: web securityContext: {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ include "bison.imageRegistry" . }}{{ .Values.webUI.image.repository }}:{{ .Values.webUI.image.tag }}" + image: "{{ include "bison.image" (dict "imageConfig" .Values.webUI.image "global" .Values.global "appVersion" .Chart.AppVersion) }}" imagePullPolicy: {{ .Values.webUI.image.pullPolicy }} ports: - name: http @@ -65,4 +65,3 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - diff --git a/deploy/charts/bison/templates/web-ui/ingress.yaml b/deploy/charts/bison/templates/web-ui/ingress.yaml deleted file mode 100644 index 00c86e6..0000000 --- a/deploy/charts/bison/templates/web-ui/ingress.yaml +++ /dev/null @@ -1,40 +0,0 @@ -{{- if and .Values.webUI.enabled .Values.webUI.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "bison.webUI.fullname" . 
}} - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . | nindent 4 }} - app.kubernetes.io/component: web-ui - {{- with .Values.webUI.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if .Values.webUI.ingress.className }} - ingressClassName: {{ .Values.webUI.ingress.className }} - {{- end }} - {{- if .Values.webUI.ingress.tls }} - tls: - {{- range .Values.webUI.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - - host: {{ .Values.webUI.ingress.host | quote }} - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: {{ include "bison.webUI.fullname" . }} - port: - number: {{ .Values.webUI.service.port }} -{{- end }} - diff --git a/deploy/charts/bison/templates/web-ui/service.yaml b/deploy/charts/bison/templates/web-ui/service.yaml deleted file mode 100644 index 905e3b6..0000000 --- a/deploy/charts/bison/templates/web-ui/service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -{{- if .Values.webUI.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "bison.webUI.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "bison.labels" . | nindent 4 }} - app.kubernetes.io/component: web-ui -spec: - type: {{ .Values.webUI.service.type }} - ports: - - port: {{ .Values.webUI.service.port }} - targetPort: http - protocol: TCP - name: http - selector: - {{- include "bison.selectorLabels" . 
| nindent 4 }} - app.kubernetes.io/component: web-ui -{{- end }} - diff --git a/deploy/charts/bison/values.yaml b/deploy/charts/bison/values.yaml index 1546f16..424581d 100644 --- a/deploy/charts/bison/values.yaml +++ b/deploy/charts/bison/values.yaml @@ -2,19 +2,19 @@ # Global configuration global: - imageRegistry: "" + imageRegistry: "ghcr.io/supermarioyl" imagePullSecrets: [] # Authentication configuration auth: - enabled: false # Enable login authentication + enabled: false # Enable login authentication admin: - username: admin # Admin username - password: "" # Admin password (not recommended, use existingSecret) - existingSecret: "" # Secret containing 'password' key + username: admin # Admin username + password: "" # Admin password (not recommended, use existingSecret) + existingSecret: "" # Secret containing 'password' key jwt: - secret: "" # JWT signing secret (auto-generated if empty) - existingSecret: "" # Secret containing 'jwt-secret' key + secret: "" # JWT signing secret (auto-generated if empty) + existingSecret: "" # Secret containing 'jwt-secret' key # External dependencies # Note: Capsule and OpenCost must be installed separately before deploying Bison @@ -23,16 +23,16 @@ dependencies: capsule: # Capsule must be installed separately # helm install capsule projectcapsule/capsule -n capsule-system --create-namespace - enabled: true - + enabled: false + # OpenCost configuration opencost: # OpenCost must be installed separately # helm install opencost opencost/opencost -n opencost --create-namespace - enabled: true + enabled: false # OpenCost API URL (internal service URL) - use port 9003 for API, NOT 9090 (UI) apiUrl: "http://opencost.opencost.svc.cluster.local:9003" - + # Prometheus configuration (for node metrics, required by OpenCost) prometheus: enabled: true @@ -44,18 +44,12 @@ apiServer: enabled: true replicaCount: 2 image: - repository: ghcr.io/supermarioyl/bison/api-server - tag: 0.0.1 + repository: bison/api-server + tag: "" # Defaults to 
Chart.AppVersion if empty pullPolicy: IfNotPresent service: type: ClusterIP port: 8080 - ingress: - enabled: true - className: "" - annotations: {} - host: bison-api.example.com - tls: [] resources: limits: cpu: 1000m @@ -69,21 +63,15 @@ apiServer: # Web UI configuration webUI: - enabled: false + enabled: true replicaCount: 2 image: - repository: ghcr.io/supermarioyl/bison/web-ui - tag: 0.0.1 + repository: bison/web-ui + tag: "" # Defaults to Chart.AppVersion if empty pullPolicy: IfNotPresent service: type: ClusterIP port: 80 - ingress: - enabled: true - className: "" - annotations: {} - host: bison.example.com - tls: [] resources: limits: cpu: 500m @@ -95,6 +83,16 @@ webUI: tolerations: [] affinity: {} +# Ingress (unified entry point, web-ui nginx proxies /api to api-server) +ingress: + enabled: true + className: "" + annotations: {} + host: bison.example.com + tls: + enabled: false + secretName: "" # If empty, auto-generated as -tls + # Service account serviceAccount: create: true diff --git a/web-ui/src/App.tsx b/web-ui/src/App.tsx index 1c4d22c..fb716a3 100644 --- a/web-ui/src/App.tsx +++ b/web-ui/src/App.tsx @@ -17,8 +17,11 @@ import AuditList from './pages/Audit/AuditList'; import ReportCenter from './pages/Report/ReportCenter'; import Settings from './pages/Settings'; import ProtectedRoute from './components/ProtectedRoute'; +import { useFeatures } from './hooks/useFeatures'; const App: React.FC = () => { + const { data: features } = useFeatures(); + return ( } /> @@ -29,17 +32,23 @@ const App: React.FC = () => { }> } /> } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> + {features?.capsuleEnabled !== false && ( + <> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + + )} } /> } /> - } /> + {features?.costEnabled !== false && ( + } /> + )} } /> } /> diff --git a/web-ui/src/components/NodeOnboardingModal.tsx b/web-ui/src/components/NodeOnboardingModal.tsx new file mode 100644 index 0000000..b7c1029 --- /dev/null +++ 
b/web-ui/src/components/NodeOnboardingModal.tsx @@ -0,0 +1,183 @@ +import React, { useState } from 'react'; +import { + Modal, + Form, + Input, + InputNumber, + Select, + Alert, + Typography, +} from 'antd'; +import { useMutation } from '@tanstack/react-query'; +import { startNodeOnboarding, OnboardingRequest } from '../services/api'; + +const { TextArea } = Input; +const { Text } = Typography; + +interface NodeOnboardingModalProps { + open: boolean; + onClose: () => void; + onStarted: (jobId: string) => void; +} + +const NodeOnboardingModal: React.FC = ({ + open, + onClose, + onStarted, +}) => { + const [form] = Form.useForm(); + const [authMethod, setAuthMethod] = useState<'password' | 'privateKey'>('password'); + + const startMutation = useMutation({ + mutationFn: startNodeOnboarding, + onSuccess: (response) => { + form.resetFields(); + onStarted(response.data.id); + }, + }); + + const handleSubmit = () => { + form.validateFields().then(values => { + const request: OnboardingRequest = { + nodeIP: values.nodeIP, + sshPort: values.sshPort || 22, + sshUsername: values.sshUsername, + authMethod: values.authMethod, + password: values.authMethod === 'password' ? values.password : undefined, + privateKey: values.authMethod === 'privateKey' ? values.privateKey : undefined, + }; + startMutation.mutate(request); + }); + }; + + const handleClose = () => { + form.resetFields(); + setAuthMethod('password'); + onClose(); + }; + + // IP address validation + const validateIP = (_: unknown, value: string) => { + if (!value) { + return Promise.reject(new Error('请输入节点 IP')); + } + // Simple IP format validation + const ipRegex = /^(\d{1,3}\.){3}\d{1,3}$/; + if (!ipRegex.test(value)) { + return Promise.reject(new Error('请输入有效的 IP 地址')); + } + const parts = value.split('.').map(Number); + if (parts.some(p => p > 255)) { + return Promise.reject(new Error('请输入有效的 IP 地址')); + } + return Promise.resolve(); + }; + + return ( + + +
  • 目标节点已安装操作系统(Ubuntu/CentOS 等)
  • +
  • 目标节点已安装 kubeadm、kubelet、kubectl
  • +
  • 目标节点网络可达,支持 SSH 连接
  • + + } + type="info" + showIcon + style={{ marginBottom: 24 }} + /> + + {startMutation.isError && ( + + )} + +
    + + + + + + + + + + + + + + + + + {authMethod === 'password' ? ( + + + + ) : ( + 将私钥内容粘贴到此处} + > +