# Merge pull request #18 from last9/fix/collector-release-tag-and-publish #34
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# CI workflow: runs on pushes to main and on pull requests targeting main.
# Jobs are independent of each other (no `needs:`), so they run in parallel.
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# The workflow token only needs to read repository contents.
permissions:
  contents: read

# One active run per ref; a newer run cancels the one still in progress.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Lint, format-check, type-check and unit-test the l9gpu Python package
  # on every Python version in the matrix.
  python:
    name: Python (${{ matrix.python-version }})
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Quoted so YAML does not read 3.10 as the float 3.1.
        python-version: ["3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip
      - name: Install dev deps
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[dev,k8s]"
      - name: Lint (flake8)
        run: flake8 --per-file-ignores=l9gpu/_version.py:F401 l9gpu
      - name: Format check (black)
        run: black --check l9gpu
      - name: Type-check (mypy)
        # mypy findings are reported but do not fail the job.
        continue-on-error: true
        run: mypy l9gpu
      - name: Unit tests (pytest)
        run: pytest l9gpu/tests --ignore-glob='**/tests/*_internal.py' -n auto

  # Build, vet, test and lint each Go module; every `run` step executes
  # inside the module's own directory.
  go:
    name: Go (${{ matrix.module }})
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        module: [k8sprocessor, slurmprocessor, k8shelper, shelper]
    defaults:
      run:
        working-directory: ${{ matrix.module }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: "1.25"
          cache-dependency-path: ${{ matrix.module }}/go.sum
      - run: go mod download
      - run: go build ./...
      - run: go vet ./...
      - run: go test -race -count=1 ./...
      - name: golangci-lint
        uses: golangci/golangci-lint-action@v6
        with:
          version: v1.64
          # `defaults.run` does not apply to `uses` steps, so the module
          # directory is passed to the action explicitly.
          working-directory: ${{ matrix.module }}
          args: --timeout=5m
        # golangci-lint v1.64 ships with Go 1.24, while the upstream OTel
        # collector v0.150 dependencies require the Go 1.25 toolchain.
        # Lint still runs, but its result is non-blocking until a newer
        # golangci-lint release supports Go 1.25.
        continue-on-error: true

  # Lint the Helm chart with default and example values, render it, and
  # validate the rendered manifests.
  helm:
    name: Helm chart lint + template
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: azure/setup-helm@v4
        with:
          version: v3.14.0
      - name: helm lint (default values)
        run: helm lint deploy/helm/l9gpu
      - name: helm lint (example values)
        run: |
          for values in deploy/helm/l9gpu/examples/*.yaml; do
            echo "::group::helm lint with $values"
            helm lint deploy/helm/l9gpu -f "$values"
            echo "::endgroup::"
          done
      - name: helm template (default)
        run: helm template l9gpu deploy/helm/l9gpu > "$GITHUB_WORKSPACE/rendered.yaml"
      - name: Validate rendered manifests (kubeconform)
        # Pinned to a release tag rather than `latest` so the validator
        # cannot change between runs without a change in this file.
        uses: docker://ghcr.io/yannh/kubeconform:v0.6.7
        with:
          args: -strict -ignore-missing-schemas -summary rendered.yaml

  # Validate Grafana dashboards, YAML assets and Prometheus alert rules.
  dashboards-and-alerts:
    name: Dashboards + alert rules
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Validate Grafana dashboard JSON
        run: |
          fail=0
          # NUL-delimited list: paths containing spaces or glob characters
          # are handled as single entries instead of being word-split.
          mapfile -d '' -t dashboards < <(find dashboards/grafana -name '*.json' -print0)
          for f in "${dashboards[@]}"; do
            if ! jq empty "$f"; then
              echo "::error file=$f::invalid JSON"
              fail=1
            fi
          done
          exit $fail
      - name: Validate YAML (alerts + demo + helm examples)
        run: |
          pip install --quiet yamllint
          yamllint -d '{extends: relaxed, rules: {line-length: disable, truthy: disable}}' \
            alerts/ deploy/demo/ deploy/helm/l9gpu/examples/
      - name: Install promtool and yq
        run: |
          mkdir -p /tmp/prom && cd /tmp/prom
          # -f makes curl fail on HTTP errors instead of saving the error
          # page as the archive / binary.
          curl -fsSLo prometheus.tar.gz \
            https://github.com/prometheus/prometheus/releases/download/v2.53.0/prometheus-2.53.0.linux-amd64.tar.gz
          tar -xzf prometheus.tar.gz
          echo "/tmp/prom/prometheus-2.53.0.linux-amd64" >> "$GITHUB_PATH"
          sudo curl -fsSLo /usr/local/bin/yq \
            https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
          sudo chmod +x /usr/local/bin/yq
      - name: Validate PrometheusRule CRDs
        run: |
          fail=0
          # The parentheses make -print0 apply to both name patterns.
          mapfile -d '' -t rule_files < <(
            find alerts/prometheus \( -name '*.yaml' -o -name '*.yml' \) -print0
          )
          for f in "${rule_files[@]}"; do
            # promtool checks a plain rules file, so the `.spec` of each
            # manifest is extracted into a temporary file first.
            extracted="/tmp/$(basename "$f").rules"
            yq eval '.spec' "$f" > "$extracted"
            if ! promtool check rules "$extracted" 2>&1; then
              echo "::error file=$f::promtool check failed"
              fail=1
            fi
          done
          exit $fail

  # Build the custom collector with the OpenTelemetry Collector Builder and
  # validate the example configuration against the resulting binary.
  collector:
    name: Collector OCB build
    runs-on: ubuntu-latest
    env:
      OCB_VERSION: v0.150.0
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: "1.25"
      - name: Install OpenTelemetry Collector Builder
        env:
          OCB_VER: ${{ env.OCB_VERSION }}
        run: |
          go install "go.opentelemetry.io/collector/cmd/builder@${OCB_VER}"
          echo "$(go env GOPATH)/bin" >> "$GITHUB_PATH"
      - name: Build collector distribution
        run: builder --config deploy/collector/builder-config.yaml
      - name: Validate example config
        # Placeholder values for the environment variables set below; they
        # are only needed for the `validate` run.
        env:
          OTEL_EXPORTER_OTLP_ENDPOINT: http://localhost:4317
          OTEL_EXPORTER_OTLP_HEADERS: "Basic placeholder"
          KUBE_NODE_NAME: ci-node
        run: ./_build/l9gpu-collector validate --config=deploy/collector/config.example.yaml

  # Sanity-check the shipped systemd service and slice units.
  systemd:
    name: systemd unit syntax
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Stub l9gpu binary (systemd-analyze checks ExecStart exists)
        run: |
          sudo install -m 0755 /dev/stdin /usr/bin/l9gpu <<'EOF'
          #!/bin/sh
          exec true
          EOF
      - name: systemd-analyze verify
        run: |
          fail=0
          for f in systemd/*.service systemd/nvml/*.service; do
            [ -s "$f" ] || { echo "::error file=$f::empty unit"; fail=1; continue; }
            grep -q '^\[Service\]' "$f" || { echo "::error file=$f::missing [Service]"; fail=1; continue; }
            # The exit status is ignored on purpose; the captured output is
            # filtered below instead.
            output=$(systemd-analyze verify --recursive-errors=no "$f" 2>&1 || true)
            echo "$output"
            # Only fail on true syntax errors, not unknown-key warnings
            # from newer systemd directives the runner image doesn't know.
            if echo "$output" | grep -iE 'syntax error|bad|missing|invalid' | grep -viE 'Unknown key|ignoring'; then
              echo "::error file=$f::systemd-analyze found syntax error"
              fail=1
            fi
          done
          for f in systemd/*.slice systemd/nvml/*.slice; do
            [ -s "$f" ] || { echo "::error file=$f::empty slice"; fail=1; continue; }
            grep -q '^\[Slice\]' "$f" || { echo "::error file=$f::missing [Slice]"; fail=1; }
          done
          exit $fail

  # Build the wheel and sdist, check their metadata, and keep them as a
  # workflow artifact.
  python-wheel:
    name: Python wheel build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Build wheel + sdist
        run: |
          python -m pip install --upgrade pip build
          python -m build
      - name: Twine check
        run: |
          python -m pip install twine
          twine check dist/*
      - uses: actions/upload-artifact@v4
        with:
          name: python-dist
          path: dist/

  docker:
    name: Docker build (l9gpu)
    # Collector Dockerfile expects a pre-built binary from goreleaser
    # and is exercised by the collector-release workflow. Here we only
    # smoke-test the standalone l9gpu Python image.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      - name: Build l9gpu image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: docker/Dockerfile
          # Build only: the image is loaded into the local Docker daemon
          # and never pushed to a registry.
          push: false
          load: true
          tags: l9gpu:ci