diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000..9163171c90248 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,81 @@ +name: Bug report +description: File a bug report. If you need help, contact support instead +labels: [needs-triage, bug] +body: + - type: markdown + attributes: + value: | + Need help with your tailnet? [Contact support](https://tailscale.com/contact/support) instead. + Otherwise, please check if your bug is [already filed](https://github.com/tailscale/tailscale/issues) before filing a new one. + - type: textarea + id: what-happened + attributes: + label: What is the issue? + description: What happened? What did you expect to happen? + validations: + required: true + - type: textarea + id: steps + attributes: + label: Steps to reproduce + description: What are the steps you took that hit this issue? + validations: + required: false + - type: textarea + id: changes + attributes: + label: Are there any recent changes that introduced the issue? + description: If so, what are those changes? + validations: + required: false + - type: dropdown + id: os + attributes: + label: OS + description: What OS are you using? You may select more than one. + multiple: true + options: + - Linux + - macOS + - Windows + - iOS + - Android + - Synology + - Other + validations: + required: false + - type: input + id: os-version + attributes: + label: OS version + description: What OS version are you using? + placeholder: e.g., Debian 11.0, macOS Big Sur 11.6, Synology DSM 7 + validations: + required: false + - type: input + id: ts-version + attributes: + label: Tailscale version + description: What Tailscale version are you using? + placeholder: e.g., 1.14.4 + validations: + required: false + - type: textarea + id: other-software + attributes: + label: Other software + description: What [other software](https://github.com/tailscale/tailscale/wiki/OtherSoftwareInterop) (networking, security, etc) are you running? + validations: + required: false + - type: input + id: bug-report + attributes: + label: Bug report + description: Please run [`tailscale bugreport`](https://tailscale.com/kb/1080/cli/?q=Cli#bugreport) and share the bug identifier. The identifier is a random string which allows Tailscale support to locate your account and gives a point to focus on when looking for errors. + placeholder: e.g., BUG-1b7641a16971a9cd75822c0ed8043fee70ae88cf05c52981dc220eb96a5c49a8-20210427151443Z-fbcd4fd3a4b7ad94 + validations: + required: false + - type: markdown + attributes: + value: | + Thanks for filing a bug report! diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000..3f4a31534b7d7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: true +contact_links: + - name: Support + url: https://tailscale.com/contact/support/ + about: Contact us for support + - name: Troubleshooting + url: https://tailscale.com/kb/1023/troubleshooting + about: See the troubleshooting guide for help addressing common issues \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000000000..f7538627483ab --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,42 @@ +name: Feature request +description: Propose a new feature +title: "FR: " +labels: [needs-triage, fr] +body: + - type: markdown + attributes: + value: | + Please check if your feature request is [already filed](https://github.com/tailscale/tailscale/issues). + Tell us about your idea! + - type: textarea + id: problem + attributes: + label: What are you trying to do? + description: Tell us about the problem you're trying to solve. + validations: + required: false + - type: textarea + id: solution + attributes: + label: How should we solve this? + description: If you have an idea of how you'd like to see this feature work, let us know. + validations: + required: false + - type: textarea + id: alternative + attributes: + label: What is the impact of not solving this? + description: (How) Are you currently working around the issue? + validations: + required: false + - type: textarea + id: context + attributes: + label: Anything else? + description: Any additional context to share, e.g., links + validations: + required: false + - type: markdown + attributes: + value: | + Thanks for filing a feature request! diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..14c912905363e --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ +# Documentation for this file can be found at: +# https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/configuration-options-for-dependency-updates +version: 2 +updates: + ## Disabled between releases. We reenable it briefly after every + ## stable release, pull in all changes, and close it again so that + ## the tree remains more stable during development and the upstream + ## changes have time to soak before the next release. + # - package-ecosystem: "gomod" + # directory: "/" + # schedule: + # interval: "daily" + # commit-message: + # prefix: "go.mod:" + # open-pull-requests-limit: 100 + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + commit-message: + prefix: ".github:" diff --git a/.github/licenses.tmpl b/.github/licenses.tmpl new file mode 100644 index 0000000000000..5fa7e8e81a2c2 --- /dev/null +++ b/.github/licenses.tmpl @@ -0,0 +1,17 @@ +# Tailscale CLI and daemon dependencies + +The following open source dependencies are used to build the [tailscale][] and +[tailscaled][] commands. These are primarily used on Linux and BSD variants as +well as an [option for macOS][]. + +[tailscale]: https://pkg.go.dev/tailscale.com/cmd/tailscale +[tailscaled]: https://pkg.go.dev/tailscale.com/cmd/tailscaled +[option for macOS]: https://tailscale.com/kb/1065/macos-variants/ + +## Go Packages + +Some packages may only be included on certain architectures or operating systems. + +{{ range . }} + - [{{.Name}}](https://pkg.go.dev/{{.Name}}) ([{{.LicenseName}}]({{.LicenseURL}})) +{{- end }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4f6068e6e33cd..ded7873aac8a1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -58,7 +58,7 @@ jobs: # See if the cache entry already exists to avoid downloading it # and doing the cache write again. - id: check-cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache # relative to workspace; see env note at top of file key: ${{ steps.hash.outputs.key }} @@ -70,7 +70,7 @@ jobs: run: go mod download - name: Cache Go modules if: steps.check-cache.outputs.cache-hit != 'true' - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache # relative to workspace; see env note at top of file key: ${{ steps.hash.outputs.key }} @@ -93,7 +93,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -131,14 +131,14 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} enableCrossOsArchive: true - name: Restore Cache id: restore-cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only restoring the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -209,7 +209,7 @@ jobs: - name: Save Cache # Save cache even on failure, but only on cache miss and main branch to avoid thrashing. if: always() && steps.restore-cache.outputs.cache-hit != 'true' && github.ref == 'refs/heads/main' - uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only saving the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -245,7 +245,7 @@ jobs: path: ${{ github.workspace }}/src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -270,10 +270,7 @@ jobs: - name: bench all if: matrix.key == 'win-bench' working-directory: src - # Don't use -bench=. -benchtime=1x. - # Somewhere in the layers (powershell?) - # the equals signs cause great confusion. - run: ./tool/go test ./... -bench . -benchtime 1x -run "^$" + run: ./tool/go test ./... -bench=. -benchtime=1x -run="^$" env: NOPWSHDEBUG: "true" # to quiet tool/gocross/gocross-wrapper.ps1 in CI @@ -294,14 +291,14 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} enableCrossOsArchive: true - name: Restore Cache id: restore-cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: ~/Library/Caches/go-build key: ${{ runner.os }}-go-test-${{ hashFiles('**/go.sum') }}-${{ github.job }}-${{ github.run_id }} @@ -337,7 +334,7 @@ jobs: - name: Save Cache # Save cache even on failure, but only on cache miss and main branch to avoid thrashing. if: always() && steps.restore-cache.outputs.cache-hit != 'true' && github.ref == 'refs/heads/main' - uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: ~/Library/Caches/go-build key: ${{ runner.os }}-go-test-${{ hashFiles('**/go.sum') }}-${{ github.job }}-${{ github.run_id }} @@ -354,7 +351,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -364,7 +361,7 @@ jobs: run: chown -R $(id -u):$(id -g) $PWD - name: privileged tests working-directory: src - run: ./tool/go test ./util/linuxfw ./derp/xdp + run: ./tool/go test $(./tool/go run ./tool/listpkgs --has-root-tests) vm: needs: gomod-cache @@ -377,7 +374,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -433,14 +430,14 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} enableCrossOsArchive: true - name: Restore Cache id: restore-cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only restoring the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -475,7 +472,7 @@ jobs: - name: Save Cache # Save cache even on failure, but only on cache miss and main branch to avoid thrashing. if: always() && steps.restore-cache.outputs.cache-hit != 'true' && github.ref == 'refs/heads/main' - uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only saving the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -494,7 +491,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -532,14 +529,14 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} enableCrossOsArchive: true - name: Restore Cache id: restore-cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only restoring the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -567,7 +564,7 @@ jobs: - name: Save Cache # Save cache even on failure, but only on cache miss and main branch to avoid thrashing. if: always() && steps.restore-cache.outputs.cache-hit != 'true' && github.ref == 'refs/heads/main' - uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only saving the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -592,7 +589,7 @@ jobs: # some Android breakages early. # TODO(bradfitz): better; see https://github.com/tailscale/tailscale/issues/4482 - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -613,14 +610,14 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} enableCrossOsArchive: true - name: Restore Cache id: restore-cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only restoring the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -653,7 +650,7 @@ jobs: - name: Save Cache # Save cache even on failure, but only on cache miss and main branch to avoid thrashing. if: always() && steps.restore-cache.outputs.cache-hit != 'true' && github.ref == 'refs/heads/main' - uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: # Note: this is only saving the build cache. Mod cache is shared amongst # all jobs in the workflow. @@ -671,7 +668,7 @@ jobs: - name: Set GOMODCACHE env run: echo "GOMODCACHE=$HOME/.cache/go-mod" >> $GITHUB_ENV - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -741,7 +738,7 @@ jobs: run: | echo "artifacts_path=$(realpath .)" >> $GITHUB_ENV - name: upload crash - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 if: steps.run.outcome != 'success' && steps.build.outcome == 'success' with: name: artifacts @@ -758,7 +755,7 @@ jobs: - name: Set GOMODCACHE env run: echo "GOMODCACHE=$HOME/.cache/go-mod" >> $GITHUB_ENV - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -776,7 +773,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -790,6 +787,14 @@ jobs: echo echo git diff --name-only --exit-code || (echo "The files above need updating. Please run 'go generate'."; exit 1) + - name: check that 'genreadme' is clean + working-directory: src + run: | + ./tool/go run ./misc/genreadme + git add -N . # ensure untracked files are noticed + echo + echo + git diff --name-only --exit-code || (echo "The files above need updating. Please run './tool/go run ./misc/genreadme'."; exit 1) make_tidy: runs-on: ubuntu-24.04 @@ -800,7 +805,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -822,7 +827,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} @@ -876,7 +881,7 @@ jobs: with: path: src - name: Restore Go module cache - uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: gomodcache key: ${{ needs.gomod-cache.outputs.cache-key }} diff --git a/VERSION.txt b/VERSION.txt index 27a0f1d276599..c67d041e14bb0 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.96.5 +1.98.5 diff --git a/cmd/containerboot/kube.go b/cmd/containerboot/kube.go index 73f5819b406db..3e97710da6c92 100644 --- a/cmd/containerboot/kube.go +++ b/cmd/containerboot/kube.go @@ -21,6 +21,7 @@ import ( "github.com/fsnotify/fsnotify" "tailscale.com/client/local" "tailscale.com/ipn" + "tailscale.com/kube/authkey" "tailscale.com/kube/egressservices" "tailscale.com/kube/ingressservices" "tailscale.com/kube/kubeapi" @@ -32,7 +33,6 @@ import ( ) const fieldManager = "tailscale-container" -const kubeletMountedConfigLn = "..data" // kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use // this rather than any of the upstream Kubernetes client libaries to avoid extra imports. @@ -127,6 +127,9 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error { // resetContainerbootState resets state from previous runs of containerboot to // ensure the operator doesn't use stale state when a Pod is first recreated. +// +// Device identity keys (device_id, device_fqdn, device_ips) are preserved so +// the operator can clean up the old device from the control plane. func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string, tailscaledConfigAuthkey string) error { existingSecret, err := kc.GetSecret(ctx, kc.stateSecret) switch { @@ -139,12 +142,7 @@ func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string s := &kubeapi.Secret{ Data: map[string][]byte{ - kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), - - // TODO(tomhjp): Perhaps shouldn't clear device ID and use a different signal, as this could leak tailnet devices. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -169,47 +167,18 @@ func (kc *kubeClient) setAndWaitForAuthKeyReissue(ctx context.Context, client *l return fmt.Errorf("error disconnecting from control: %w", err) } - err = kc.setReissueAuthKey(ctx, tailscaledConfigAuthKey) + err = authkey.SetReissueAuthKey(ctx, kc.Client, kc.stateSecret, tailscaledConfigAuthKey, authkey.TailscaleContainerFieldManager) if err != nil { return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret: %w", err) } - err = kc.waitForAuthKeyReissue(ctx, cfg.TailscaledConfigFilePath, tailscaledConfigAuthKey, 10*time.Minute) - if err != nil { - return fmt.Errorf("failed to receive new auth key: %w", err) - } - - return nil -} - -func (kc *kubeClient) setReissueAuthKey(ctx context.Context, authKey string) error { - s := &kubeapi.Secret{ - Data: map[string][]byte{ - kubetypes.KeyReissueAuthkey: []byte(authKey), - }, + clearFn := func(ctx context.Context) error { + return authkey.ClearReissueAuthKey(ctx, kc.Client, kc.stateSecret, authkey.TailscaleContainerFieldManager) } - log.Printf("Requesting a new auth key from operator") - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) -} - -func (kc *kubeClient) waitForAuthKeyReissue(ctx context.Context, configPath string, oldAuthKey string, maxWait time.Duration) error { - log.Printf("Waiting for operator to provide new auth key (max wait: %v)", maxWait) - - ctx, cancel := context.WithTimeout(ctx, maxWait) - defer cancel() - - tailscaledCfgDir := filepath.Dir(configPath) - toWatch := filepath.Join(tailscaledCfgDir, kubeletMountedConfigLn) - - var ( - pollTicker <-chan time.Time - eventChan <-chan fsnotify.Event - ) - - pollInterval := 5 * time.Second - - // Try to use fsnotify for faster notification + getAuthKey := func() string { return authkey.AuthKeyFromConfig(cfg.TailscaledConfigFilePath) } + tailscaledCfgDir := filepath.Dir(cfg.TailscaledConfigFilePath) + var notify <-chan struct{} if w, err := fsnotify.NewWatcher(); err != nil { log.Printf("auth key reissue: fsnotify unavailable, using polling: %v", err) } else if err := w.Add(tailscaledCfgDir); err != nil { @@ -217,54 +186,28 @@ func (kc *kubeClient) waitForAuthKeyReissue(ctx context.Context, configPath stri log.Printf("auth key reissue: fsnotify watch failed, using polling: %v", err) } else { defer w.Close() + ch := make(chan struct{}, 1) + toWatch := filepath.Join(tailscaledCfgDir, "..data") + go func() { + for ev := range w.Events { + if ev.Name == toWatch { + select { + case ch <- struct{}{}: + default: + } + } + } + }() + notify = ch log.Printf("auth key reissue: watching for config changes via fsnotify") - eventChan = w.Events } - // still keep polling if using fsnotify, for logging and in case fsnotify fails - pt := time.NewTicker(pollInterval) - defer pt.Stop() - pollTicker = pt.C - - start := time.Now() - - for { - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for auth key reissue after %v", maxWait) - case <-pollTicker: // Waits for polling tick, continues when received - case event := <-eventChan: - if event.Name != toWatch { - continue - } - } - - newAuthKey := authkeyFromTailscaledConfig(configPath) - if newAuthKey != "" && newAuthKey != oldAuthKey { - log.Printf("New auth key received from operator after %v", time.Since(start).Round(time.Second)) - - if err := kc.clearReissueAuthKeyRequest(ctx); err != nil { - log.Printf("Warning: failed to clear reissue request: %v", err) - } - - return nil - } - - if eventChan == nil && pollTicker != nil { - log.Printf("Waiting for new auth key from operator (%v elapsed)", time.Since(start).Round(time.Second)) - } + err = authkey.WaitForAuthKeyReissue(ctx, tailscaledConfigAuthKey, 10*time.Minute, getAuthKey, clearFn, notify) + if err != nil { + return fmt.Errorf("failed to receive new auth key: %w", err) } -} -// clearReissueAuthKeyRequest removes the reissue_authkey marker from the Secret -// to signal to the operator that we've successfully received the new key. -func (kc *kubeClient) clearReissueAuthKeyRequest(ctx context.Context) error { - s := &kubeapi.Secret{ - Data: map[string][]byte{ - kubetypes.KeyReissueAuthkey: nil, - }, - } - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) + return nil } // waitForConsistentState waits for tailscaled to finish writing state if it diff --git a/cmd/containerboot/kube_test.go b/cmd/containerboot/kube_test.go index 6acaa60e1588e..fec0b74f7d8f1 100644 --- a/cmd/containerboot/kube_test.go +++ b/cmd/containerboot/kube_test.go @@ -31,7 +31,7 @@ func TestSetupKube(t *testing.T) { kc *kubeClient }{ { - name: "TS_AUTHKEY set, state Secret exists", + name: "authkey-set-secret-exists", cfg: &settings{ AuthKey: "foo", KubeSecret: "foo", @@ -50,7 +50,7 @@ func TestSetupKube(t *testing.T) { }, }, { - name: "TS_AUTHKEY set, state Secret does not exist, we have permissions to create it", + name: "authkey-set-secret-missing-can-create", cfg: &settings{ AuthKey: "foo", KubeSecret: "foo", @@ -69,7 +69,7 @@ func TestSetupKube(t *testing.T) { }, }, { - name: "TS_AUTHKEY set, state Secret does not exist, we do not have permissions to create it", + name: "authkey-set-secret-missing-cannot-create", cfg: &settings{ AuthKey: "foo", KubeSecret: "foo", @@ -89,7 +89,7 @@ func TestSetupKube(t *testing.T) { wantErr: true, }, { - name: "TS_AUTHKEY set, we encounter a non-404 error when trying to retrieve the state Secret", + name: "authkey-set-get-secret-non-404-error", cfg: &settings{ AuthKey: "foo", KubeSecret: "foo", @@ -109,7 +109,7 @@ func TestSetupKube(t *testing.T) { wantErr: true, }, { - name: "TS_AUTHKEY set, we encounter a non-404 error when trying to check Secret permissions", + name: "authkey-set-check-perms-error", cfg: &settings{ AuthKey: "foo", KubeSecret: "foo", @@ -127,7 +127,7 @@ func TestSetupKube(t *testing.T) { }, { // Interactive login using URL in Pod logs - name: "TS_AUTHKEY not set, state Secret does not exist, we have permissions to create it", + name: "no-authkey-secret-missing-can-create", cfg: &settings{ KubeSecret: "foo", }, @@ -145,7 +145,7 @@ func TestSetupKube(t *testing.T) { }, { // Interactive login using URL in Pod logs - name: "TS_AUTHKEY not set, state Secret exists, but does not contain auth key", + name: "no-authkey-secret-exists-no-key", cfg: &settings{ KubeSecret: "foo", }, @@ -162,7 +162,7 @@ func TestSetupKube(t *testing.T) { }}, }, { - name: "TS_AUTHKEY not set, state Secret contains auth key, we do not have RBAC to patch it", + name: "no-authkey-secret-has-key-cannot-patch", cfg: &settings{ KubeSecret: "foo", }, @@ -180,7 +180,7 @@ func TestSetupKube(t *testing.T) { wantErr: true, }, { - name: "TS_AUTHKEY not set, state Secret contains auth key, we have RBAC to patch it", + name: "no-authkey-secret-has-key-can-patch", cfg: &settings{ KubeSecret: "foo", }, @@ -257,12 +257,8 @@ func TestResetContainerbootState(t *testing.T) { authkey: "new-authkey", initial: map[string][]byte{}, expected: map[string][]byte{ - kubetypes.KeyCapVer: capver, - kubetypes.KeyPodUID: []byte("1234"), - // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyCapVer: capver, + kubetypes.KeyPodUID: []byte("1234"), kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -271,11 +267,7 @@ func TestResetContainerbootState(t *testing.T) { "empty_initial_no_pod_uid": { initial: map[string][]byte{}, expected: map[string][]byte{ - kubetypes.KeyCapVer: capver, - // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyCapVer: capver, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -303,9 +295,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, kubetypes.KeyPodUID: []byte("1234"), // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -321,9 +310,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, kubetypes.KeyReissueAuthkey: nil, // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -338,9 +324,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, // reissue_authkey not cleared. // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -355,9 +338,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, // reissue_authkey not cleared. // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index c020ab0a94402..1a11c3150cdd2 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -136,11 +136,12 @@ import ( "time" "golang.org/x/sys/unix" - "tailscale.com/client/tailscale" + + "tailscale.com/client/local" "tailscale.com/health" "tailscale.com/ipn" - "tailscale.com/ipn/conffile" kubeutils "tailscale.com/k8s-operator" + "tailscale.com/kube/authkey" healthz "tailscale.com/kube/health" "tailscale.com/kube/kubetypes" klc "tailscale.com/kube/localclient" @@ -149,7 +150,6 @@ import ( "tailscale.com/tailcfg" "tailscale.com/types/logger" "tailscale.com/types/netmap" - "tailscale.com/types/ptr" "tailscale.com/util/deephash" "tailscale.com/util/dnsname" "tailscale.com/util/linuxfw" @@ -174,7 +174,6 @@ func main() { func run() error { log.SetPrefix("boot: ") - tailscale.I_Acknowledge_This_API_Is_Unstable = true cfg, err := configFromEnv() if err != nil { @@ -211,7 +210,7 @@ func run() error { var tailscaledConfigAuthkey string if isOneStepConfig(cfg) { - tailscaledConfigAuthkey = authkeyFromTailscaledConfig(cfg.TailscaledConfigFilePath) + tailscaledConfigAuthkey = authkey.AuthKeyFromConfig(cfg.TailscaledConfigFilePath) } var kc *kubeClient @@ -307,7 +306,7 @@ func run() error { } } - w, err := client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialPrefs|ipn.NotifyInitialState|ipn.NotifyInitialHealthState) + w, err := client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialPrefs|ipn.NotifyInitialState|ipn.NotifyInitialHealthState|ipn.NotifyRateLimit) if err != nil { return fmt.Errorf("failed to watch tailscaled for updates: %w", err) } @@ -347,7 +346,7 @@ func run() error { if err := tailscaleUp(bootCtx, cfg); err != nil { return fmt.Errorf("failed to auth tailscale: %w", err) } - w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState) + w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState|ipn.NotifyRateLimit) if err != nil { return fmt.Errorf("rewatching tailscaled for updates after auth: %w", err) } @@ -376,7 +375,7 @@ authLoop: if hasKubeStateStore(cfg) { log.Printf("Auth key missing or invalid (NeedsLogin state), disconnecting from control and requesting new key from operator") - err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey) + err := kc.setAndWaitForAuthKeyReissue(ctx, client, cfg, tailscaledConfigAuthkey) if err != nil { return fmt.Errorf("failed to get a reissued authkey: %w", err) } @@ -416,7 +415,7 @@ authLoop: if isOneStepConfig(cfg) && hasKubeStateStore(cfg) { log.Printf("Auth key failed to authenticate (may be expired or single-use), disconnecting from control and requesting new key from operator") - err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey) + err := kc.setAndWaitForAuthKeyReissue(ctx, client, cfg, tailscaledConfigAuthkey) if err != nil { return fmt.Errorf("failed to get a reissued authkey: %w", err) } @@ -459,7 +458,7 @@ authLoop: } } - w, err = client.WatchIPNBus(ctx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState) + w, err = client.WatchIPNBus(ctx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState|ipn.NotifyRateLimit) if err != nil { return fmt.Errorf("rewatching tailscaled for updates after auth: %w", err) } @@ -538,7 +537,7 @@ authLoop: failedResolveAttempts++ } - var egressSvcsNotify chan ipn.Notify + var egressSvcsNotify chan *netmap.NetworkMap notifyChan := make(chan ipn.Notify) errChan := make(chan error) go func() { @@ -552,10 +551,17 @@ authLoop: } } }() + // Peer set changes (Add/Remove) no longer ride on the IPN bus; poll + // periodically so egress FQDN resolution and peer-aware work picks + // them up. SelfChange covers prompt self changes. + const peerPollInterval = 15 * time.Second + peerPoll := time.NewTicker(peerPollInterval) + defer peerPoll.Stop() var wg sync.WaitGroup runLoop: for { + var processNetmap bool select { case <-ctx.Done(): // Although killTailscaled() is deferred earlier, if we @@ -568,6 +574,8 @@ runLoop: return fmt.Errorf("failed to read from tailscaled: %w", err) case err := <-cfgWatchErrChan: return fmt.Errorf("failed to watch tailscaled config: %w", err) + case <-peerPoll.C: + processNetmap = true case n := <-notifyChan: // TODO: (ChaosInTheCRD) Add node removed check when supported by ipn if n.State != nil && *n.State != ipn.Running { @@ -578,258 +586,270 @@ runLoop: // whereupon we'll go through initial auth again. return fmt.Errorf("tailscaled left running state (now in state %q), exiting", *n.State) } - if n.NetMap != nil { - addrs = n.NetMap.SelfNode.Addresses().AsSlice() - newCurrentIPs := deephash.Hash(&addrs) - ipsHaveChanged := newCurrentIPs != currentIPs - - // Store device ID in a Kubernetes Secret before - // setting up any routing rules. This ensures - // that, for containerboot instances that are - // Kubernetes operator proxies, the operator is - // able to retrieve the device ID from the - // Kubernetes Secret to clean up tailnet nodes - // for proxies whose route setup continuously - // fails. - deviceID := n.NetMap.SelfNode.StableID() - if hasKubeStateStore(cfg) && deephash.Update(¤tDeviceID, &deviceID) { - if err := kc.storeDeviceID(ctx, n.NetMap.SelfNode.StableID()); err != nil { - return fmt.Errorf("storing device ID in Kubernetes Secret: %w", err) - } + if n.SelfChange != nil { + processNetmap = true + } + case <-tc: + newBackendAddrs, err := resolveDNS(ctx, cfg.ProxyTargetDNSName) + if err != nil { + log.Printf("[unexpected] error resolving DNS name %s: %v", cfg.ProxyTargetDNSName, err) + resetTimer(true) + continue + } + backendsHaveChanged := !(slices.EqualFunc(backendAddrs, newBackendAddrs, func(ip1 net.IP, ip2 net.IP) bool { + return slices.ContainsFunc(newBackendAddrs, func(ip net.IP) bool { return ip.Equal(ip1) }) + })) + if backendsHaveChanged && len(addrs) != 0 { + log.Printf("Backend address change detected, installing proxy rules for backends %v", newBackendAddrs) + if err := installIngressForwardingRuleForDNSTarget(ctx, newBackendAddrs, addrs, nfr); err != nil { + return fmt.Errorf("installing ingress proxy rules for DNS target %s: %v", cfg.ProxyTargetDNSName, err) + } + } + backendAddrs = newBackendAddrs + resetTimer(false) + continue + case e := <-egressSvcsErrorChan: + return fmt.Errorf("egress proxy failed: %v", e) + case e := <-ingressSvcsErrorChan: + return fmt.Errorf("ingress proxy failed: %v", e) + } + if !processNetmap { + continue + } + nm, err := fetchNetMap(ctx, client) + if err != nil { + log.Printf("error fetching netmap: %v", err) + continue + } + if nm != nil { + addrs = nm.SelfNode.Addresses().AsSlice() + newCurrentIPs := deephash.Hash(&addrs) + ipsHaveChanged := newCurrentIPs != currentIPs + + // Store device ID in a Kubernetes Secret before + // setting up any routing rules. This ensures + // that, for containerboot instances that are + // Kubernetes operator proxies, the operator is + // able to retrieve the device ID from the + // Kubernetes Secret to clean up tailnet nodes + // for proxies whose route setup continuously + // fails. + deviceID := nm.SelfNode.StableID() + if hasKubeStateStore(cfg) && deephash.Update(¤tDeviceID, &deviceID) { + if err := kc.storeDeviceID(ctx, nm.SelfNode.StableID()); err != nil { + return fmt.Errorf("storing device ID in Kubernetes Secret: %w", err) + } + } + if cfg.TailnetTargetFQDN != "" { + egressAddrs, err := resolveTailnetFQDN(nm, cfg.TailnetTargetFQDN) + if err != nil { + log.Print(err.Error()) + break } - if cfg.TailnetTargetFQDN != "" { - egressAddrs, err := resolveTailnetFQDN(n.NetMap, cfg.TailnetTargetFQDN) - if err != nil { - log.Print(err.Error()) - break - } - newCurentEgressIPs := deephash.Hash(&egressAddrs) - egressIPsHaveChanged := newCurentEgressIPs != currentEgressIPs - // The firewall rules get (re-)installed: - // - on startup - // - when the tailnet IPs of the tailnet target have changed - // - when the tailnet IPs of this node have changed - if (egressIPsHaveChanged || ipsHaveChanged) && len(egressAddrs) != 0 { - var rulesInstalled bool - for _, egressAddr := range egressAddrs { - ea := egressAddr.Addr() - if ea.Is4() || (ea.Is6() && nfr.HasIPV6NAT()) { - rulesInstalled = true - log.Printf("Installing forwarding rules for destination %v", ea.String()) - if err := installEgressForwardingRule(ctx, ea.String(), addrs, nfr); err != nil { - return fmt.Errorf("installing egress proxy rules for destination %s: %v", ea.String(), err) - } + newCurentEgressIPs := deephash.Hash(&egressAddrs) + egressIPsHaveChanged := newCurentEgressIPs != currentEgressIPs + // The firewall rules get (re-)installed: + // - on startup + // - when the tailnet IPs of the tailnet target have changed + // - when the tailnet IPs of this node have changed + if (egressIPsHaveChanged || ipsHaveChanged) && len(egressAddrs) != 0 { + var rulesInstalled bool + for _, egressAddr := range egressAddrs { + ea := egressAddr.Addr() + if ea.Is4() || (ea.Is6() && nfr.HasIPV6NAT()) { + rulesInstalled = true + log.Printf("Installing forwarding rules for destination %v", ea.String()) + if err := installEgressForwardingRule(ctx, ea.String(), addrs, nfr); err != nil { + return fmt.Errorf("installing egress proxy rules for destination %s: %v", ea.String(), err) } } - if !rulesInstalled { - return fmt.Errorf("no forwarding rules for egress addresses %v, host supports IPv6: %v", egressAddrs, nfr.HasIPV6NAT()) - } } - currentEgressIPs = newCurentEgressIPs - } - if cfg.ProxyTargetIP != "" && len(addrs) != 0 && ipsHaveChanged { - log.Printf("Installing proxy rules") - if err := installIngressForwardingRule(ctx, cfg.ProxyTargetIP, addrs, nfr); err != nil { - return fmt.Errorf("installing ingress proxy rules: %w", err) + if !rulesInstalled { + return fmt.Errorf("no forwarding rules for egress addresses %v, host supports IPv6: %v", egressAddrs, nfr.HasIPV6NAT()) } } - if cfg.ProxyTargetDNSName != "" && len(addrs) != 0 && ipsHaveChanged { - newBackendAddrs, err := resolveDNS(ctx, cfg.ProxyTargetDNSName) - if err != nil { - log.Printf("[unexpected] error resolving DNS name %s: %v", cfg.ProxyTargetDNSName, err) - resetTimer(true) - continue - } - backendsHaveChanged := !(slices.EqualFunc(backendAddrs, newBackendAddrs, func(ip1 net.IP, ip2 net.IP) bool { - return slices.ContainsFunc(newBackendAddrs, func(ip net.IP) bool { return ip.Equal(ip1) }) - })) - if backendsHaveChanged { - log.Printf("installing ingress proxy rules for backends %v", newBackendAddrs) - if err := installIngressForwardingRuleForDNSTarget(ctx, newBackendAddrs, addrs, nfr); err != nil { - return fmt.Errorf("error installing ingress proxy rules: %w", err) - } - } - resetTimer(false) - backendAddrs = newBackendAddrs + currentEgressIPs = newCurentEgressIPs + } + if cfg.ProxyTargetIP != "" && len(addrs) != 0 && ipsHaveChanged { + log.Printf("Installing proxy rules") + if err := installIngressForwardingRule(ctx, cfg.ProxyTargetIP, addrs, nfr); err != nil { + return fmt.Errorf("installing ingress proxy rules: %w", err) } - if cfg.ServeConfigPath != "" { - cd := certDomainFromNetmap(n.NetMap) - if cd == "" { - cd = kubetypes.ValueNoHTTPS - } - prev := certDomain.Swap(ptr.To(cd)) - if prev == nil || *prev != cd { - select { - case certDomainChanged <- true: - default: - } - } + } + if cfg.ProxyTargetDNSName != "" && len(addrs) != 0 && ipsHaveChanged { + newBackendAddrs, err := resolveDNS(ctx, cfg.ProxyTargetDNSName) + if err != nil { + log.Printf("[unexpected] error resolving DNS name %s: %v", cfg.ProxyTargetDNSName, err) + resetTimer(true) + continue } - if cfg.TailnetTargetIP != "" && ipsHaveChanged && len(addrs) != 0 { - log.Printf("Installing forwarding rules for destination %v", cfg.TailnetTargetIP) - if err := installEgressForwardingRule(ctx, cfg.TailnetTargetIP, addrs, nfr); err != nil { - return fmt.Errorf("installing egress proxy rules: %w", err) + backendsHaveChanged := !(slices.EqualFunc(backendAddrs, newBackendAddrs, func(ip1 net.IP, ip2 net.IP) bool { + return slices.ContainsFunc(newBackendAddrs, func(ip net.IP) bool { return ip.Equal(ip1) }) + })) + if backendsHaveChanged { + log.Printf("installing ingress proxy rules for backends %v", newBackendAddrs) + if err := installIngressForwardingRuleForDNSTarget(ctx, newBackendAddrs, addrs, nfr); err != nil { + return fmt.Errorf("error installing ingress proxy rules: %w", err) } } - // If this is a L7 cluster ingress proxy (set up - // by Kubernetes operator) and proxying of - // cluster traffic to the ingress target is - // enabled, set up proxy rule each time the - // tailnet IPs of this node change (including - // the first time they become available). - if cfg.AllowProxyingClusterTrafficViaIngress && cfg.ServeConfigPath != "" && ipsHaveChanged && len(addrs) != 0 { - log.Printf("installing rules to forward traffic for %s to node's tailnet IP", cfg.PodIP) - if err := installTSForwardingRuleForDestination(ctx, cfg.PodIP, addrs, nfr); err != nil { - return fmt.Errorf("installing rules to forward traffic to node's tailnet IP: %w", err) - } + resetTimer(false) + backendAddrs = newBackendAddrs + } + if cfg.ServeConfigPath != "" { + cd := certDomainFromNetmap(nm) + if cd == "" { + cd = kubetypes.ValueNoHTTPS } - currentIPs = newCurrentIPs - - // Only store device FQDN and IP addresses to - // Kubernetes Secret when any required proxy - // route setup has succeeded. IPs and FQDN are - // read from the Secret by the Tailscale - // Kubernetes operator and, for some proxy - // types, such as Tailscale Ingress, advertized - // on the Ingress status. Writing them to the - // Secret only after the proxy routing has been - // set up ensures that the operator does not - // advertize endpoints of broken proxies. - // TODO (irbekrm): instead of using the IP and FQDN, have some other mechanism for the proxy signal that it is 'Ready'. - deviceEndpoints := []any{n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses()} - if hasKubeStateStore(cfg) && deephash.Update(¤tDeviceEndpoints, &deviceEndpoints) { - if err := kc.storeDeviceEndpoints(ctx, n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses().AsSlice()); err != nil { - return fmt.Errorf("storing device IPs and FQDN in Kubernetes Secret: %w", err) + prev := certDomain.Swap(new(cd)) + if prev == nil || *prev != cd { + select { + case certDomainChanged <- true: + default: } } - - if healthCheck != nil { - healthCheck.Update(len(addrs) != 0) + } + if cfg.TailnetTargetIP != "" && ipsHaveChanged && len(addrs) != 0 { + log.Printf("Installing forwarding rules for destination %v", cfg.TailnetTargetIP) + if err := installEgressForwardingRule(ctx, cfg.TailnetTargetIP, addrs, nfr); err != nil { + return fmt.Errorf("installing egress proxy rules: %w", err) + } + } + // If this is a L7 cluster ingress proxy (set up + // by Kubernetes operator) and proxying of + // cluster traffic to the ingress target is + // enabled, set up proxy rule each time the + // tailnet IPs of this node change (including + // the first time they become available). + if cfg.AllowProxyingClusterTrafficViaIngress && cfg.ServeConfigPath != "" && ipsHaveChanged && len(addrs) != 0 { + log.Printf("installing rules to forward traffic for %s to node's tailnet IP", cfg.PodIP) + if err := installTSForwardingRuleForDestination(ctx, cfg.PodIP, addrs, nfr); err != nil { + return fmt.Errorf("installing rules to forward traffic to node's tailnet IP: %w", err) + } + } + currentIPs = newCurrentIPs + + // Only store device FQDN and IP addresses to + // Kubernetes Secret when any required proxy + // route setup has succeeded. IPs and FQDN are + // read from the Secret by the Tailscale + // Kubernetes operator and, for some proxy + // types, such as Tailscale Ingress, advertized + // on the Ingress status. Writing them to the + // Secret only after the proxy routing has been + // set up ensures that the operator does not + // advertize endpoints of broken proxies. + // TODO (irbekrm): instead of using the IP and FQDN, have some other mechanism for the proxy signal that it is 'Ready'. + deviceEndpoints := []any{nm.SelfNode.Name(), nm.SelfNode.Addresses()} + if hasKubeStateStore(cfg) && deephash.Update(¤tDeviceEndpoints, &deviceEndpoints) { + if err := kc.storeDeviceEndpoints(ctx, nm.SelfNode.Name(), nm.SelfNode.Addresses().AsSlice()); err != nil { + return fmt.Errorf("storing device IPs and FQDN in Kubernetes Secret: %w", err) } + } - var prevServeConfig *ipn.ServeConfig - if getAutoAdvertiseBool() { - prevServeConfig, err = client.GetServeConfig(ctx) - if err != nil { - return fmt.Errorf("autoadvertisement: failed to get serve config: %w", err) - } + if healthCheck != nil { + healthCheck.Update(len(addrs) != 0) + } - err = refreshAdvertiseServices(ctx, prevServeConfig, klc.New(client)) - if err != nil { - return fmt.Errorf("autoadvertisement: failed to refresh advertise services: %w", err) - } + var prevServeConfig *ipn.ServeConfig + if getAutoAdvertiseBool() { + prevServeConfig, err = client.GetServeConfig(ctx) + if err != nil { + return fmt.Errorf("autoadvertisement: failed to get serve config: %w", err) } - if cfg.ServeConfigPath != "" { - triggerWatchServeConfigChanges.Do(func() { - go watchServeConfigChanges(ctx, certDomainChanged, certDomain, client, kc, cfg, prevServeConfig) - }) + err = refreshAdvertiseServices(ctx, prevServeConfig, klc.New(client)) + if err != nil { + return fmt.Errorf("autoadvertisement: failed to refresh advertise services: %w", err) } + } - if egressSvcsNotify != nil { - egressSvcsNotify <- n - } + if cfg.ServeConfigPath != "" { + triggerWatchServeConfigChanges.Do(func() { + go watchServeConfigChanges(ctx, certDomainChanged, certDomain, client, kc, cfg, prevServeConfig) + }) } - if !startupTasksDone { - // For containerboot instances that act as TCP proxies (proxying traffic to an endpoint - // passed via one of the env vars that containerboot reads) and store state in a - // Kubernetes Secret, we consider startup tasks done at the point when device info has - // been successfully stored to state Secret. For all other containerboot instances, if - // we just get to this point the startup tasks can be considered done. - if !isL3Proxy(cfg) || !hasKubeStateStore(cfg) || (currentDeviceEndpoints != deephash.Sum{} && currentDeviceID != deephash.Sum{}) { - // This log message is used in tests to detect when all - // post-auth configuration is done. - log.Println("Startup complete, waiting for shutdown signal") - startupTasksDone = true - - // Configure egress proxy. Egress proxy will set up firewall rules to proxy - // traffic to tailnet targets configured in the provided configuration file. It - // will then continuously monitor the config file and netmap updates and - // reconfigure the firewall rules as needed. If any of its operations fail, it - // will crash this node. - if cfg.EgressProxiesCfgPath != "" { - log.Printf("configuring egress proxy using configuration file at %s", cfg.EgressProxiesCfgPath) - egressSvcsNotify = make(chan ipn.Notify) - opts := egressProxyRunOpts{ - cfgPath: cfg.EgressProxiesCfgPath, - nfr: nfr, - kc: kc, - tsClient: client, - stateSecret: cfg.KubeSecret, - netmapChan: egressSvcsNotify, - podIPv4: cfg.PodIPv4, - tailnetAddrs: addrs, - } - go func() { - if err := ep.run(ctx, n, opts); err != nil { - egressSvcsErrorChan <- err - } - }() + + if egressSvcsNotify != nil { + egressSvcsNotify <- nm + } + } + if !startupTasksDone { + // For containerboot instances that act as TCP proxies (proxying traffic to an endpoint + // passed via one of the env vars that containerboot reads) and store state in a + // Kubernetes Secret, we consider startup tasks done at the point when device info has + // been successfully stored to state Secret. For all other containerboot instances, if + // we just get to this point the startup tasks can be considered done. + if !isL3Proxy(cfg) || !hasKubeStateStore(cfg) || (currentDeviceEndpoints != deephash.Sum{} && currentDeviceID != deephash.Sum{}) { + // This log message is used in tests to detect when all + // post-auth configuration is done. + log.Println("Startup complete, waiting for shutdown signal") + startupTasksDone = true + + // Configure egress proxy. Egress proxy will set up firewall rules to proxy + // traffic to tailnet targets configured in the provided configuration file. It + // will then continuously monitor the config file and netmap updates and + // reconfigure the firewall rules as needed. If any of its operations fail, it + // will crash this node. + if cfg.EgressProxiesCfgPath != "" { + log.Printf("configuring egress proxy using configuration file at %s", cfg.EgressProxiesCfgPath) + egressSvcsNotify = make(chan *netmap.NetworkMap) + opts := egressProxyRunOpts{ + cfgPath: cfg.EgressProxiesCfgPath, + nfr: nfr, + kc: kc, + tsClient: client, + stateSecret: cfg.KubeSecret, + netmapChan: egressSvcsNotify, + podIPv4: cfg.PodIPv4, + tailnetAddrs: addrs, } - ip := ingressProxy{} - if cfg.IngressProxiesCfgPath != "" { - log.Printf("configuring ingress proxy using configuration file at %s", cfg.IngressProxiesCfgPath) - opts := ingressProxyOpts{ - cfgPath: cfg.IngressProxiesCfgPath, - nfr: nfr, - kc: kc, - stateSecret: cfg.KubeSecret, - podIPv4: cfg.PodIPv4, - podIPv6: cfg.PodIPv6, + go func() { + if err := ep.run(ctx, nm, opts); err != nil { + egressSvcsErrorChan <- err } - go func() { - if err := ip.run(ctx, opts); err != nil { - ingressSvcsErrorChan <- err - } - }() + }() + } + ip := ingressProxy{} + if cfg.IngressProxiesCfgPath != "" { + log.Printf("configuring ingress proxy using configuration file at %s", cfg.IngressProxiesCfgPath) + opts := ingressProxyOpts{ + cfgPath: cfg.IngressProxiesCfgPath, + nfr: nfr, + kc: kc, + stateSecret: cfg.KubeSecret, + podIPv4: cfg.PodIPv4, + podIPv6: cfg.PodIPv6, } + go func() { + if err := ip.run(ctx, opts); err != nil { + ingressSvcsErrorChan <- err + } + }() + } - // Wait on tailscaled process. It won't be cleaned up by default when the - // container exits as it is not PID1. TODO (irbekrm): perhaps we can replace the - // reaper by a running cmd.Wait in a goroutine immediately after starting - // tailscaled? - reaper := func() { - defer wg.Done() - for { - var status unix.WaitStatus - _, err := unix.Wait4(daemonProcess.Pid, &status, 0, nil) - if errors.Is(err, unix.EINTR) { - continue - } - if err != nil { - log.Fatalf("Waiting for tailscaled to exit: %v", err) - } - log.Print("tailscaled exited") - os.Exit(0) + // Wait on tailscaled process. It won't be cleaned up by default when the + // container exits as it is not PID1. TODO (irbekrm): perhaps we can replace the + // reaper by a running cmd.Wait in a goroutine immediately after starting + // tailscaled? + reaper := func() { + defer wg.Done() + for { + var status unix.WaitStatus + _, err := unix.Wait4(daemonProcess.Pid, &status, 0, nil) + if errors.Is(err, unix.EINTR) { + continue + } + if err != nil { + log.Fatalf("Waiting for tailscaled to exit: %v", err) } + log.Print("tailscaled exited") + os.Exit(0) } - wg.Add(1) - go reaper() - } - } - case <-tc: - newBackendAddrs, err := resolveDNS(ctx, cfg.ProxyTargetDNSName) - if err != nil { - log.Printf("[unexpected] error resolving DNS name %s: %v", cfg.ProxyTargetDNSName, err) - resetTimer(true) - continue - } - backendsHaveChanged := !(slices.EqualFunc(backendAddrs, newBackendAddrs, func(ip1 net.IP, ip2 net.IP) bool { - return slices.ContainsFunc(newBackendAddrs, func(ip net.IP) bool { return ip.Equal(ip1) }) - })) - if backendsHaveChanged && len(addrs) != 0 { - log.Printf("Backend address change detected, installing proxy rules for backends %v", newBackendAddrs) - if err := installIngressForwardingRuleForDNSTarget(ctx, newBackendAddrs, addrs, nfr); err != nil { - return fmt.Errorf("installing ingress proxy rules for DNS target %s: %v", cfg.ProxyTargetDNSName, err) } + wg.Add(1) + go reaper() } - backendAddrs = newBackendAddrs - resetTimer(false) - case e := <-egressSvcsErrorChan: - return fmt.Errorf("egress proxy failed: %v", e) - case e := <-ingressSvcsErrorChan: - return fmt.Errorf("ingress proxy failed: %v", e) } } wg.Wait() @@ -965,6 +985,15 @@ func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) { } } +// fetchNetMap fetches the current netmap from tailscaled via the +// "current-netmap" localapi debug action. The debug action's payload +// shape is intentionally not part of any stable API; containerboot +// reads its own internal-package types out of it. New external consumers +// should not rely on this — see [local.Client.Status] and friends. +func fetchNetMap(ctx context.Context, lc *local.Client) (*netmap.NetworkMap, error) { + return local.GetDebugResultJSON[*netmap.NetworkMap](ctx, lc, "current-netmap") +} + // resolveTailnetFQDN resolves a tailnet FQDN to a list of IP prefixes, which // can be either a peer device or a Tailscale Service. func resolveTailnetFQDN(nm *netmap.NetworkMap, fqdn string) ([]netip.Prefix, error) { @@ -1026,11 +1055,3 @@ func serviceIPsFromNetMap(nm *netmap.NetworkMap, fqdn dnsname.FQDN) []netip.Pref return prefixes } - -func authkeyFromTailscaledConfig(path string) string { - if cfg, err := conffile.Load(path); err == nil && cfg.Parsed.AuthKey != nil { - return *cfg.Parsed.AuthKey - } - - return "" -} diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index f1d892a19d118..40f575250cb1c 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -15,6 +15,7 @@ import ( "fmt" "io" "io/fs" + "maps" "net" "net/http" "net/http/httptest" @@ -31,6 +32,7 @@ import ( "github.com/google/go-cmp/cmp" "golang.org/x/sys/unix" + "tailscale.com/cmd/testwrapper/flakytest" "tailscale.com/health" "tailscale.com/ipn" "tailscale.com/kube/egressservices" @@ -39,12 +41,12 @@ import ( "tailscale.com/tailcfg" "tailscale.com/tstest" "tailscale.com/types/netmap" - "tailscale.com/types/ptr" ) const configFileAuthKey = "some-auth-key" func TestContainerBoot(t *testing.T) { + flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/19380") boot := filepath.Join(t.TempDir(), "containerboot") if err := exec.Command("go", "build", "-ldflags", "-X main.testSleepDuration=1ms", "-o", boot, "tailscale.com/cmd/containerboot").Run(); err != nil { t.Fatalf("Building containerboot: %v", err) @@ -69,6 +71,12 @@ func TestContainerBoot(t *testing.T) { // Waits below to be true before proceeding to the next phase. Notify *ipn.Notify + // If non-nil, install this NetMap on the fake LocalAPI before + // sending Notify. This is the replacement for the old + // Notify.NetMap field; reactive consumers fetch the current + // netmap via /localapi/v0/netmap on their own. + NetMap *netmap.NetworkMap + // WantCmds is the commands that containerboot should run in this phase. WantCmds []string @@ -102,13 +110,11 @@ func TestContainerBoot(t *testing.T) { EndpointStatuses map[string]int } runningNotify := &ipn.Notify{ - State: ptr.To(ipn.Running), - NetMap: &netmap.NetworkMap{ - SelfNode: (&tailcfg.Node{ - StableID: tailcfg.StableNodeID("myID"), - Name: "test-node.test.ts.net.", - Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, - }).View(), + State: new(ipn.Running), + SelfChange: &tailcfg.Node{ + StableID: tailcfg.StableNodeID("myID"), + Name: "test-node.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, }, } type testCase struct { @@ -380,24 +386,29 @@ func TestContainerBoot(t *testing.T) { }, { Notify: &ipn.Notify{ - State: ptr.To(ipn.Running), - NetMap: &netmap.NetworkMap{ - SelfNode: (&tailcfg.Node{ - StableID: tailcfg.StableNodeID("myID"), - Name: "test-node.test.ts.net.", - Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + State: new(ipn.Running), + SelfChange: &tailcfg.Node{ + StableID: tailcfg.StableNodeID("myID"), + Name: "test-node.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + }, + }, + NetMap: &netmap.NetworkMap{ + SelfNode: (&tailcfg.Node{ + StableID: tailcfg.StableNodeID("myID"), + Name: "test-node.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + }).View(), + Peers: []tailcfg.NodeView{ + (&tailcfg.Node{ + StableID: tailcfg.StableNodeID("ipv6ID"), + Name: "ipv6-node.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("::1/128")}, }).View(), - Peers: []tailcfg.NodeView{ - (&tailcfg.Node{ - StableID: tailcfg.StableNodeID("ipv6ID"), - Name: "ipv6-node.test.ts.net.", - Addresses: []netip.Prefix{netip.MustParsePrefix("::1/128")}, - }).View(), - }, }, }, WantLog: "no forwarding rules for egress addresses [::1/128], host supports IPv6: false", - WantExitCode: ptr.To(1), + WantExitCode: new(1), }, }, } @@ -416,7 +427,7 @@ func TestContainerBoot(t *testing.T) { }, { Notify: &ipn.Notify{ - State: ptr.To(ipn.NeedsLogin), + State: new(ipn.NeedsLogin), }, WantCmds: []string{ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key", @@ -447,7 +458,7 @@ func TestContainerBoot(t *testing.T) { }, { Notify: &ipn.Notify{ - State: ptr.To(ipn.NeedsLogin), + State: new(ipn.NeedsLogin), }, WantCmds: []string{ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=true --authkey=tskey-key", @@ -571,7 +582,7 @@ func TestContainerBoot(t *testing.T) { }, { Notify: &ipn.Notify{ - State: ptr.To(ipn.NeedsLogin), + State: new(ipn.NeedsLogin), }, WantCmds: []string{ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key", @@ -628,15 +639,20 @@ func TestContainerBoot(t *testing.T) { }, { Notify: &ipn.Notify{ - State: ptr.To(ipn.Running), - NetMap: &netmap.NetworkMap{ - SelfNode: (&tailcfg.Node{ - StableID: tailcfg.StableNodeID("newID"), - Name: "new-name.test.ts.net.", - Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, - }).View(), + State: new(ipn.Running), + SelfChange: &tailcfg.Node{ + StableID: tailcfg.StableNodeID("newID"), + Name: "new-name.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, }, }, + NetMap: &netmap.NetworkMap{ + SelfNode: (&tailcfg.Node{ + StableID: tailcfg.StableNodeID("newID"), + Name: "new-name.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + }).View(), + }, WantKubeSecret: map[string]string{ "authkey": "tskey-key", "device_fqdn": "new-name.test.ts.net.", @@ -1092,20 +1108,25 @@ func TestContainerBoot(t *testing.T) { }, { Notify: &ipn.Notify{ - State: ptr.To(ipn.Running), - NetMap: &netmap.NetworkMap{ - SelfNode: (&tailcfg.Node{ - StableID: tailcfg.StableNodeID("myID"), - Name: "test-node.test.ts.net.", - Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + State: new(ipn.Running), + SelfChange: &tailcfg.Node{ + StableID: tailcfg.StableNodeID("myID"), + Name: "test-node.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + }, + }, + NetMap: &netmap.NetworkMap{ + SelfNode: (&tailcfg.Node{ + StableID: tailcfg.StableNodeID("myID"), + Name: "test-node.test.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}, + }).View(), + Peers: []tailcfg.NodeView{ + (&tailcfg.Node{ + StableID: tailcfg.StableNodeID("fooID"), + Name: "foo.tailnetxyz.ts.net.", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.2/32")}, }).View(), - Peers: []tailcfg.NodeView{ - (&tailcfg.Node{ - StableID: tailcfg.StableNodeID("fooID"), - Name: "foo.tailnetxyz.ts.net.", - Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.2/32")}, - }).View(), - }, }, }, WantKubeSecret: map[string]string{ @@ -1132,7 +1153,7 @@ func TestContainerBoot(t *testing.T) { Phases: []phase{ { WantLog: "TS_EGRESS_PROXIES_CONFIG_PATH is only supported for Tailscale running on Kubernetes", - WantExitCode: ptr.To(1), + WantExitCode: new(1), }, }, } @@ -1181,7 +1202,7 @@ func TestContainerBoot(t *testing.T) { { // SIGTERM before state is finished writing, should wait for // consistent state before propagating SIGTERM to tailscaled. - Signal: ptr.To(unix.SIGTERM), + Signal: new(unix.SIGTERM), UpdateKubeSecret: map[string]string{ "_machinekey": "foo", "_profiles": "foo", @@ -1211,7 +1232,7 @@ func TestContainerBoot(t *testing.T) { kubetypes.KeyCapVer: capver, }, WantLog: "HTTP server at [::]:9002 closed", - WantExitCode: ptr.To(0), + WantExitCode: new(0), }, }, } @@ -1274,6 +1295,18 @@ func TestContainerBoot(t *testing.T) { t.Fatalf("phase %d: updating mtime for %q: %v", i, path, err) } } + nmForFake := p.NetMap + if nmForFake == nil && p.Notify != nil && p.Notify.SelfChange != nil { + // Synthesize a minimal netmap from SelfChange so + // containerboot's NetMap() fetch returns + // something usable when the test only set Notify. + nmForFake = &netmap.NetworkMap{ + SelfNode: p.Notify.SelfChange.View(), + } + } + if nmForFake != nil { + env.lapi.SetNetMap(nmForFake) + } env.lapi.Notify(p.Notify) if p.Signal != nil { cmd.Process.Signal(*p.Signal) @@ -1396,7 +1429,7 @@ func (b *lockingBuffer) String() string { func waitLogLine(t *testing.T, timeout time.Duration, b *lockingBuffer, want string) { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { - for _, line := range strings.Split(b.String(), "\n") { + for line := range strings.SplitSeq(b.String(), "\n") { if !strings.HasPrefix(line, "boot: ") { continue } @@ -1466,6 +1499,7 @@ type localAPI struct { sync.Mutex cond *sync.Cond notify *ipn.Notify + netmap *netmap.NetworkMap // served by /localapi/v0/netmap } func (lc *localAPI) Start() error { @@ -1502,8 +1536,44 @@ func (lc *localAPI) Notify(n *ipn.Notify) { lc.cond.Broadcast() } +// SetNetMap installs the netmap that the fake /localapi/v0/netmap endpoint +// will return. +func (lc *localAPI) SetNetMap(nm *netmap.NetworkMap) { + lc.Lock() + defer lc.Unlock() + lc.netmap = nm +} + func (lc *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { + case "/localapi/v0/netmap": + w.Header().Set("Content-Type", "application/json") + lc.Lock() + nm := lc.netmap + lc.Unlock() + if nm == nil { + http.Error(w, "no netmap", http.StatusServiceUnavailable) + return + } + json.NewEncoder(w).Encode(nm) + return + case "/localapi/v0/debug": + // containerboot fetches the netmap via the "current-netmap" + // debug action; serve it like /localapi/v0/netmap above. + if r.URL.Query().Get("action") != "current-netmap" { + http.Error(w, "unsupported debug action", http.StatusNotFound) + return + } + w.Header().Set("Content-Type", "application/json") + lc.Lock() + nm := lc.netmap + lc.Unlock() + if nm == nil { + http.Error(w, "no netmap", http.StatusServiceUnavailable) + return + } + json.NewEncoder(w).Encode(nm) + return case "/localapi/v0/serve-config": switch r.Method { case "GET": @@ -1592,9 +1662,7 @@ func (k *kubeServer) Secret() map[string]string { k.Lock() defer k.Unlock() ret := map[string]string{} - for k, v := range k.secret { - ret[k] = v - } + maps.Copy(ret, k.secret) return ret } diff --git a/cmd/k8s-operator/api-server-proxy-pg.go b/cmd/k8s-operator/api-server-proxy-pg.go index 0900fd0aaa264..ac2f7357768fc 100644 --- a/cmd/k8s-operator/api-server-proxy-pg.go +++ b/cmd/k8s-operator/api-server-proxy-pg.go @@ -23,10 +23,11 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale/v2" - "tailscale.com/internal/client/tailscale" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/k8s-proxy/conf" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" @@ -51,7 +52,7 @@ type KubeAPIServerTSServiceReconciler struct { client.Client recorder record.EventRecorder logger *zap.SugaredLogger - tsClient tsClient + clients ClientProvider tsNamespace string defaultTags []string operatorID string // stableID of the operator's Tailscale device @@ -77,15 +78,14 @@ func (r *KubeAPIServerTSServiceReconciler) Reconcile(ctx context.Context, req re serviceName := serviceNameForAPIServerProxy(pg) logger = logger.With("Tailscale Service", serviceName) - - tailscaleClient, err := r.getClient(ctx, pg.Spec.Tailnet) + tsClient, err := r.clients.For(pg.Spec.Tailnet) if err != nil { return res, fmt.Errorf("failed to get tailscale client: %w", err) } if markedForDeletion(pg) { logger.Debugf("ProxyGroup is being deleted, ensuring any created resources are cleaned up") - if err = r.maybeCleanup(ctx, serviceName, pg, logger, tailscaleClient); err != nil && strings.Contains(err.Error(), optimisticLockErrorMsg) { + if err = r.maybeCleanup(ctx, serviceName, pg, logger, tsClient); err != nil && strings.Contains(err.Error(), optimisticLockErrorMsg) { logger.Infof("optimistic lock error, retrying: %s", err) return res, nil } @@ -93,7 +93,7 @@ func (r *KubeAPIServerTSServiceReconciler) Reconcile(ctx context.Context, req re return res, err } - err = r.maybeProvision(ctx, serviceName, pg, logger, tailscaleClient) + err = r.maybeProvision(ctx, serviceName, pg, logger, tsClient) if err != nil { if strings.Contains(err.Error(), optimisticLockErrorMsg) { logger.Infof("optimistic lock error, retrying: %s", err) @@ -105,31 +105,15 @@ func (r *KubeAPIServerTSServiceReconciler) Reconcile(ctx context.Context, req re return reconcile.Result{}, nil } -// getClient returns the appropriate Tailscale client for the given tailnet. -// If no tailnet is specified, returns the default client. -func (r *KubeAPIServerTSServiceReconciler) getClient(ctx context.Context, tailnetName string) (tsClient, - error) { - if tailnetName == "" { - return r.tsClient, nil - } - - tc, _, err := clientForTailnet(ctx, r.Client, r.tsNamespace, tailnetName) - if err != nil { - return nil, err - } - - return tc, nil -} - // maybeProvision ensures that a Tailscale Service for this ProxyGroup exists // and is up to date. // // Returns true if the operation resulted in a Tailscale Service update. -func (r *KubeAPIServerTSServiceReconciler) maybeProvision(ctx context.Context, serviceName tailcfg.ServiceName, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsClient) (err error) { +func (r *KubeAPIServerTSServiceReconciler) maybeProvision(ctx context.Context, serviceName tailcfg.ServiceName, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsclient.Client) (err error) { var dnsName string oldPGStatus := pg.Status.DeepCopy() defer func() { - podsAdvertising, podsErr := numberPodsAdvertising(ctx, r.Client, r.tsNamespace, pg.Name, serviceName) + podsAdvertising, podsErr := numberPodsAdvertising(ctx, r.Client, r.tsNamespace, pg.Name, serviceName.String()) if podsErr != nil { err = errors.Join(err, fmt.Errorf("failed to get number of advertised Pods: %w", podsErr)) // Continue, updating the status with the best available information. @@ -177,8 +161,8 @@ func (r *KubeAPIServerTSServiceReconciler) maybeProvision(ctx context.Context, s // 1. Check there isn't a Tailscale Service with the same hostname // already created and not owned by this ProxyGroup. - existingTSSvc, err := tsClient.GetVIPService(ctx, serviceName) - if err != nil && !isErrorTailscaleServiceNotFound(err) { + existingTSSvc, err := tsClient.VIPServices().Get(ctx, serviceName.String()) + if err != nil && !tailscale.IsNotFound(err) { return fmt.Errorf("error getting Tailscale Service %q: %w", serviceName, err) } @@ -202,8 +186,8 @@ func (r *KubeAPIServerTSServiceReconciler) maybeProvision(ctx context.Context, s serviceTags = pg.Spec.Tags.Stringify() } - tsSvc := &tailscale.VIPService{ - Name: serviceName, + tsSvc := tailscale.VIPService{ + Name: serviceName.String(), Tags: serviceTags, Ports: []string{"tcp:443"}, Comment: managedTSServiceComment, @@ -216,10 +200,10 @@ func (r *KubeAPIServerTSServiceReconciler) maybeProvision(ctx context.Context, s // 2. Ensure the Tailscale Service exists and is up to date. if existingTSSvc == nil || !slices.Equal(tsSvc.Tags, existingTSSvc.Tags) || - !ownersAreSetAndEqual(tsSvc, existingTSSvc) || + !ownersAreSetAndEqual(tsSvc, *existingTSSvc) || !slices.Equal(tsSvc.Ports, existingTSSvc.Ports) { logger.Infof("Ensuring Tailscale Service exists and is up to date") - if err = tsClient.CreateOrUpdateVIPService(ctx, tsSvc); err != nil { + if err = tsClient.VIPServices().CreateOrUpdate(ctx, tsSvc); err != nil { return fmt.Errorf("error creating Tailscale Service: %w", err) } } @@ -248,10 +232,10 @@ func (r *KubeAPIServerTSServiceReconciler) maybeProvision(ctx context.Context, s } // maybeCleanup ensures that any resources, such as a Tailscale Service created for this Service, are cleaned up when the -// Service is being deleted or is unexposed. The cleanup is safe for a multi-cluster setup- the Tailscale Service is only +// Service is being deleted or is unexposed. The cleanup is safe for a multi-cluster setup. The Tailscale Service is only // deleted if it does not contain any other owner references. If it does, the cleanup only removes the owner reference // corresponding to this Service. -func (r *KubeAPIServerTSServiceReconciler) maybeCleanup(ctx context.Context, serviceName tailcfg.ServiceName, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsClient) (err error) { +func (r *KubeAPIServerTSServiceReconciler) maybeCleanup(ctx context.Context, serviceName tailcfg.ServiceName, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, client tsclient.Client) (err error) { ix := slices.Index(pg.Finalizers, proxyPGFinalizerName) if ix < 0 { logger.Debugf("no finalizer, nothing to do") @@ -265,7 +249,7 @@ func (r *KubeAPIServerTSServiceReconciler) maybeCleanup(ctx context.Context, ser } }() - if _, err = cleanupTailscaleService(ctx, tsClient, serviceName, r.operatorID, logger); err != nil { + if _, err = cleanupTailscaleService(ctx, client, serviceName.String(), r.operatorID, logger); err != nil { return fmt.Errorf("error deleting Tailscale Service: %w", err) } @@ -278,16 +262,16 @@ func (r *KubeAPIServerTSServiceReconciler) maybeCleanup(ctx context.Context, ser // maybeDeleteStaleServices deletes Services that have previously been created for // this ProxyGroup but are no longer needed. -func (r *KubeAPIServerTSServiceReconciler) maybeDeleteStaleServices(ctx context.Context, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsClient) error { +func (r *KubeAPIServerTSServiceReconciler) maybeDeleteStaleServices(ctx context.Context, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsclient.Client) error { serviceName := serviceNameForAPIServerProxy(pg) - svcs, err := tsClient.ListVIPServices(ctx) + svcs, err := tsClient.VIPServices().List(ctx) if err != nil { return fmt.Errorf("error listing Tailscale Services: %w", err) } - for _, svc := range svcs.VIPServices { - if svc.Name == serviceName { + for _, svc := range svcs { + if svc.Name == serviceName.String() { continue } @@ -306,11 +290,11 @@ func (r *KubeAPIServerTSServiceReconciler) maybeDeleteStaleServices(ctx context. } logger.Infof("Deleting Tailscale Service %s", svc.Name) - if err = tsClient.DeleteVIPService(ctx, svc.Name); err != nil && !isErrorTailscaleServiceNotFound(err) { + if err = tsClient.VIPServices().Delete(ctx, svc.Name); err != nil && !tailscale.IsNotFound(err) { return fmt.Errorf("error deleting Tailscale Service %s: %w", svc.Name, err) } - if err = cleanupCertResources(ctx, r.Client, r.tsNamespace, svc.Name, pg); err != nil { + if err = cleanupCertResources(ctx, r.Client, r.tsNamespace, tailcfg.ServiceName(svc.Name), pg); err != nil { return fmt.Errorf("failed to clean up cert resources: %w", err) } } diff --git a/cmd/k8s-operator/api-server-proxy-pg_test.go b/cmd/k8s-operator/api-server-proxy-pg_test.go index f7277c70d5717..889ef064b05d9 100644 --- a/cmd/k8s-operator/api-server-proxy-pg_test.go +++ b/cmd/k8s-operator/api-server-proxy-pg_test.go @@ -16,16 +16,16 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "tailscale.com/client/tailscale/v2" - "tailscale.com/internal/client/tailscale" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/k8s-proxy/conf" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" "tailscale.com/tstest" "tailscale.com/types/opt" - "tailscale.com/types/ptr" ) func TestAPIServerProxyReconciler(t *testing.T) { @@ -57,7 +57,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { initialCfg := &conf.VersionedConfig{ Version: "v1alpha1", ConfigV1Alpha1: &conf.ConfigV1Alpha1{ - AuthKey: ptr.To("test-key"), + AuthKey: new("test-key"), APIServerProxy: &conf.APIServerProxyConfig{ Enabled: opt.NewBool(true), }, @@ -94,8 +94,10 @@ func TestAPIServerProxyReconciler(t *testing.T) { expectEqual(t, fc, pgCfgSecret) } - ft := &fakeTSClient{} - ingressTSSvc := &tailscale.VIPService{ + ft := &fakeTSClient{ + vipServices: make(map[string]tailscale.VIPService), + } + ingressTSSvc := tailscale.VIPService{ Name: "svc:some-ingress-hostname", Comment: managedTSServiceComment, Annotations: map[string]string{ @@ -106,11 +108,11 @@ func TestAPIServerProxyReconciler(t *testing.T) { Tags: []string{"tag:k8s"}, Addrs: []string{"5.6.7.8"}, } - ft.CreateOrUpdateVIPService(t.Context(), ingressTSSvc) + ft.VIPServices().CreateOrUpdate(t.Context(), ingressTSSvc) r := &KubeAPIServerTSServiceReconciler{ Client: fc, - tsClient: ft, + clients: tsclient.NewProvider(ft), defaultTags: []string{"tag:k8s"}, tsNamespace: ns, logger: zap.Must(zap.NewDevelopment()).Sugar(), @@ -120,7 +122,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { } // Create a Tailscale Service that will conflict with the initial config. - if err := ft.CreateOrUpdateVIPService(t.Context(), &tailscale.VIPService{ + if err := ft.VIPServices().CreateOrUpdate(t.Context(), tailscale.VIPService{ Name: "svc:" + pgName, }); err != nil { t.Fatalf("creating initial Tailscale Service: %v", err) @@ -136,7 +138,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { expectEqual(t, fc, pgCfgSecret) // Unchanged. // Delete Tailscale Service; should see Service created and valid condition updated to true. - if err := ft.DeleteVIPService(t.Context(), "svc:"+pgName); err != nil { + if err := ft.VIPServices().Delete(t.Context(), "svc:"+pgName); err != nil { t.Fatalf("deleting initial Tailscale Service: %v", err) } @@ -155,7 +157,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { expectReconciled(t, r, "", pgName) - tsSvc, err := ft.GetVIPService(t.Context(), "svc:"+pgName) + tsSvc, err := ft.VIPServices().Get(t.Context(), "svc:"+pgName) if err != nil { t.Fatalf("getting Tailscale Service: %v", err) } @@ -179,7 +181,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { tsoperator.SetProxyGroupCondition(pg, tsapi.KubeAPIServerProxyConfigured, metav1.ConditionFalse, reasonKubeAPIServerProxyNoBackends, "", 1, r.clock, r.logger) expectEqual(t, fc, pg, omitPGStatusConditionMessages) - expectedCfg.APIServerProxy.ServiceName = ptr.To(tailcfg.ServiceName("svc:" + pgName)) + expectedCfg.APIServerProxy.ServiceName = new(tailcfg.ServiceName("svc:" + pgName)) expectCfg(&expectedCfg) expectEqual(t, fc, certSecret(pgName, ns, defaultDomain, pg)) @@ -224,20 +226,20 @@ func TestAPIServerProxyReconciler(t *testing.T) { p.Spec.KubeAPIServer = pg.Spec.KubeAPIServer }) expectReconciled(t, r, "", pgName) - _, err = ft.GetVIPService(t.Context(), "svc:"+pgName) - if !isErrorTailscaleServiceNotFound(err) { + _, err = ft.VIPServices().Get(t.Context(), "svc:"+pgName) + if !tailscale.IsNotFound(err) { t.Fatalf("Expected 404, got: %v", err) } - tsSvc, err = ft.GetVIPService(t.Context(), updatedServiceName) + tsSvc, err = ft.VIPServices().Get(t.Context(), updatedServiceName.String()) if err != nil { t.Fatalf("Expected renamed svc, got error: %v", err) } - expectedTSSvc.Name = updatedServiceName + expectedTSSvc.Name = updatedServiceName.String() if !reflect.DeepEqual(tsSvc, expectedTSSvc) { t.Fatalf("expected Tailscale Service to be %+v, got %+v", expectedTSSvc, tsSvc) } // Check cfg and status reset until TLS certs are available again. - expectedCfg.APIServerProxy.ServiceName = ptr.To(updatedServiceName) + expectedCfg.APIServerProxy.ServiceName = new(updatedServiceName) expectedCfg.AdvertiseServices = nil expectCfg(&expectedCfg) tsoperator.SetProxyGroupCondition(pg, tsapi.KubeAPIServerProxyConfigured, metav1.ConditionFalse, reasonKubeAPIServerProxyNoBackends, "", 1, r.clock, r.logger) @@ -270,17 +272,17 @@ func TestAPIServerProxyReconciler(t *testing.T) { expectMissing[corev1.Secret](t, fc, ns, updatedDomain) expectMissing[rbacv1.Role](t, fc, ns, updatedDomain) expectMissing[rbacv1.RoleBinding](t, fc, ns, updatedDomain) - _, err = ft.GetVIPService(t.Context(), updatedServiceName) - if !isErrorTailscaleServiceNotFound(err) { + _, err = ft.VIPServices().Get(t.Context(), updatedServiceName.String()) + if !tailscale.IsNotFound(err) { t.Fatalf("Expected 404, got: %v", err) } // Ingress Tailscale Service should not be affected. - svc, err := ft.GetVIPService(t.Context(), ingressTSSvc.Name) + svc, err := ft.VIPServices().Get(t.Context(), ingressTSSvc.Name) if err != nil { t.Fatalf("getting ingress Tailscale Service: %v", err) } - if !reflect.DeepEqual(svc, ingressTSSvc) { + if !reflect.DeepEqual(svc, &ingressTSSvc) { t.Fatalf("expected ingress Tailscale Service to be unmodified %+v, got %+v", ingressTSSvc, svc) } } @@ -293,8 +295,7 @@ func TestExclusiveOwnerAnnotations(t *testing.T) { }, } const ( - selfOperatorID = "self-id" - pg1Owner = `{"ownerRefs":[{"operatorID":"self-id","resource":{"kind":"ProxyGroup","name":"pg1","uid":"pg1-uid"}}]}` + pg1Owner = `{"ownerRefs":[{"operatorID":"self-id","resource":{"kind":"ProxyGroup","name":"pg1","uid":"pg1-uid"}}]}` ) for name, tc := range map[string]struct { diff --git a/cmd/k8s-operator/ingress-for-pg.go b/cmd/k8s-operator/ingress-for-pg.go index 60196ce1505ff..d6872f6809177 100644 --- a/cmd/k8s-operator/ingress-for-pg.go +++ b/cmd/k8s-operator/ingress-for-pg.go @@ -10,8 +10,8 @@ import ( "encoding/json" "errors" "fmt" + "maps" "math/rand/v2" - "net/http" "reflect" "slices" "strings" @@ -29,11 +29,12 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale/v2" - "tailscale.com/internal/client/tailscale" "tailscale.com/ipn" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" "tailscale.com/util/clientmetric" @@ -63,7 +64,7 @@ type HAIngressReconciler struct { recorder record.EventRecorder logger *zap.SugaredLogger - tsClient tsClient + clients ClientProvider tsnetServer tsnetServer tsNamespace string defaultTags []string @@ -126,7 +127,7 @@ func (r *HAIngressReconciler) Reconcile(ctx context.Context, req reconcile.Reque return res, fmt.Errorf("getting ProxyGroup %q: %w", pgName, err) } - tailscaleClient, err := clientFromProxyGroup(ctx, r.Client, pg, r.tsNamespace, r.tsClient) + tsClient, err := r.clients.For(pg.Spec.Tailnet) if err != nil { return res, fmt.Errorf("failed to get tailscale client: %w", err) } @@ -138,9 +139,9 @@ func (r *HAIngressReconciler) Reconcile(ctx context.Context, req reconcile.Reque // resulted in another actor overwriting our Tailscale Service update. needsRequeue := false if !ing.DeletionTimestamp.IsZero() || !r.shouldExpose(ing) { - needsRequeue, err = r.maybeCleanup(ctx, hostname, ing, logger, tailscaleClient, pg) + needsRequeue, err = r.maybeCleanup(ctx, hostname, ing, logger, tsClient, pg) } else { - needsRequeue, err = r.maybeProvision(ctx, hostname, ing, logger, tailscaleClient, pg) + needsRequeue, err = r.maybeProvision(ctx, hostname, ing, logger, tsClient, pg) } if err != nil { return res, err @@ -159,12 +160,12 @@ func (r *HAIngressReconciler) Reconcile(ctx context.Context, req reconcile.Reque // If a Tailscale Service exists, but does not have an owner reference from any operator, we error // out assuming that this is an owner reference created by an unknown actor. // Returns true if the operation resulted in a Tailscale Service update. -func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname string, ing *networkingv1.Ingress, logger *zap.SugaredLogger, tsClient tsClient, pg *tsapi.ProxyGroup) (svcsChanged bool, err error) { +func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname string, ing *networkingv1.Ingress, logger *zap.SugaredLogger, tsClient tsclient.Client, pg *tsapi.ProxyGroup) (svcsChanged bool, err error) { // Currently (2025-05) Tailscale Services are behind an alpha feature flag that // needs to be explicitly enabled for a tailnet to be able to use them. serviceName := tailcfg.ServiceName("svc:" + hostname) - existingTSSvc, err := tsClient.GetVIPService(ctx, serviceName) - if err != nil && !isErrorTailscaleServiceNotFound(err) { + existingTSSvc, err := tsClient.VIPServices().Get(ctx, serviceName.String()) + if err != nil && !tailscale.IsNotFound(err) { return false, fmt.Errorf("error getting Tailscale Service %q: %w", hostname, err) } @@ -340,8 +341,8 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin tsSvcPorts = append(tsSvcPorts, "tcp:80") } - tsSvc := &tailscale.VIPService{ - Name: serviceName, + tsSvc := tailscale.VIPService{ + Name: serviceName.String(), Tags: tags, Ports: tsSvcPorts, Comment: managedTSServiceComment, @@ -356,9 +357,9 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin if existingTSSvc == nil || !reflect.DeepEqual(tsSvc.Tags, existingTSSvc.Tags) || !reflect.DeepEqual(tsSvc.Ports, existingTSSvc.Ports) || - !ownersAreSetAndEqual(tsSvc, existingTSSvc) { + !ownersAreSetAndEqual(tsSvc, *existingTSSvc) { logger.Infof("Ensuring Tailscale Service exists and is up to date") - if err := tsClient.CreateOrUpdateVIPService(ctx, tsSvc); err != nil { + if err := tsClient.VIPServices().CreateOrUpdate(ctx, tsSvc); err != nil { return false, fmt.Errorf("error creating Tailscale Service: %w", err) } } @@ -374,7 +375,7 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin } // 6. Update Ingress status if ProxyGroup Pods are ready. - count, err := numberPodsAdvertising(ctx, r.Client, r.tsNamespace, pg.Name, serviceName) + count, err := numberPodsAdvertising(ctx, r.Client, r.tsNamespace, pg.Name, serviceName.String()) if err != nil { return false, fmt.Errorf("failed to check if any Pods are configured: %w", err) } @@ -439,7 +440,7 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin // operator instances, else the owner reference is cleaned up. Returns true if // the operation resulted in an existing Tailscale Service updates (owner // reference removal). -func (r *HAIngressReconciler) maybeCleanupProxyGroup(ctx context.Context, logger *zap.SugaredLogger, tsClient tsClient, pg *tsapi.ProxyGroup) (svcsChanged bool, err error) { +func (r *HAIngressReconciler) maybeCleanupProxyGroup(ctx context.Context, logger *zap.SugaredLogger, tsClient tsclient.Client, pg *tsapi.ProxyGroup) (svcsChanged bool, err error) { // Get serve config for the ProxyGroup cm, cfg, err := r.proxyGroupServeConfig(ctx, pg.Name) if err != nil { @@ -469,11 +470,11 @@ func (r *HAIngressReconciler) maybeCleanupProxyGroup(ctx context.Context, logger if !found { logger.Infof("Tailscale Service %q is not owned by any Ingress, cleaning up", tsSvcName) - tsService, err := tsClient.GetVIPService(ctx, tsSvcName) - if isErrorTailscaleServiceNotFound(err) { + tsService, err := tsClient.VIPServices().Get(ctx, tsSvcName.String()) + switch { + case tailscale.IsNotFound(err): return false, nil - } - if err != nil { + case err != nil: return false, fmt.Errorf("getting Tailscale Service %q: %w", tsSvcName, err) } @@ -518,17 +519,19 @@ func (r *HAIngressReconciler) maybeCleanupProxyGroup(ctx context.Context, logger // Ingress is being deleted or is unexposed. The cleanup is safe for a multi-cluster setup- the Tailscale Service is only // deleted if it does not contain any other owner references. If it does the cleanup only removes the owner reference // corresponding to this Ingress. -func (r *HAIngressReconciler) maybeCleanup(ctx context.Context, hostname string, ing *networkingv1.Ingress, logger *zap.SugaredLogger, tsClient tsClient, pg *tsapi.ProxyGroup) (svcChanged bool, err error) { +func (r *HAIngressReconciler) maybeCleanup(ctx context.Context, hostname string, ing *networkingv1.Ingress, logger *zap.SugaredLogger, tsClient tsclient.Client, pg *tsapi.ProxyGroup) (svcChanged bool, err error) { logger.Debugf("Ensuring any resources for Ingress are cleaned up") ix := slices.Index(ing.Finalizers, FinalizerNamePG) if ix < 0 { logger.Debugf("no finalizer, nothing to do") return false, nil } + logger.Infof("Ensuring that Tailscale Service %q configuration is cleaned up", hostname) serviceName := tailcfg.ServiceName("svc:" + hostname) - svc, err := tsClient.GetVIPService(ctx, serviceName) - if err != nil && !isErrorTailscaleServiceNotFound(err) { + + svc, err := tsClient.VIPServices().Get(ctx, serviceName.String()) + if err != nil && !tailscale.IsNotFound(err) { return false, fmt.Errorf("error getting Tailscale Service: %w", err) } @@ -697,10 +700,7 @@ func (r *HAIngressReconciler) validateIngress(ctx context.Context, ing *networki // If a Tailscale Service is found, but contains other owner references, only removes this operator's owner reference. // If a Tailscale Service by the given name is not found or does not contain this operator's owner reference, do nothing. // It returns true if an existing Tailscale Service was updated to remove owner reference, as well as any error that occurred. -func (r *HAIngressReconciler) cleanupTailscaleService(ctx context.Context, svc *tailscale.VIPService, logger *zap.SugaredLogger, tsClient tsClient) (updated bool, _ error) { - if svc == nil { - return false, nil - } +func (r *HAIngressReconciler) cleanupTailscaleService(ctx context.Context, svc *tailscale.VIPService, logger *zap.SugaredLogger, tsClient tsclient.Client) (updated bool, _ error) { o, err := parseOwnerAnnotation(svc) if err != nil { return false, fmt.Errorf("error parsing Tailscale Service's owner annotation") @@ -720,7 +720,7 @@ func (r *HAIngressReconciler) cleanupTailscaleService(ctx context.Context, svc * } if len(o.OwnerRefs) == 1 { logger.Infof("Deleting Tailscale Service %q", svc.Name) - if err = tsClient.DeleteVIPService(ctx, svc.Name); err != nil && !isErrorTailscaleServiceNotFound(err) { + if err = tsClient.VIPServices().Delete(ctx, svc.Name); err != nil && !tailscale.IsNotFound(err) { return false, err } @@ -734,7 +734,7 @@ func (r *HAIngressReconciler) cleanupTailscaleService(ctx context.Context, svc * return false, fmt.Errorf("error marshalling updated Tailscale Service owner reference: %w", err) } svc.Annotations[ownerAnnotation] = string(json) - return true, tsClient.CreateOrUpdateVIPService(ctx, svc) + return true, tsClient.VIPServices().CreateOrUpdate(ctx, *svc) } // isHTTPEndpointEnabled returns true if the Ingress has been configured to expose an HTTP endpoint to tailnet. @@ -818,7 +818,7 @@ func (r *HAIngressReconciler) maybeUpdateAdvertiseServicesConfig(ctx context.Con return nil } -func numberPodsAdvertising(ctx context.Context, cl client.Client, tsNamespace, pgName string, serviceName tailcfg.ServiceName) (int, error) { +func numberPodsAdvertising(ctx context.Context, cl client.Client, tsNamespace, pgName string, serviceName string) (int, error) { // Get all state Secrets for this ProxyGroup. secrets := &corev1.SecretList{} if err := cl.List(ctx, secrets, client.InNamespace(tsNamespace), client.MatchingLabels(pgSecretLabels(pgName, kubetypes.LabelSecretTypeState))); err != nil { @@ -834,7 +834,7 @@ func numberPodsAdvertising(ctx context.Context, cl client.Client, tsNamespace, p if !ok { continue } - if slices.Contains(prefs.AdvertiseServices, serviceName.String()) { + if slices.Contains(prefs.AdvertiseServices, serviceName) { count++ } } @@ -904,15 +904,17 @@ func ownerAnnotations(operatorID string, svc *tailscale.VIPService) (map[string] } newAnnots := make(map[string]string, len(svc.Annotations)+1) - for k, v := range svc.Annotations { - newAnnots[k] = v - } + maps.Copy(newAnnots, svc.Annotations) newAnnots[ownerAnnotation] = string(json) return newAnnots, nil } // parseOwnerAnnotation returns nil if no valid owner found. func parseOwnerAnnotation(tsSvc *tailscale.VIPService) (*ownerAnnotationValue, error) { + if tsSvc == nil { + return nil, nil + } + if tsSvc.Annotations == nil || tsSvc.Annotations[ownerAnnotation] == "" { return nil, nil } @@ -923,9 +925,8 @@ func parseOwnerAnnotation(tsSvc *tailscale.VIPService) (*ownerAnnotationValue, e return o, nil } -func ownersAreSetAndEqual(a, b *tailscale.VIPService) bool { - return a != nil && b != nil && - a.Annotations != nil && b.Annotations != nil && +func ownersAreSetAndEqual(a, b tailscale.VIPService) bool { + return a.Annotations != nil && b.Annotations != nil && a.Annotations[ownerAnnotation] != "" && b.Annotations[ownerAnnotation] != "" && strings.EqualFold(a.Annotations[ownerAnnotation], b.Annotations[ownerAnnotation]) @@ -1080,7 +1081,7 @@ func certResourceLabels(pgName, domain string) map[string]string { return map[string]string{ kubetypes.LabelManaged: "true", labelProxyGroup: pgName, - labelDomain: domain, + labelDomain: tsoperator.TruncateLabelValue(domain), } } @@ -1108,12 +1109,6 @@ func hasCerts(ctx context.Context, cl client.Client, ns string, svc tailcfg.Serv return len(cert) > 0 && len(key) > 0, nil } -func isErrorTailscaleServiceNotFound(err error) bool { - var errResp tailscale.ErrResponse - ok := errors.As(err, &errResp) - return ok && errResp.Status == http.StatusNotFound -} - func tagViolations(obj client.Object) []string { var violations []string if obj == nil { @@ -1124,7 +1119,7 @@ func tagViolations(obj client.Object) []string { return nil } - for _, tag := range strings.Split(tags, ",") { + for tag := range strings.SplitSeq(tags, ",") { tag = strings.TrimSpace(tag) if err := tailcfg.CheckTag(tag); err != nil { violations = append(violations, fmt.Sprintf("invalid tag %q: %v", tag, err)) diff --git a/cmd/k8s-operator/proxygroup.go b/cmd/k8s-operator/proxygroup.go index 2aef6ff9e8226..9df8460b7c92b 100644 --- a/cmd/k8s-operator/proxygroup.go +++ b/cmd/k8s-operator/proxygroup.go @@ -10,7 +10,6 @@ import ( "encoding/json" "errors" "fmt" - "net/http" "net/netip" "slices" "sort" @@ -33,18 +32,18 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale/v2" - "tailscale.com/client/tailscale" "tailscale.com/ipn" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/egressservices" "tailscale.com/kube/k8s-proxy/conf" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" "tailscale.com/tstime" "tailscale.com/types/opt" - "tailscale.com/types/ptr" "tailscale.com/util/clientmetric" "tailscale.com/util/mak" "tailscale.com/util/set" @@ -86,7 +85,7 @@ type ProxyGroupReconciler struct { log *zap.SugaredLogger recorder record.EventRecorder clock tstime.Clock - tsClient tsClient + clients ClientProvider // User-specified defaults from the helm installation. tsNamespace string @@ -123,7 +122,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ return reconcile.Result{}, fmt.Errorf("failed to get tailscale.com ProxyGroup: %w", err) } - tailscaleClient, loginUrl, err := r.getClientAndLoginURL(ctx, pg.Spec.Tailnet) + tsClient, err := r.clients.For(pg.Spec.Tailnet) if err != nil { oldPGStatus := pg.Status.DeepCopy() nrr := ¬ReadyReason{ @@ -142,7 +141,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ return reconcile.Result{}, nil } - if done, err := r.maybeCleanup(ctx, tailscaleClient, pg); err != nil { + if done, err := r.maybeCleanup(ctx, tsClient, pg); err != nil { if strings.Contains(err.Error(), optimisticLockErrorMsg) { logger.Infof("optimistic lock error, retrying: %s", err) return reconcile.Result{}, nil @@ -161,7 +160,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ } oldPGStatus := pg.Status.DeepCopy() - staticEndpoints, nrr, err := r.reconcilePG(ctx, tailscaleClient, loginUrl, pg, logger) + staticEndpoints, nrr, err := r.reconcilePG(ctx, tsClient, pg, logger) return reconcile.Result{}, errors.Join(err, r.maybeUpdateStatus(ctx, logger, pg, oldPGStatus, nrr, staticEndpoints)) } @@ -169,7 +168,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ // for deletion. It is separated out from Reconcile to make a clear separation // between reconciling the ProxyGroup, and posting the status of its created // resources onto the ProxyGroup status field. -func (r *ProxyGroupReconciler) reconcilePG(ctx context.Context, tailscaleClient tsClient, loginUrl string, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger) (map[string][]netip.AddrPort, *notReadyReason, error) { +func (r *ProxyGroupReconciler) reconcilePG(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger) (map[string][]netip.AddrPort, *notReadyReason, error) { if !slices.Contains(pg.Finalizers, FinalizerName) { // This log line is printed exactly once during initial provisioning, // because once the finalizer is in place this block gets skipped. So, @@ -210,7 +209,7 @@ func (r *ProxyGroupReconciler) reconcilePG(ctx context.Context, tailscaleClient return notReady(reasonProxyGroupInvalid, fmt.Sprintf("invalid ProxyGroup spec: %v", err)) } - staticEndpoints, nrr, err := r.maybeProvision(ctx, tailscaleClient, loginUrl, pg, proxyClass) + staticEndpoints, nrr, err := r.maybeProvision(ctx, tsClient, pg, proxyClass) if err != nil { return nil, nrr, err } @@ -296,7 +295,7 @@ func (r *ProxyGroupReconciler) validate(ctx context.Context, pg *tsapi.ProxyGrou return errors.Join(errs...) } -func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClient tsClient, loginUrl string, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (map[string][]netip.AddrPort, *notReadyReason, error) { +func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (map[string][]netip.AddrPort, *notReadyReason, error) { logger := r.logger(pg.Name) r.mu.Lock() r.ensureStateAddedForProxyGroup(pg) @@ -308,8 +307,7 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClie var err error svcToNodePorts, tailscaledPort, err = r.ensureNodePortServiceCreated(ctx, pg, proxyClass) if err != nil { - var allocatePortErr *allocatePortsErr - if errors.As(err, &allocatePortErr) { + if _, ok := errors.AsType[*allocatePortsErr](err); ok { reason := reasonProxyGroupCreationFailed msg := fmt.Sprintf("error provisioning NodePort Services for static endpoints: %v", err) r.recorder.Event(pg, corev1.EventTypeWarning, reason, msg) @@ -319,10 +317,9 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClie } } - staticEndpoints, err := r.ensureConfigSecretsCreated(ctx, tailscaleClient, loginUrl, pg, proxyClass, svcToNodePorts) + staticEndpoints, err := r.ensureConfigSecretsCreated(ctx, tsClient, pg, proxyClass, svcToNodePorts) if err != nil { - var selectorErr *FindStaticEndpointErr - if errors.As(err, &selectorErr) { + if _, ok := errors.AsType[*FindStaticEndpointErr](err); ok { reason := reasonProxyGroupCreationFailed msg := fmt.Sprintf("error provisioning config Secrets: %v", err) r.recorder.Event(pg, corev1.EventTypeWarning, reason, msg) @@ -431,7 +428,7 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClie return r.notReadyErrf(pg, logger, "error reconciling metrics resources: %w", err) } - if err := r.cleanupDanglingResources(ctx, tailscaleClient, pg, proxyClass); err != nil { + if err := r.cleanupDanglingResources(ctx, tsClient, pg, proxyClass); err != nil { return r.notReadyErrf(pg, logger, "error cleaning up dangling resources: %w", err) } @@ -623,12 +620,12 @@ func (r *ProxyGroupReconciler) ensureNodePortServiceCreated(ctx context.Context, } } - return svcToNodePorts, ptr.To(tailscaledPort), nil + return svcToNodePorts, new(tailscaledPort), nil } // cleanupDanglingResources ensures we don't leak config secrets, state secrets, and // tailnet devices when the number of replicas specified is reduced. -func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) error { +func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) error { logger := r.logger(pg.Name) metadata, err := getNodeMetadata(ctx, pg, r.Client, r.tsNamespace) if err != nil { @@ -642,7 +639,7 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tai // Dangling resource, delete the config + state Secrets, as well as // deleting the device from the tailnet. - if err := r.ensureDeviceDeleted(ctx, tailscaleClient, m.tsID, logger); err != nil { + if err := r.ensureDeviceDeleted(ctx, tsClient, m.tsID, logger); err != nil { return err } if err := r.Delete(ctx, m.stateSecret); err != nil && !apierrors.IsNotFound(err) { @@ -685,7 +682,7 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tai // maybeCleanup just deletes the device from the tailnet. All the kubernetes // resources linked to a ProxyGroup will get cleaned up via owner references // (which we can use because they are all in the same namespace). -func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup) (bool, error) { +func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup) (bool, error) { logger := r.logger(pg.Name) metadata, err := getNodeMetadata(ctx, pg, r.Client, r.tsNamespace) @@ -694,7 +691,7 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient } for _, m := range metadata { - if err := r.ensureDeviceDeleted(ctx, tailscaleClient, m.tsID, logger); err != nil { + if err := r.ensureDeviceDeleted(ctx, tsClient, m.tsID, logger); err != nil { return false, err } } @@ -715,26 +712,23 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient return true, nil } -func (r *ProxyGroupReconciler) ensureDeviceDeleted(ctx context.Context, tailscaleClient tsClient, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error { +func (r *ProxyGroupReconciler) ensureDeviceDeleted(ctx context.Context, tsClient tsclient.Client, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error { logger.Debugf("deleting device %s from control", string(id)) - if err := tailscaleClient.DeleteDevice(ctx, string(id)); err != nil { - errResp := &tailscale.ErrResponse{} - if ok := errors.As(err, errResp); ok && errResp.Status == http.StatusNotFound { - logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id)) - } else { - return fmt.Errorf("error deleting device: %w", err) - } - } else { - logger.Debugf("device %s deleted from control", string(id)) + err := tsClient.Devices().Delete(ctx, string(id)) + switch { + case tailscale.IsNotFound(err): + logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id)) + case err != nil: + return fmt.Errorf("error deleting device: %w", err) } + logger.Debugf("device %s deleted from control", string(id)) return nil } func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( ctx context.Context, - tailscaleClient tsClient, - loginUrl string, + tsClient tsclient.Client, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass, svcToNodePorts map[string]uint16, @@ -760,7 +754,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( return nil, err } - authKey, err := r.getAuthKey(ctx, tailscaleClient, pg, existingCfgSecret, i, logger) + authKey, err := r.getAuthKey(ctx, tsClient, pg, existingCfgSecret, i, logger) if err != nil { return nil, err } @@ -809,9 +803,9 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( Version: "v1alpha1", ConfigV1Alpha1: &conf.ConfigV1Alpha1{ AuthKey: authKey, - State: ptr.To(fmt.Sprintf("kube:%s", pgPodName(pg.Name, i))), - App: ptr.To(kubetypes.AppProxyGroupKubeAPIServer), - LogLevel: ptr.To(logger.Level().String()), + State: new(fmt.Sprintf("kube:%s", pgPodName(pg.Name, i))), + App: new(kubetypes.AppProxyGroupKubeAPIServer), + LogLevel: new(logger.Level().String()), // Reloadable fields. Hostname: &hostname, @@ -822,7 +816,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( // as containerboot does for ingress-pg-reconciler. IssueCerts: opt.NewBool(i == 0), }, - LocalPort: ptr.To(uint16(9002)), + LocalPort: new(uint16(9002)), HealthCheckEnabled: opt.NewBool(true), }, } @@ -842,8 +836,8 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( } } - if loginUrl != "" { - cfg.ServerURL = new(loginUrl) + if tsClient.LoginURL() != "" { + cfg.ServerURL = new(tsClient.LoginURL()) } if proxyClass != nil && proxyClass.Spec.TailscaleConfig != nil { @@ -871,7 +865,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( return nil, err } - configs, err := pgTailscaledConfig(pg, loginUrl, proxyClass, i, authKey, endpoints[nodePortSvcName], existingAdvertiseServices) + configs, err := pgTailscaledConfig(pg, tsClient.LoginURL(), proxyClass, i, authKey, endpoints[nodePortSvcName], existingAdvertiseServices) if err != nil { return nil, fmt.Errorf("error creating tailscaled config: %w", err) } @@ -908,7 +902,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated( // A new key is created if the config Secret doesn't exist yet, or if the // proxy has requested a reissue via its state Secret. An existing key is // retained while the device hasn't authed or a reissue is in progress. -func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, existingCfgSecret *corev1.Secret, ordinal int32, logger *zap.SugaredLogger) (*string, error) { +func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, existingCfgSecret *corev1.Secret, ordinal int32, logger *zap.SugaredLogger) (*string, error) { // Get state Secret to check if it's already authed or has requested // a fresh auth key. stateSecret := &corev1.Secret{ @@ -935,7 +929,7 @@ func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient t if !createAuthKey { var err error - createAuthKey, err = r.shouldReissueAuthKey(ctx, tailscaleClient, pg, stateSecret, cfgAuthKey) + createAuthKey, err = r.shouldReissueAuthKey(ctx, tsClient, pg, stateSecret, cfgAuthKey) if err != nil { return nil, err } @@ -949,7 +943,7 @@ func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient t if len(tags) == 0 { tags = r.defaultTags } - key, err := newAuthKey(ctx, tailscaleClient, tags) + key, err := newAuthKey(ctx, tsClient, tags) if err != nil { return nil, err } @@ -969,7 +963,7 @@ func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient t // shouldReissueAuthKey returns true if the proxy needs a new auth key. It // tracks in-flight reissues via authKeyReissuing to avoid duplicate API calls // across reconciles. -func (r *ProxyGroupReconciler) shouldReissueAuthKey(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, stateSecret *corev1.Secret, cfgAuthKey *string) (shouldReissue bool, err error) { +func (r *ProxyGroupReconciler) shouldReissueAuthKey(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, stateSecret *corev1.Secret, cfgAuthKey *string) (shouldReissue bool, err error) { r.mu.Lock() reissuing := r.authKeyReissuing[stateSecret.Name] r.mu.Unlock() @@ -1021,7 +1015,7 @@ func (r *ProxyGroupReconciler) shouldReissueAuthKey(ctx context.Context, tailsca r.log.Infof("Proxy failing to auth; attempting cleanup and new key") if tsID := stateSecret.Data[kubetypes.KeyDeviceID]; len(tsID) > 0 { id := tailcfg.StableNodeID(tsID) - if err := r.ensureDeviceDeleted(ctx, tailscaleClient, id, r.log); err != nil { + if err = r.ensureDeviceDeleted(ctx, tsClient, id, r.log); err != nil { return false, err } } @@ -1119,7 +1113,7 @@ func getStaticEndpointAddress(a *corev1.NodeAddress, port uint16) *netip.AddrPor return nil } - return ptr.To(netip.AddrPortFrom(addr, port)) + return new(netip.AddrPortFrom(addr, port)) } // ensureStateAddedForProxyGroup ensures the gauge metric for the ProxyGroup resource is updated when the ProxyGroup @@ -1166,6 +1160,9 @@ func (r *ProxyGroupReconciler) ensureStateRemovedForProxyGroup(pg *tsapi.ProxyGr gaugeIngressProxyGroupResources.Set(int64(r.ingressProxyGroups.Len())) gaugeAPIServerProxyGroupResources.Set(int64(r.apiServerProxyGroups.Len())) delete(r.authKeyRateLimits, pg.Name) + for i := range pgReplicas(pg) { + delete(r.authKeyReissuing, pgStateSecretName(pg.Name, i)) + } } func pgTailscaledConfig(pg *tsapi.ProxyGroup, loginServer string, pc *tsapi.ProxyClass, idx int32, authKey *string, staticEndpoints []netip.AddrPort, oldAdvertiseServices []string) (tailscaledConfigs, error) { @@ -1174,7 +1171,7 @@ func pgTailscaledConfig(pg *tsapi.ProxyGroup, loginServer string, pc *tsapi.Prox AcceptDNS: "false", AcceptRoutes: "false", // AcceptRoutes defaults to true Locked: "false", - Hostname: ptr.To(pgHostname(pg, idx)), + Hostname: new(pgHostname(pg, idx)), AdvertiseServices: oldAdvertiseServices, AuthKey: authKey, } @@ -1309,29 +1306,6 @@ func (r *ProxyGroupReconciler) getRunningProxies(ctx context.Context, pg *tsapi. return devices, nil } -// getClientAndLoginURL returns the appropriate Tailscale client and resolved login URL -// for the given tailnet name. If no tailnet is specified, returns the default client -// and login server. Applies fallback to the operator's login server if the tailnet -// doesn't specify a custom login URL. -func (r *ProxyGroupReconciler) getClientAndLoginURL(ctx context.Context, tailnetName string) (tsClient, - string, error) { - if tailnetName == "" { - return r.tsClient, r.loginServer, nil - } - - tc, loginUrl, err := clientForTailnet(ctx, r.Client, r.tsNamespace, tailnetName) - if err != nil { - return nil, "", err - } - - // Apply fallback if tailnet doesn't specify custom login URL - if loginUrl == "" { - loginUrl = r.loginServer - } - - return tc, loginUrl, nil -} - type nodeMetadata struct { ordinal int32 stateSecret *corev1.Secret diff --git a/cmd/k8s-operator/proxygroup_test.go b/cmd/k8s-operator/proxygroup_test.go index b27f5e67aa043..95eb7bd5f4723 100644 --- a/cmd/k8s-operator/proxygroup_test.go +++ b/cmd/k8s-operator/proxygroup_test.go @@ -30,21 +30,21 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "tailscale.com/client/tailscale" + "tailscale.com/client/tailscale/v2" + "tailscale.com/ipn" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/k8s-proxy/conf" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" "tailscale.com/tstest" "tailscale.com/types/opt" - "tailscale.com/types/ptr" ) const ( testProxyImage = "tailscale/tailscale:test" - initialCfgHash = "6632726be70cf224049580deb4d317bba065915b5fd415461d60ed621c91b196" ) var ( @@ -52,7 +52,7 @@ var ( "some-annotation": "from-the-proxy-class", } - defaultReplicas = ptr.To(int32(2)) + defaultReplicas = new(int32(2)) defaultStaticEndpointConfig = &tsapi.StaticEndpointsConfig{ NodePort: &tsapi.NodePortConfig{ Ports: []tsapi.PortRange{ @@ -110,7 +110,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { }, }, }, - replicas: ptr.To(int32(4)), + replicas: new(int32(4)), nodes: []testNode{ { name: "foobar", @@ -153,7 +153,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { }, }, }, - replicas: ptr.To(int32(4)), + replicas: new(int32(4)), nodes: []testNode{ { name: "foobar", @@ -195,7 +195,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { }, }, }, - replicas: ptr.To(int32(4)), + replicas: new(int32(4)), nodes: []testNode{ { name: "foobar", @@ -237,7 +237,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { }, }, }, - replicas: ptr.To(int32(3)), + replicas: new(int32(3)), nodes: []testNode{ {name: "node1", addresses: []testNodeAddr{{ip: "10.0.0.1", addrType: corev1.NodeExternalIP}}, labels: map[string]string{"foo/bar": "baz"}}, {name: "node2", addresses: []testNodeAddr{{ip: "10.0.0.2", addrType: corev1.NodeExternalIP}}, labels: map[string]string{"foo/bar": "baz"}}, @@ -297,7 +297,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { }, }, }, - replicas: ptr.To(int32(4)), + replicas: new(int32(4)), nodes: []testNode{ { name: "foobar", @@ -642,7 +642,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { defaultProxyClass: "default-pc", Client: fc, - tsClient: tsClient, + clients: tsclient.NewProvider(tsClient), recorder: fr, clock: cl, authKeyRateLimits: make(map[string]*rate.Limiter), @@ -650,7 +650,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { } for i, r := range tt.reconciles { - createdNodes := []corev1.Node{} + var createdNodes []corev1.Node t.Run(tt.name, func(t *testing.T) { for _, n := range r.nodes { no := &corev1.Node{ @@ -787,7 +787,7 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { defaultProxyClass: "default-pc", Client: fc, - tsClient: tsClient, + clients: tsclient.NewProvider(tsClient), recorder: fr, log: zl.Sugar().With("TestName", tt.name).With("Reconcile", "cleanup"), clock: cl, @@ -850,7 +850,7 @@ func TestProxyGroup(t *testing.T) { defaultProxyClass: "default-pc", Client: fc, - tsClient: tsClient, + clients: tsclient.NewProvider(tsClient), recorder: fr, log: zl.Sugar(), clock: cl, @@ -909,17 +909,13 @@ func TestProxyGroup(t *testing.T) { t.Fatalf("expected %d egress ProxyGroups, got %d", expected, reconciler.egressProxyGroups.Len()) } expectProxyGroupResources(t, fc, pg, true, pc) - keyReq := tailscale.KeyCapabilities{ - Devices: tailscale.KeyDeviceCapabilities{ - Create: tailscale.KeyDeviceCreateCapabilities{ - Reusable: false, - Ephemeral: false, - Preauthorized: true, - Tags: []string{"tag:test-tag"}, - }, - }, - } - if diff := cmp.Diff(tsClient.KeyRequests(), []tailscale.KeyCapabilities{keyReq, keyReq}); diff != "" { + var keyReq tailscale.KeyCapabilities + keyReq.Devices.Create.Reusable = false + keyReq.Devices.Create.Ephemeral = false + keyReq.Devices.Create.Preauthorized = true + keyReq.Devices.Create.Tags = []string{"tag:test-tag"} + + if diff := cmp.Diff(tsClient.keyRequests, []tailscale.KeyCapabilities{keyReq, keyReq}); diff != "" { t.Fatalf("unexpected secrets (-got +want):\n%s", diff) } }) @@ -952,7 +948,7 @@ func TestProxyGroup(t *testing.T) { }) t.Run("scale_up_to_3", func(t *testing.T) { - pg.Spec.Replicas = ptr.To[int32](3) + pg.Spec.Replicas = new(int32(3)) mustUpdate(t, fc, "", pg.Name, func(p *tsapi.ProxyGroup) { p.Spec = pg.Spec }) @@ -975,7 +971,7 @@ func TestProxyGroup(t *testing.T) { }) t.Run("scale_down_to_1", func(t *testing.T) { - pg.Spec.Replicas = ptr.To[int32](1) + pg.Spec.Replicas = new(int32(1)) mustUpdate(t, fc, "", pg.Name, func(p *tsapi.ProxyGroup) { p.Spec = pg.Spec }) @@ -1060,7 +1056,7 @@ func TestProxyGroupTypes(t *testing.T) { tsProxyImage: testProxyImage, Client: fc, log: zl.Sugar(), - tsClient: &fakeTSClient{}, + clients: tsclient.NewProvider(&fakeTSClient{}), clock: tstest.NewClock(tstest.ClockOpts{}), authKeyRateLimits: make(map[string]*rate.Limiter), authKeyReissuing: make(map[string]bool), @@ -1074,7 +1070,7 @@ func TestProxyGroupTypes(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeEgress, - Replicas: ptr.To[int32](0), + Replicas: new(int32(0)), }, } mustCreate(t, fc, pg) @@ -1149,7 +1145,7 @@ func TestProxyGroupTypes(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeEgress, - Replicas: ptr.To[int32](0), + Replicas: new(int32(0)), ProxyClass: "test", }, } @@ -1186,7 +1182,7 @@ func TestProxyGroupTypes(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeIngress, - Replicas: ptr.To[int32](0), + Replicas: new(int32(0)), }, } if err := fc.Create(t.Context(), pg); err != nil { @@ -1240,9 +1236,9 @@ func TestProxyGroupTypes(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeKubernetesAPIServer, - Replicas: ptr.To[int32](2), + Replicas: new(int32(2)), KubeAPIServer: &tsapi.KubeAPIServerConfig{ - Mode: ptr.To(tsapi.APIServerProxyModeNoAuth), + Mode: new(tsapi.APIServerProxyModeNoAuth), }, }, } @@ -1280,9 +1276,9 @@ func TestKubeAPIServerStatusConditionFlow(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeKubernetesAPIServer, - Replicas: ptr.To[int32](1), + Replicas: new(int32(1)), KubeAPIServer: &tsapi.KubeAPIServerConfig{ - Mode: ptr.To(tsapi.APIServerProxyModeNoAuth), + Mode: new(tsapi.APIServerProxyModeNoAuth), }, }, } @@ -1302,7 +1298,7 @@ func TestKubeAPIServerStatusConditionFlow(t *testing.T) { tsProxyImage: testProxyImage, Client: fc, log: zap.Must(zap.NewDevelopment()).Sugar(), - tsClient: &fakeTSClient{}, + clients: tsclient.NewProvider(&fakeTSClient{}), clock: tstest.NewClock(tstest.ClockOpts{}), authKeyRateLimits: make(map[string]*rate.Limiter), authKeyReissuing: make(map[string]bool), @@ -1357,7 +1353,7 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) { tsProxyImage: testProxyImage, Client: fc, log: zap.Must(zap.NewDevelopment()).Sugar(), - tsClient: &fakeTSClient{}, + clients: tsclient.NewProvider(&fakeTSClient{}), clock: tstest.NewClock(tstest.ClockOpts{}), authKeyRateLimits: make(map[string]*rate.Limiter), authKeyReissuing: make(map[string]bool), @@ -1370,9 +1366,9 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeKubernetesAPIServer, - Replicas: ptr.To[int32](1), + Replicas: new(int32(1)), KubeAPIServer: &tsapi.KubeAPIServerConfig{ - Mode: ptr.To(tsapi.APIServerProxyModeNoAuth), // Avoid needing to pre-create the static ServiceAccount. + Mode: new(tsapi.APIServerProxyModeNoAuth), // Avoid needing to pre-create the static ServiceAccount. }, }, } @@ -1389,13 +1385,13 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) { App: new(kubetypes.AppProxyGroupKubeAPIServer), LogLevel: new("debug"), - Hostname: ptr.To("test-k8s-apiserver-0"), + Hostname: new("test-k8s-apiserver-0"), APIServerProxy: &conf.APIServerProxyConfig{ Enabled: opt.NewBool(true), - Mode: ptr.To(kubetypes.APIServerProxyModeNoAuth), + Mode: new(kubetypes.APIServerProxyModeNoAuth), IssueCerts: opt.NewBool(true), }, - LocalPort: ptr.To(uint16(9002)), + LocalPort: new(uint16(9002)), HealthCheckEnabled: opt.NewBool(true), }, } @@ -1419,7 +1415,7 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) { // Now simulate the kube-apiserver services reconciler updating config, // then check the proxygroup reconciler doesn't overwrite it. - cfg.APIServerProxy.ServiceName = ptr.To(tailcfg.ServiceName("svc:some-svc-name")) + cfg.APIServerProxy.ServiceName = new(tailcfg.ServiceName("svc:some-svc-name")) cfg.AdvertiseServices = []string{"svc:should-not-be-overwritten"} cfgB, err = json.Marshal(cfg) if err != nil { @@ -1444,7 +1440,7 @@ func TestIngressAdvertiseServicesConfigPreserved(t *testing.T) { tsProxyImage: testProxyImage, Client: fc, log: zap.Must(zap.NewDevelopment()).Sugar(), - tsClient: &fakeTSClient{}, + clients: tsclient.NewProvider(&fakeTSClient{}), clock: tstest.NewClock(tstest.ClockOpts{}), authKeyRateLimits: make(map[string]*rate.Limiter), authKeyReissuing: make(map[string]bool), @@ -1477,7 +1473,7 @@ func TestIngressAdvertiseServicesConfigPreserved(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tsapi.ProxyGroupTypeIngress, - Replicas: ptr.To[int32](1), + Replicas: new(int32(1)), }, }) expectReconciled(t, reconciler, "", pgName) @@ -1491,7 +1487,7 @@ func TestIngressAdvertiseServicesConfigPreserved(t *testing.T) { AcceptDNS: "false", AcceptRoutes: "false", Locked: "false", - Hostname: ptr.To(fmt.Sprintf("%s-%d", pgName, 0)), + Hostname: new(fmt.Sprintf("%s-%d", pgName, 0)), }) if err != nil { t.Fatal(err) @@ -1627,7 +1623,7 @@ func TestValidateProxyGroup(t *testing.T) { } if tc.noauth { pg.Spec.KubeAPIServer = &tsapi.KubeAPIServerConfig{ - Mode: ptr.To(tsapi.APIServerProxyModeNoAuth), + Mode: new(tsapi.APIServerProxyModeNoAuth), } } @@ -1714,7 +1710,7 @@ func TestProxyGroupGetAuthKey(t *testing.T) { tsFirewallMode: "auto", Client: fc, - tsClient: tsClient, + clients: tsclient.NewProvider(tsClient), recorder: fr, log: zl.Sugar(), clock: cl, @@ -2036,10 +2032,10 @@ func addNodeIDToStateSecrets(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyG currentProfileKey: []byte(key), key: bytes, kubetypes.KeyDeviceIPs: []byte(`["1.2.3.4", "::1"]`), - kubetypes.KeyDeviceFQDN: []byte(fmt.Sprintf("hostname-nodeid-%d.tails-scales.ts.net", i)), + kubetypes.KeyDeviceFQDN: fmt.Appendf(nil, "hostname-nodeid-%d.tails-scales.ts.net", i), // TODO(tomhjp): We have two different mechanisms to retrieve device IDs. // Consolidate on this one. - kubetypes.KeyDeviceID: []byte(fmt.Sprintf("nodeid-%d", i)), + kubetypes.KeyDeviceID: fmt.Appendf(nil, "nodeid-%d", i), kubetypes.KeyPodUID: []byte(podUID), } }) @@ -2084,7 +2080,7 @@ func TestProxyGroupLetsEncryptStaging(t *testing.T) { }, Spec: tsapi.ProxyGroupSpec{ Type: tt.pgType, - Replicas: ptr.To[int32](1), + Replicas: new(int32(1)), ProxyClass: tt.proxyClassPerResource, }, } @@ -2110,7 +2106,7 @@ func TestProxyGroupLetsEncryptStaging(t *testing.T) { defaultTags: []string{"tag:test"}, defaultProxyClass: tt.defaultProxyClass, Client: fc, - tsClient: &fakeTSClient{}, + clients: tsclient.NewProvider(&fakeTSClient{}), log: zl.Sugar(), clock: cl, authKeyRateLimits: make(map[string]*rate.Limiter), diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go index ea38ddece2749..66c2e917f376a 100644 --- a/cmd/k8s-operator/sts.go +++ b/cmd/k8s-operator/sts.go @@ -11,7 +11,7 @@ import ( "encoding/json" "errors" "fmt" - "net/http" + "maps" "os" "path" "slices" @@ -29,16 +29,16 @@ import ( "k8s.io/apiserver/pkg/storage/names" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/yaml" + "tailscale.com/client/tailscale/v2" - "tailscale.com/client/tailscale" "tailscale.com/ipn" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/kubetypes" "tailscale.com/net/netutil" "tailscale.com/tailcfg" "tailscale.com/types/opt" - "tailscale.com/types/ptr" "tailscale.com/util/mak" ) @@ -174,7 +174,7 @@ type tsnetServer interface { type tailscaleSTSReconciler struct { client.Client tsnetServer tsnetServer - tsClient tsClient + clients ClientProvider defaultTags []string operatorNamespace string proxyImage string @@ -183,9 +183,9 @@ type tailscaleSTSReconciler struct { loginServer string } -func (sts tailscaleSTSReconciler) validate() error { - if sts.tsFirewallMode != "" && !isValidFirewallMode(sts.tsFirewallMode) { - return fmt.Errorf("invalid proxy firewall mode %s, valid modes are iptables, nftables or unset", sts.tsFirewallMode) +func (r *tailscaleSTSReconciler) validate() error { + if r.tsFirewallMode != "" && !isValidFirewallMode(r.tsFirewallMode) { + return fmt.Errorf("invalid proxy firewall mode %s, valid modes are iptables, nftables or unset", r.tsFirewallMode) } return nil } @@ -197,22 +197,17 @@ func IsHTTPSEnabledOnTailnet(tsnetServer tsnetServer) bool { // Provision ensures that the StatefulSet for the given service is running and // up to date. -func (a *tailscaleSTSReconciler) Provision(ctx context.Context, logger *zap.SugaredLogger, sts *tailscaleSTSConfig) (*corev1.Service, error) { - tailscaleClient, loginUrl, err := a.getClientAndLoginURL(ctx, sts.Tailnet) - if err != nil { - return nil, fmt.Errorf("failed to get tailscale client and loginUrl: %w", err) - } - +func (r *tailscaleSTSReconciler) Provision(ctx context.Context, logger *zap.SugaredLogger, sts *tailscaleSTSConfig) (*corev1.Service, error) { // Do full reconcile. // TODO (don't create Service for the Connector) - hsvc, err := a.reconcileHeadlessService(ctx, logger, sts) + hsvc, err := r.reconcileHeadlessService(ctx, logger, sts) if err != nil { return nil, fmt.Errorf("failed to reconcile headless service: %w", err) } proxyClass := new(tsapi.ProxyClass) if sts.ProxyClassName != "" { - if err := a.Get(ctx, types.NamespacedName{Name: sts.ProxyClassName}, proxyClass); err != nil { + if err := r.Get(ctx, types.NamespacedName{Name: sts.ProxyClassName}, proxyClass); err != nil { return nil, fmt.Errorf("failed to get ProxyClass: %w", err) } if !tsoperator.ProxyClassIsReady(proxyClass) { @@ -222,12 +217,17 @@ func (a *tailscaleSTSReconciler) Provision(ctx context.Context, logger *zap.Suga } sts.ProxyClass = proxyClass - secretNames, err := a.provisionSecrets(ctx, tailscaleClient, loginUrl, sts, hsvc, logger) + tsClient, err := r.clients.For(sts.Tailnet) + if err != nil { + return nil, fmt.Errorf("failed to get tailscale client: %w", err) + } + + secretNames, err := r.provisionSecrets(ctx, tsClient, sts, hsvc, logger) if err != nil { return nil, fmt.Errorf("failed to create or get API key secret: %w", err) } - _, err = a.reconcileSTS(ctx, logger, sts, hsvc, secretNames) + _, err = r.reconcileSTS(ctx, logger, sts, hsvc, secretNames) if err != nil { return nil, fmt.Errorf("failed to reconcile statefulset: %w", err) } @@ -237,48 +237,20 @@ func (a *tailscaleSTSReconciler) Provision(ctx context.Context, logger *zap.Suga proxyLabels: hsvc.Labels, proxyType: sts.proxyType, } - if err = reconcileMetricsResources(ctx, logger, mo, sts.ProxyClass, a.Client); err != nil { + if err = reconcileMetricsResources(ctx, logger, mo, sts.ProxyClass, r.Client); err != nil { return nil, fmt.Errorf("failed to ensure metrics resources: %w", err) } return hsvc, nil } -// getClientAndLoginURL returns the appropriate Tailscale client and resolved login URL -// for the given tailnet name. If no tailnet is specified, returns the default client -// and login server. Applies fallback to the operator's login server if the tailnet -// doesn't specify a custom login URL. -func (a *tailscaleSTSReconciler) getClientAndLoginURL(ctx context.Context, tailnetName string) (tsClient, - string, error) { - if tailnetName == "" { - return a.tsClient, a.loginServer, nil - } - - tc, loginUrl, err := clientForTailnet(ctx, a.Client, a.operatorNamespace, tailnetName) - if err != nil { - return nil, "", err - } - - // Apply fallback if tailnet doesn't specify custom login URL - if loginUrl == "" { - loginUrl = a.loginServer - } - - return tc, loginUrl, nil -} - // Cleanup removes all resources associated that were created by Provision with // the given labels. It returns true when all resources have been removed, // otherwise it returns false and the caller should retry later. -func (a *tailscaleSTSReconciler) Cleanup(ctx context.Context, tailnet string, logger *zap.SugaredLogger, labels map[string]string, typ string) (done bool, _ error) { - tailscaleClient := a.tsClient - if tailnet != "" { - tc, _, err := clientForTailnet(ctx, a.Client, a.operatorNamespace, tailnet) - if err != nil { - logger.Errorf("failed to get tailscale client: %v", err) - return false, nil - } - - tailscaleClient = tc +func (r *tailscaleSTSReconciler) Cleanup(ctx context.Context, tailnet string, logger *zap.SugaredLogger, labels map[string]string, typ string) (done bool, _ error) { + tsClient, err := r.clients.For(tailnet) + if err != nil { + logger.Errorf("failed to get tailscale client: %v", err) + return false, nil } // Need to delete the StatefulSet first, and delete it with foreground @@ -287,7 +259,7 @@ func (a *tailscaleSTSReconciler) Cleanup(ctx context.Context, tailnet string, lo // assuming k8s ordering semantics don't mess with us, that should avoid // tailscale device deletion races where we fail to notice a device that // should be removed. - sts, err := getSingleObject[appsv1.StatefulSet](ctx, a.Client, a.operatorNamespace, labels) + sts, err := getSingleObject[appsv1.StatefulSet](ctx, r.Client, r.operatorNamespace, labels) if err != nil { return false, fmt.Errorf("getting statefulset: %w", err) } @@ -301,12 +273,12 @@ func (a *tailscaleSTSReconciler) Cleanup(ctx context.Context, tailnet string, lo } options := []client.DeleteAllOfOption{ - client.InNamespace(a.operatorNamespace), + client.InNamespace(r.operatorNamespace), client.MatchingLabels(labels), client.PropagationPolicy(metav1.DeletePropagationForeground), } - if err = a.DeleteAllOf(ctx, &appsv1.StatefulSet{}, options...); err != nil { + if err = r.DeleteAllOf(ctx, &appsv1.StatefulSet{}, options...); err != nil { return false, fmt.Errorf("deleting statefulset: %w", err) } @@ -314,7 +286,7 @@ func (a *tailscaleSTSReconciler) Cleanup(ctx context.Context, tailnet string, lo return false, nil } - devices, err := a.DeviceInfo(ctx, labels, logger) + devices, err := r.DeviceInfo(ctx, labels, logger) if err != nil { return false, fmt.Errorf("getting device info: %w", err) } @@ -322,34 +294,36 @@ func (a *tailscaleSTSReconciler) Cleanup(ctx context.Context, tailnet string, lo for _, dev := range devices { if dev.id != "" { logger.Debugf("deleting device %s from control", string(dev.id)) - if err = tailscaleClient.DeleteDevice(ctx, string(dev.id)); err != nil { - errResp := &tailscale.ErrResponse{} - if ok := errors.As(err, errResp); ok && errResp.Status == http.StatusNotFound { - logger.Debugf("device %s not found, likely because it has already been deleted from control", string(dev.id)) - } else { - return false, fmt.Errorf("deleting device: %w", err) - } - } else { - logger.Debugf("device %s deleted from control", string(dev.id)) + err = tsClient.Devices().Delete(ctx, string(dev.id)) + switch { + case tailscale.IsNotFound(err): + logger.Debugf("device %s not found, likely because it has already been deleted from control", string(dev.id)) + case err != nil: + return false, fmt.Errorf("deleting device: %w", err) } + + logger.Debugf("device %s deleted from control", string(dev.id)) } } - types := []client.Object{ + resourceTypes := []client.Object{ &corev1.Service{}, &corev1.Secret{}, } - for _, typ := range types { - if err := a.DeleteAllOf(ctx, typ, client.InNamespace(a.operatorNamespace), client.MatchingLabels(labels)); err != nil { + + for _, resourceType := range resourceTypes { + if err = r.DeleteAllOf(ctx, resourceType, client.InNamespace(r.operatorNamespace), client.MatchingLabels(labels)); err != nil { return false, err } } + mo := &metricsOpts{ proxyLabels: labels, - tsNamespace: a.operatorNamespace, + tsNamespace: r.operatorNamespace, proxyType: typ, } - if err = maybeCleanupMetricsResources(ctx, mo, a.Client); err != nil { + + if err = maybeCleanupMetricsResources(ctx, mo, r.Client); err != nil { return false, fmt.Errorf("error cleaning up metrics resources: %w", err) } @@ -383,12 +357,12 @@ func statefulSetNameBase(parent string) string { } } -func (a *tailscaleSTSReconciler) reconcileHeadlessService(ctx context.Context, logger *zap.SugaredLogger, sts *tailscaleSTSConfig) (*corev1.Service, error) { +func (r *tailscaleSTSReconciler) reconcileHeadlessService(ctx context.Context, logger *zap.SugaredLogger, sts *tailscaleSTSConfig) (*corev1.Service, error) { nameBase := statefulSetNameBase(sts.ParentResourceName) hsvc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ GenerateName: nameBase, - Namespace: a.operatorNamespace, + Namespace: r.operatorNamespace, Labels: sts.ChildResourceLabels, }, Spec: corev1.ServiceSpec{ @@ -396,14 +370,14 @@ func (a *tailscaleSTSReconciler) reconcileHeadlessService(ctx context.Context, l Selector: map[string]string{ "app": sts.ParentResourceUID, }, - IPFamilyPolicy: ptr.To(corev1.IPFamilyPolicyPreferDualStack), + IPFamilyPolicy: new(corev1.IPFamilyPolicyPreferDualStack), }, } logger.Debugf("reconciling headless service for StatefulSet") - return createOrUpdate(ctx, a.Client, a.operatorNamespace, hsvc, func(svc *corev1.Service) { svc.Spec = hsvc.Spec }) + return createOrUpdate(ctx, r.Client, r.operatorNamespace, hsvc, func(svc *corev1.Service) { svc.Spec = hsvc.Spec }) } -func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscaleClient tsClient, loginUrl string, stsC *tailscaleSTSConfig, hsvc *corev1.Service, logger *zap.SugaredLogger) ([]string, error) { +func (r *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tsClient tsclient.Client, stsC *tailscaleSTSConfig, hsvc *corev1.Service, logger *zap.SugaredLogger) ([]string, error) { secretNames := make([]string, stsC.Replicas) // Start by ensuring we have Secrets for the desired number of replicas. This will handle both creating and scaling @@ -412,7 +386,7 @@ func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscale secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("%s-%d", hsvc.Name, i), - Namespace: a.operatorNamespace, + Namespace: r.operatorNamespace, Labels: stsC.ChildResourceLabels, }, } @@ -427,7 +401,7 @@ func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscale secretNames[i] = secret.Name var orig *corev1.Secret // unmodified copy of secret - if err := a.Get(ctx, client.ObjectKeyFromObject(secret), secret); err == nil { + if err := r.Get(ctx, client.ObjectKeyFromObject(secret), secret); err == nil { logger.Debugf("secret %s/%s already exists", secret.GetNamespace(), secret.GetName()) orig = secret.DeepCopy() } else if !apierrors.IsNotFound(err) { @@ -438,21 +412,23 @@ func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscale authKey string err error ) + if orig == nil { // Create API Key secret which is going to be used by the statefulset // to authenticate with Tailscale. logger.Debugf("creating authkey for new tailscale proxy") tags := stsC.Tags if len(tags) == 0 { - tags = a.defaultTags + tags = r.defaultTags } - authKey, err = newAuthKey(ctx, tailscaleClient, tags) + + authKey, err = newAuthKey(ctx, tsClient, tags) if err != nil { return nil, err } } - configs, err := tailscaledConfig(stsC, loginUrl, authKey, orig, hostname) + configs, err := tailscaledConfig(stsC, tsClient.LoginURL(), authKey, orig, hostname) if err != nil { return nil, fmt.Errorf("error creating tailscaled config: %w", err) } @@ -484,12 +460,12 @@ func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscale if orig != nil && !apiequality.Semantic.DeepEqual(latest, orig) { logger.With("config", sanitizeConfig(latestConfig)).Debugf("patching the existing proxy Secret") - if err = a.Patch(ctx, secret, client.MergeFrom(orig)); err != nil { + if err = r.Patch(ctx, secret, client.MergeFrom(orig)); err != nil { return nil, err } } else { logger.With("config", sanitizeConfig(latestConfig)).Debugf("creating a new Secret for the proxy") - if err = a.Create(ctx, secret); err != nil { + if err = r.Create(ctx, secret); err != nil { return nil, err } } @@ -498,7 +474,7 @@ func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscale // Next, we check if we have additional secrets and remove them and their associated device. This happens when we // scale an StatefulSet down. var secrets corev1.SecretList - if err := a.List(ctx, &secrets, client.InNamespace(a.operatorNamespace), client.MatchingLabels(stsC.ChildResourceLabels)); err != nil { + if err := r.List(ctx, &secrets, client.InNamespace(r.operatorNamespace), client.MatchingLabels(stsC.ChildResourceLabels)); err != nil { return nil, err } @@ -518,19 +494,14 @@ func (a *tailscaleSTSReconciler) provisionSecrets(ctx context.Context, tailscale } if dev != nil && dev.id != "" { - var errResp *tailscale.ErrResponse - - err = tailscaleClient.DeleteDevice(ctx, string(dev.id)) - switch { - case errors.As(err, &errResp) && errResp.Status == http.StatusNotFound: - // This device has possibly already been deleted in the admin console. So we can ignore this - // and move on to removing the secret. - case err != nil: + // If we get a not found error then this device has possibly already been deleted in the admin console. + // So we can ignore this and move on to removing the secret. + if err = tsClient.Devices().Delete(ctx, string(dev.id)); err != nil && !tailscale.IsNotFound(err) { return nil, err } } - if err = a.Delete(ctx, &secret); err != nil { + if err = r.Delete(ctx, &secret); err != nil { return nil, err } } @@ -544,7 +515,7 @@ func sanitizeConfig(c ipn.ConfigVAlpha) ipn.ConfigVAlpha { // Explicitly redact AuthKey because we never want it appearing in logs. Never populate this with the // actual auth key. if c.AuthKey != nil { - c.AuthKey = ptr.To("**redacted**") + c.AuthKey = new("**redacted**") } return c @@ -554,9 +525,9 @@ func sanitizeConfig(c ipn.ConfigVAlpha) ipn.ConfigVAlpha { // It retrieves info from a Kubernetes Secret labeled with the provided labels. Capver is cross-validated against the // Pod to ensure that it is the currently running Pod that set the capver. If the Pod or the Secret does not exist, the // returned capver is -1. Either of device ID, hostname and IPs can be empty string if not found in the Secret. -func (a *tailscaleSTSReconciler) DeviceInfo(ctx context.Context, childLabels map[string]string, logger *zap.SugaredLogger) ([]*device, error) { +func (r *tailscaleSTSReconciler) DeviceInfo(ctx context.Context, childLabels map[string]string, logger *zap.SugaredLogger) ([]*device, error) { var secrets corev1.SecretList - if err := a.List(ctx, &secrets, client.InNamespace(a.operatorNamespace), client.MatchingLabels(childLabels)); err != nil { + if err := r.List(ctx, &secrets, client.InNamespace(r.operatorNamespace), client.MatchingLabels(childLabels)); err != nil { return nil, err } @@ -564,7 +535,7 @@ func (a *tailscaleSTSReconciler) DeviceInfo(ctx context.Context, childLabels map for _, sec := range secrets.Items { podUID := "" pod := new(corev1.Pod) - err := a.Get(ctx, types.NamespacedName{Namespace: sec.Namespace, Name: sec.Name}, pod) + err := r.Get(ctx, types.NamespacedName{Namespace: sec.Namespace, Name: sec.Name}, pod) switch { case apierrors.IsNotFound(err): // If the Pod is not found, we won't have its UID. We can still get the device information but the @@ -637,22 +608,18 @@ func deviceInfo(sec *corev1.Secret, podUID string, log *zap.SugaredLogger) (dev return dev, nil } -func newAuthKey(ctx context.Context, tsClient tsClient, tags []string) (string, error) { - caps := tailscale.KeyCapabilities{ - Devices: tailscale.KeyDeviceCapabilities{ - Create: tailscale.KeyDeviceCreateCapabilities{ - Reusable: false, - Preauthorized: true, - Tags: tags, - }, - }, - } +func newAuthKey(ctx context.Context, client tsclient.Client, tags []string) (string, error) { + var caps tailscale.KeyCapabilities + caps.Devices.Create.Reusable = false + caps.Devices.Create.Preauthorized = true + caps.Devices.Create.Tags = tags - key, _, err := tsClient.CreateKey(ctx, caps) + key, err := client.Keys().CreateAuthKey(ctx, tailscale.CreateKeyRequest{Capabilities: caps}) if err != nil { return "", err } - return key, nil + + return key.Key, nil } //go:embed deploy/manifests/proxy.yaml @@ -661,7 +628,7 @@ var proxyYaml []byte //go:embed deploy/manifests/userspace-proxy.yaml var userspaceProxyYaml []byte -func (a *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.SugaredLogger, sts *tailscaleSTSConfig, headlessSvc *corev1.Service, proxySecrets []string) (*appsv1.StatefulSet, error) { +func (r *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.SugaredLogger, sts *tailscaleSTSConfig, headlessSvc *corev1.Service, proxySecrets []string) (*appsv1.StatefulSet, error) { ss := new(appsv1.StatefulSet) if sts.ServeConfig != nil && sts.ForwardClusterTrafficViaL7IngressProxy != true { // If forwarding cluster traffic via is required we need non-userspace + NET_ADMIN + forwarding if err := yaml.Unmarshal(userspaceProxyYaml, &ss); err != nil { @@ -674,17 +641,17 @@ func (a *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.S for i := range ss.Spec.Template.Spec.InitContainers { c := &ss.Spec.Template.Spec.InitContainers[i] if c.Name == "sysctler" { - c.Image = a.proxyImage + c.Image = r.proxyImage break } } } pod := &ss.Spec.Template container := &pod.Spec.Containers[0] - container.Image = a.proxyImage + container.Image = r.proxyImage ss.ObjectMeta = metav1.ObjectMeta{ Name: headlessSvc.Name, - Namespace: a.operatorNamespace, + Namespace: r.operatorNamespace, } for key, val := range sts.ChildResourceLabels { mak.Set(&ss.ObjectMeta.Labels, key, val) @@ -696,12 +663,11 @@ func (a *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.S }, } mak.Set(&pod.Labels, "app", sts.ParentResourceUID) - for key, val := range sts.ChildResourceLabels { - pod.Labels[key] = val // sync StatefulSet labels to Pod to make it easier for users to select the Pod - } + // sync StatefulSet labels to Pod to make it easier for users to select the Pod + maps.Copy(pod.Labels, sts.ChildResourceLabels) if sts.Replicas > 0 { - ss.Spec.Replicas = ptr.To(sts.Replicas) + ss.Spec.Replicas = new(sts.Replicas) } // Generic containerboot configuration options. @@ -753,13 +719,13 @@ func (a *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.S }) } - if a.tsFirewallMode != "" { + if r.tsFirewallMode != "" { container.Env = append(container.Env, corev1.EnvVar{ Name: "TS_DEBUG_FIREWALL_MODE", - Value: a.tsFirewallMode, + Value: r.tsFirewallMode, }) } - pod.Spec.PriorityClassName = a.proxyPriorityClassName + pod.Spec.PriorityClassName = r.proxyPriorityClassName // Ingress/egress proxy configuration options. if sts.ClusterTargetIP != "" { @@ -834,7 +800,7 @@ func (a *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.S s.ObjectMeta.Labels = ss.Labels s.ObjectMeta.Annotations = ss.Annotations } - return createOrUpdate(ctx, a.Client, a.operatorNamespace, ss, updateSS) + return createOrUpdate(ctx, r.Client, r.operatorNamespace, ss, updateSS) } func appInfoForProxy(cfg *tailscaleSTSConfig) (string, error) { diff --git a/cmd/k8s-operator/svc-for-pg.go b/cmd/k8s-operator/svc-for-pg.go index 3e58db1b6cb0f..29d1a1ebd23c9 100644 --- a/cmd/k8s-operator/svc-for-pg.go +++ b/cmd/k8s-operator/svc-for-pg.go @@ -10,7 +10,6 @@ import ( "encoding/json" "errors" "fmt" - "net/http" "net/netip" "reflect" "slices" @@ -27,11 +26,12 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale/v2" - "tailscale.com/internal/client/tailscale" "tailscale.com/ipn" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/ingressservices" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" @@ -57,7 +57,7 @@ type HAServiceReconciler struct { isDefaultLoadBalancer bool recorder record.EventRecorder logger *zap.SugaredLogger - tsClient tsClient + clients ClientProvider tsNamespace string defaultTags []string operatorID string // stableID of the operator's Tailscale device @@ -121,7 +121,7 @@ func (r *HAServiceReconciler) Reconcile(ctx context.Context, req reconcile.Reque return res, nil } - tailscaleClient, err := clientFromProxyGroup(ctx, r.Client, pg, r.tsNamespace, r.tsClient) + tsClient, err := r.clients.For(pg.Spec.Tailnet) if err != nil { return res, fmt.Errorf("failed to get tailscale client: %w", err) } @@ -131,7 +131,7 @@ func (r *HAServiceReconciler) Reconcile(ctx context.Context, req reconcile.Reque if !svc.DeletionTimestamp.IsZero() || !r.isTailscaleService(svc) { logger.Debugf("Service is being deleted or is (no longer) referring to Tailscale ingress/egress, ensuring any created resources are cleaned up") - _, err = r.maybeCleanup(ctx, hostname, svc, logger, tailscaleClient) + _, err = r.maybeCleanup(ctx, hostname, svc, logger, tsClient) return res, err } @@ -139,7 +139,7 @@ func (r *HAServiceReconciler) Reconcile(ctx context.Context, req reconcile.Reque // is the case, we reconcile the Ingress one more time to ensure that concurrent updates to the Tailscale Service in a // multi-cluster Ingress setup have not resulted in another actor overwriting our Tailscale Service update. needsRequeue := false - needsRequeue, err = r.maybeProvision(ctx, hostname, svc, pg, logger, tailscaleClient) + needsRequeue, err = r.maybeProvision(ctx, hostname, svc, pg, logger, tsClient) if err != nil { if strings.Contains(err.Error(), optimisticLockErrorMsg) { logger.Infof("optimistic lock error, retrying: %s", err) @@ -162,7 +162,7 @@ func (r *HAServiceReconciler) Reconcile(ctx context.Context, req reconcile.Reque // If a Tailscale Service exists, but does not have an owner reference from any operator, we error // out assuming that this is an owner reference created by an unknown actor. // Returns true if the operation resulted in a Tailscale Service update. -func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname string, svc *corev1.Service, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsClient) (svcsChanged bool, err error) { +func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname string, svc *corev1.Service, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger, tsClient tsclient.Client) (svcsChanged bool, err error) { oldSvcStatus := svc.Status.DeepCopy() defer func() { if !apiequality.Semantic.DeepEqual(oldSvcStatus, &svc.Status) { @@ -209,8 +209,8 @@ func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname strin // 2. Ensure that there isn't a Tailscale Service with the same hostname // already created and not owned by this Service. serviceName := tailcfg.ServiceName("svc:" + hostname) - existingTSSvc, err := tsClient.GetVIPService(ctx, serviceName) - if err != nil && !isErrorTailscaleServiceNotFound(err) { + existingTSSvc, err := tsClient.VIPServices().Get(ctx, serviceName.String()) + if err != nil && !tailscale.IsNotFound(err) { return false, fmt.Errorf("error getting Tailscale Service %q: %w", hostname, err) } @@ -233,8 +233,8 @@ func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname strin tags = strings.Split(tstr, ",") } - tsSvc := &tailscale.VIPService{ - Name: serviceName, + tsSvc := tailscale.VIPService{ + Name: serviceName.String(), Tags: tags, Ports: []string{"do-not-validate"}, // we don't want to validate ports Comment: managedTSServiceComment, @@ -249,12 +249,13 @@ func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname strin // with the same generation number has been reconciled ~more than N times and stop attempting to apply updates. if existingTSSvc == nil || !reflect.DeepEqual(tsSvc.Tags, existingTSSvc.Tags) || - !ownersAreSetAndEqual(tsSvc, existingTSSvc) { + !ownersAreSetAndEqual(tsSvc, *existingTSSvc) { logger.Infof("Ensuring Tailscale Service exists and is up to date") - if err := tsClient.CreateOrUpdateVIPService(ctx, tsSvc); err != nil { + if err = tsClient.VIPServices().CreateOrUpdate(ctx, tsSvc); err != nil { return false, fmt.Errorf("error creating Tailscale Service: %w", err) } - existingTSSvc = tsSvc + + existingTSSvc = &tsSvc } cm, cfgs, err := ingressSvcsConfigs(ctx, r.Client, pg.Name, r.tsNamespace) @@ -266,12 +267,12 @@ func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname strin return false, nil } - if existingTSSvc.Addrs == nil { - existingTSSvc, err = tsClient.GetVIPService(ctx, tsSvc.Name) - if err != nil { + if len(existingTSSvc.Addrs) == 0 { + existingTSSvc, err = tsClient.VIPServices().Get(ctx, tsSvc.Name) + switch { + case err != nil: return false, fmt.Errorf("error getting Tailscale Service: %w", err) - } - if existingTSSvc.Addrs == nil { + case len(existingTSSvc.Addrs) == 0: // TODO(irbekrm): this should be a retry return false, fmt.Errorf("unexpected: Tailscale Service addresses not populated") } @@ -374,7 +375,7 @@ func (r *HAServiceReconciler) maybeProvision(ctx context.Context, hostname strin // Service is being deleted or is unexposed. The cleanup is safe for a multi-cluster setup- the Tailscale Service is only // deleted if it does not contain any other owner references. If it does the cleanup only removes the owner reference // corresponding to this Service. -func (r *HAServiceReconciler) maybeCleanup(ctx context.Context, hostname string, svc *corev1.Service, logger *zap.SugaredLogger, tsClient tsClient) (svcChanged bool, err error) { +func (r *HAServiceReconciler) maybeCleanup(ctx context.Context, hostname string, svc *corev1.Service, logger *zap.SugaredLogger, tsClient tsclient.Client) (svcChanged bool, err error) { logger.Debugf("Ensuring any resources for Service are cleaned up") ix := slices.Index(svc.Finalizers, svcPGFinalizerName) if ix < 0 { @@ -392,7 +393,7 @@ func (r *HAServiceReconciler) maybeCleanup(ctx context.Context, hostname string, serviceName := tailcfg.ServiceName("svc:" + hostname) // 1. Clean up the Tailscale Service. - svcChanged, err = cleanupTailscaleService(ctx, tsClient, serviceName, r.operatorID, logger) + svcChanged, err = cleanupTailscaleService(ctx, tsClient, serviceName.String(), r.operatorID, logger) if err != nil { return false, fmt.Errorf("error deleting Tailscale Service: %w", err) } @@ -425,7 +426,7 @@ func (r *HAServiceReconciler) maybeCleanup(ctx context.Context, hostname string, // Tailscale Services that are associated with the provided ProxyGroup and no longer managed this operator's instance are deleted, if not owned by other operator instances, else the owner reference is cleaned up. // Returns true if the operation resulted in existing Tailscale Service updates (owner reference removal). -func (r *HAServiceReconciler) maybeCleanupProxyGroup(ctx context.Context, proxyGroupName string, logger *zap.SugaredLogger, tsClient tsClient) (svcsChanged bool, err error) { +func (r *HAServiceReconciler) maybeCleanupProxyGroup(ctx context.Context, proxyGroupName string, logger *zap.SugaredLogger, tsClient tsclient.Client) (svcsChanged bool, err error) { cm, config, err := ingressSvcsConfigs(ctx, r.Client, proxyGroupName, r.tsNamespace) if err != nil { return false, fmt.Errorf("failed to get ingress service config: %s", err) @@ -453,7 +454,7 @@ func (r *HAServiceReconciler) maybeCleanupProxyGroup(ctx context.Context, proxyG return false, fmt.Errorf("failed to update tailscaled config services: %w", err) } - svcsChanged, err = cleanupTailscaleService(ctx, tsClient, tailcfg.ServiceName(tsSvcName), r.operatorID, logger) + svcsChanged, err = cleanupTailscaleService(ctx, tsClient, tsSvcName, r.operatorID, logger) if err != nil { return false, fmt.Errorf("deleting Tailscale Service %q: %w", tsSvcName, err) } @@ -517,30 +518,28 @@ func (r *HAServiceReconciler) shouldExposeClusterIP(svc *corev1.Service) bool { // If a Tailscale Service is found, but contains other owner references, only removes this operator's owner reference. // If a Tailscale Service by the given name is not found or does not contain this operator's owner reference, do nothing. // It returns true if an existing Tailscale Service was updated to remove owner reference, as well as any error that occurred. -func cleanupTailscaleService(ctx context.Context, tsClient tsClient, name tailcfg.ServiceName, operatorID string, logger *zap.SugaredLogger) (updated bool, err error) { - svc, err := tsClient.GetVIPService(ctx, name) - if err != nil { - errResp := &tailscale.ErrResponse{} - ok := errors.As(err, errResp) - if ok && errResp.Status == http.StatusNotFound { - return false, nil - } - if !ok { - return false, fmt.Errorf("unexpected error getting Tailscale Service %q: %w", name.String(), err) - } - - return false, fmt.Errorf("error getting Tailscale Service: %w", err) +func cleanupTailscaleService(ctx context.Context, tsClient tsclient.Client, name string, operatorID string, logger *zap.SugaredLogger) (updated bool, err error) { + svc, err := tsClient.VIPServices().Get(ctx, name) + switch { + case tailscale.IsNotFound(err): + return false, nil + case err != nil: + return false, fmt.Errorf("unexpected error getting Tailscale Service %q: %w", name, err) } + if svc == nil { return false, nil } + o, err := parseOwnerAnnotation(svc) if err != nil { return false, fmt.Errorf("error parsing Tailscale Service owner annotation: %w", err) } + if o == nil || len(o.OwnerRefs) == 0 { return false, nil } + // Comparing with the operatorID only means that we will not be able to // clean up Tailscale Services in cases where the operator was deleted from the // cluster before deleting the Ingress. Perhaps the comparison could be @@ -551,18 +550,22 @@ func cleanupTailscaleService(ctx context.Context, tsClient tsClient, name tailcf if ix == -1 { return false, nil } + if len(o.OwnerRefs) == 1 { logger.Infof("Deleting Tailscale Service %q", name) - return false, tsClient.DeleteVIPService(ctx, name) + return false, tsClient.VIPServices().Delete(ctx, name) } + o.OwnerRefs = slices.Delete(o.OwnerRefs, ix, ix+1) logger.Infof("Updating Tailscale Service %q", name) - json, err := json.Marshal(o) + + data, err := json.Marshal(o) if err != nil { return false, fmt.Errorf("error marshalling updated Tailscale Service owner reference: %w", err) } - svc.Annotations[ownerAnnotation] = string(json) - return true, tsClient.CreateOrUpdateVIPService(ctx, svc) + + svc.Annotations[ownerAnnotation] = string(data) + return true, tsClient.VIPServices().CreateOrUpdate(ctx, *svc) } func (r *HAServiceReconciler) backendRoutesSetup(ctx context.Context, serviceName, replicaName string, wantsCfg *ingressservices.Config, logger *zap.SugaredLogger) (bool, error) { diff --git a/cmd/k8s-operator/svc-for-pg_test.go b/cmd/k8s-operator/svc-for-pg_test.go index 07a2393115330..455d3363cb956 100644 --- a/cmd/k8s-operator/svc-for-pg_test.go +++ b/cmd/k8s-operator/svc-for-pg_test.go @@ -22,16 +22,15 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "tailscale.com/client/tailscale/v2" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/ingressservices" "tailscale.com/kube/kubetypes" "tailscale.com/tstest" - "tailscale.com/types/ptr" "tailscale.com/util/mak" - - "tailscale.com/tailcfg" ) func TestServicePGReconciler(t *testing.T) { @@ -103,11 +102,11 @@ func TestServicePGReconciler_UpdateHostname(t *testing.T) { verifyTailscaleService(t, ft, fmt.Sprintf("svc:%s", hostname), []string{"do-not-validate"}) verifyTailscaledConfig(t, fc, "test-pg", []string{fmt.Sprintf("svc:%s", hostname)}) - _, err := ft.GetVIPService(context.Background(), tailcfg.ServiceName(fmt.Sprintf("svc:default-%s", svc.Name))) + _, err := ft.VIPServices().Get(context.Background(), fmt.Sprintf("svc:default-%s", svc.Name)) if err == nil { t.Fatalf("svc:default-%s not cleaned up", svc.Name) } - if !isErrorTailscaleServiceNotFound(err) { + if !tailscale.IsNotFound(err) { t.Fatalf("unexpected error: %v", err) } } @@ -189,7 +188,9 @@ func setupServiceTest(t *testing.T) (*HAServiceReconciler, *corev1.Secret, clien t.Fatal(err) } - ft := &fakeTSClient{} + ft := &fakeTSClient{ + vipServices: make(map[string]tailscale.VIPService), + } zl, err := zap.NewDevelopment() if err != nil { t.Fatal(err) @@ -198,7 +199,7 @@ func setupServiceTest(t *testing.T) (*HAServiceReconciler, *corev1.Secret, clien cl := tstest.NewClock(tstest.ClockOpts{}) svcPGR := &HAServiceReconciler{ Client: fc, - tsClient: ft, + clients: tsclient.NewProvider(ft), clock: cl, defaultTags: []string{"tag:k8s"}, tsNamespace: "operator-ns", @@ -226,7 +227,7 @@ func TestValidateService(t *testing.T) { Spec: corev1.ServiceSpec{ ClusterIP: "1.2.3.4", Type: corev1.ServiceTypeLoadBalancer, - LoadBalancerClass: ptr.To("tailscale"), + LoadBalancerClass: new("tailscale"), }, } svc2 := &corev1.Service{ @@ -243,7 +244,7 @@ func TestValidateService(t *testing.T) { Spec: corev1.ServiceSpec{ ClusterIP: "1.2.3.5", Type: corev1.ServiceTypeLoadBalancer, - LoadBalancerClass: ptr.To("tailscale"), + LoadBalancerClass: new("tailscale"), }, } wantSvc := &corev1.Service{ @@ -276,22 +277,22 @@ func TestServicePGReconciler_MultiCluster(t *testing.T) { if i == 0 { ft = fti } else { - pgr.tsClient = ft + pgr.clients = tsclient.NewProvider(ft) } svc, _ := setupTestService(t, "test-multi-cluster", "", "4.3.2.1", fc, stateSecret) expectReconciled(t, pgr, "default", svc.Name) - tsSvcs, err := ft.ListVIPServices(context.Background()) + tsSvcs, err := ft.VIPServices().List(t.Context()) if err != nil { t.Fatalf("getting Tailscale Service: %v", err) } - if len(tsSvcs.VIPServices) != 1 { - t.Fatalf("unexpected number of Tailscale Services (%d)", len(tsSvcs.VIPServices)) + if len(tsSvcs) != 1 { + t.Fatalf("unexpected number of Tailscale Services (%d)", len(tsSvcs)) } - for _, svc := range tsSvcs.VIPServices { + for _, svc := range tsSvcs { t.Logf("found Tailscale Service with name %q", svc.Name) } } @@ -323,9 +324,9 @@ func TestIgnoreRegularService(t *testing.T) { verifyTailscaledConfig(t, fc, "test-pg", nil) - tsSvcs, err := ft.ListVIPServices(context.Background()) + tsSvcs, err := ft.VIPServices().List(t.Context()) if err == nil { - if len(tsSvcs.VIPServices) > 0 { + if len(tsSvcs) > 0 { t.Fatal("unexpected Tailscale Services found") } } @@ -380,7 +381,7 @@ func setupTestService(t *testing.T, svcName string, hostname string, clusterIP s }, Spec: corev1.ServiceSpec{ Type: corev1.ServiceTypeLoadBalancer, - LoadBalancerClass: ptr.To("tailscale"), + LoadBalancerClass: new("tailscale"), ClusterIP: clusterIP, ClusterIPs: []string{clusterIP}, }, @@ -400,7 +401,7 @@ func setupTestService(t *testing.T, svcName string, hostname string, clusterIP s { Addresses: []string{"4.3.2.1"}, Conditions: discoveryv1.EndpointConditions{ - Ready: ptr.To(true), + Ready: new(true), }, }, }, diff --git a/cmd/k8s-operator/tailnet.go b/cmd/k8s-operator/tailnet.go deleted file mode 100644 index 439489f750665..0000000000000 --- a/cmd/k8s-operator/tailnet.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) Tailscale Inc & contributors -// SPDX-License-Identifier: BSD-3-Clause - -//go:build !plan9 - -package main - -import ( - "context" - "fmt" - - "golang.org/x/oauth2" - "golang.org/x/oauth2/clientcredentials" - corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - - "tailscale.com/internal/client/tailscale" - "tailscale.com/ipn" - operatorutils "tailscale.com/k8s-operator" - tsapi "tailscale.com/k8s-operator/apis/v1alpha1" -) - -func clientForTailnet(ctx context.Context, cl client.Client, namespace, name string) (tsClient, string, error) { - var tn tsapi.Tailnet - if err := cl.Get(ctx, client.ObjectKey{Name: name}, &tn); err != nil { - return nil, "", fmt.Errorf("failed to get tailnet %q: %w", name, err) - } - - if !operatorutils.TailnetIsReady(&tn) { - return nil, "", fmt.Errorf("tailnet %q is not ready", name) - } - - var secret corev1.Secret - if err := cl.Get(ctx, client.ObjectKey{Name: tn.Spec.Credentials.SecretName, Namespace: namespace}, &secret); err != nil { - return nil, "", fmt.Errorf("failed to get Secret %q in namespace %q: %w", tn.Spec.Credentials.SecretName, namespace, err) - } - - baseURL := ipn.DefaultControlURL - if tn.Spec.LoginURL != "" { - baseURL = tn.Spec.LoginURL - } - - credentials := clientcredentials.Config{ - ClientID: string(secret.Data["client_id"]), - ClientSecret: string(secret.Data["client_secret"]), - TokenURL: baseURL + "/api/v2/oauth/token", - } - - source := credentials.TokenSource(ctx) - httpClient := oauth2.NewClient(ctx, source) - - ts := tailscale.NewClient(defaultTailnet, nil) - ts.UserAgent = "tailscale-k8s-operator" - ts.HTTPClient = httpClient - ts.BaseURL = baseURL - - return ts, baseURL, nil -} - -func clientFromProxyGroup(ctx context.Context, cl client.Client, pg *tsapi.ProxyGroup, namespace string, def tsClient) (tsClient, error) { - if pg.Spec.Tailnet == "" { - return def, nil - } - - tailscaleClient, _, err := clientForTailnet(ctx, cl, namespace, pg.Spec.Tailnet) - if err != nil { - return nil, err - } - - return tailscaleClient, nil -} diff --git a/cmd/k8s-operator/testutils_test.go b/cmd/k8s-operator/testutils_test.go index 8e055e0dd164e..074d920940cf4 100644 --- a/cmd/k8s-operator/testutils_test.go +++ b/cmd/k8s-operator/testutils_test.go @@ -9,10 +9,12 @@ import ( "context" "encoding/json" "fmt" + "maps" "net/http" "net/netip" "path" "reflect" + "slices" "strings" "sync" "testing" @@ -30,13 +32,12 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale/v2" - "tailscale.com/internal/client/tailscale" "tailscale.com/ipn" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/kubetypes" - "tailscale.com/tailcfg" - "tailscale.com/types/ptr" "tailscale.com/util/mak" ) @@ -96,7 +97,7 @@ func expectedSTS(t *testing.T, cl client.Client, opts configOpts) *appsv1.Statef {Name: "TS_DEBUG_ACME_FORCE_RENEWAL", Value: "true"}, }, SecurityContext: &corev1.SecurityContext{ - Privileged: ptr.To(true), + Privileged: new(true), }, Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ @@ -231,7 +232,7 @@ func expectedSTS(t *testing.T, cl client.Client, opts configOpts) *appsv1.Statef Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Annotations: annots, - DeletionGracePeriodSeconds: ptr.To[int64](10), + DeletionGracePeriodSeconds: new(int64(10)), Labels: map[string]string{ "tailscale.com/managed": "true", "tailscale.com/parent-resource": "test", @@ -250,7 +251,7 @@ func expectedSTS(t *testing.T, cl client.Client, opts configOpts) *appsv1.Statef Command: []string{"/bin/sh", "-c"}, Args: []string{"sysctl -w net.ipv4.ip_forward=1 && if sysctl net.ipv6.conf.all.forwarding; then sysctl -w net.ipv6.conf.all.forwarding=1; fi"}, SecurityContext: &corev1.SecurityContext{ - Privileged: ptr.To(true), + Privileged: new(true), }, }, }, @@ -364,14 +365,14 @@ func expectedSTSUserspace(t *testing.T, cl client.Client, opts configOpts) *apps }, }, Spec: appsv1.StatefulSetSpec{ - Replicas: ptr.To[int32](1), + Replicas: new(int32(1)), Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{"app": "1234-UID"}, }, ServiceName: opts.stsName, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - DeletionGracePeriodSeconds: ptr.To[int64](10), + DeletionGracePeriodSeconds: new(int64(10)), Labels: map[string]string{ "tailscale.com/managed": "true", "tailscale.com/parent-resource": "test", @@ -420,7 +421,7 @@ func expectedHeadlessService(name string, parentType string) *corev1.Service { "app": "1234-UID", }, ClusterIP: "None", - IPFamilyPolicy: ptr.To(corev1.IPFamilyPolicyPreferDualStack), + IPFamilyPolicy: new(corev1.IPFamilyPolicyPreferDualStack), }, } } @@ -480,7 +481,7 @@ func expectedServiceMonitor(t *testing.T, opts configOpts) *unstructured.Unstruc Namespace: opts.tailscaleNamespace, Labels: smLabels, ResourceVersion: opts.resourceVersion, - OwnerReferences: []metav1.OwnerReference{{APIVersion: "v1", Kind: "Service", Name: name, BlockOwnerDeletion: ptr.To(true), Controller: ptr.To(true)}}, + OwnerReferences: []metav1.OwnerReference{{APIVersion: "v1", Kind: "Service", Name: name, BlockOwnerDeletion: new(true), Controller: new(true)}}, }, TypeMeta: metav1.TypeMeta{ Kind: "ServiceMonitor", @@ -556,7 +557,7 @@ func expectedSecret(t *testing.T, cl client.Client, opts configOpts) *corev1.Sec if opts.isExitNode { r = "0.0.0.0/0,::/0," + r } - for _, rr := range strings.Split(r, ",") { + for rr := range strings.SplitSeq(r, ",") { prefix, err := netip.ParsePrefix(rr) if err != nil { t.Fatal(err) @@ -823,12 +824,9 @@ func expectEvents(t *testing.T, rec *record.FakeRecorder, wantsEvents []string) select { case gotEvent := <-rec.Events: found := false - for _, wantEvent := range wantsEvents { - if wantEvent == gotEvent { - found = true - seenEvents = append(seenEvents, gotEvent) - break - } + if slices.Contains(wantsEvents, gotEvent) { + found = true + seenEvents = append(seenEvents, gotEvent) } if !found { t.Errorf("got unexpected event %q, expected events: %+#v", gotEvent, wantsEvents) @@ -839,60 +837,137 @@ func expectEvents(t *testing.T, rec *record.FakeRecorder, wantsEvents []string) } } -type fakeTSClient struct { - sync.Mutex - keyRequests []tailscale.KeyCapabilities - deleted []string - vipServices map[tailcfg.ServiceName]*tailscale.VIPService -} -type fakeTSNetServer struct { - certDomains []string +type ( + fakeTSClient struct { + sync.Mutex + loginURL string + keyRequests []tailscale.KeyCapabilities + deleted []string + devices []tailscale.Device + vipServices map[string]tailscale.VIPService + } + + fakeVIPServices struct { + mu sync.RWMutex + vipServices map[string]tailscale.VIPService + } + + fakeKeys struct { + keyRequests *[]tailscale.KeyCapabilities + } + + fakeDevices struct { + deleted *[]string + devices *[]tailscale.Device + } +) + +func (c *fakeTSClient) VIPServices() tsclient.VIPServiceResource { + return &fakeVIPServices{ + vipServices: c.vipServices, + } } -func (f *fakeTSNetServer) CertDomains() []string { - return f.certDomains +func (m *fakeVIPServices) List(_ context.Context) ([]tailscale.VIPService, error) { + m.mu.RLock() + defer m.mu.RUnlock() + + if len(m.vipServices) == 0 { + return nil, tailscale.APIError{Status: http.StatusNotFound} + } + + return slices.Collect(maps.Values(m.vipServices)), nil } -func (c *fakeTSClient) CreateKey(ctx context.Context, caps tailscale.KeyCapabilities) (string, *tailscale.Key, error) { - c.Lock() - defer c.Unlock() - c.keyRequests = append(c.keyRequests, caps) - k := &tailscale.Key{ - ID: "key", - Created: time.Now(), - Capabilities: caps, +func (m *fakeVIPServices) Delete(_ context.Context, name string) error { + m.mu.Lock() + defer m.mu.Unlock() + + if _, ok := m.vipServices[name]; !ok { + return tailscale.APIError{Status: http.StatusNotFound} } - return "new-authkey", k, nil + + delete(m.vipServices, name) + return nil } -func (c *fakeTSClient) Device(ctx context.Context, deviceID string, fields *tailscale.DeviceFieldsOpts) (*tailscale.Device, error) { - return &tailscale.Device{ - DeviceID: deviceID, - Hostname: "hostname-" + deviceID, - Addresses: []string{ - "1.2.3.4", - "::1", - }, - }, nil +func (m *fakeVIPServices) Get(_ context.Context, name string) (*tailscale.VIPService, error) { + if svc, ok := m.vipServices[name]; ok { + return &svc, nil + } + + return nil, tailscale.APIError{Status: http.StatusNotFound} } -func (c *fakeTSClient) DeleteDevice(ctx context.Context, deviceID string) error { - c.Lock() - defer c.Unlock() - c.deleted = append(c.deleted, deviceID) +func (m *fakeVIPServices) CreateOrUpdate(_ context.Context, svc tailscale.VIPService) error { + m.mu.Lock() + defer m.mu.Unlock() + + if svc.Addrs == nil { + svc.Addrs = []string{vipTestIP} + } + + m.vipServices[svc.Name] = svc return nil } -func (c *fakeTSClient) KeyRequests() []tailscale.KeyCapabilities { - c.Lock() - defer c.Unlock() - return c.keyRequests +func (c *fakeTSClient) Devices() tsclient.DeviceResource { + return &fakeDevices{ + deleted: &c.deleted, + devices: &c.devices, + } +} + +func (m *fakeDevices) Delete(_ context.Context, id string) error { + *m.deleted = append(*m.deleted, id) + + return tailscale.APIError{Status: http.StatusNotFound} +} + +func (m *fakeDevices) List(_ context.Context, _ ...tailscale.ListDevicesOptions) ([]tailscale.Device, error) { + return *m.devices, nil } -func (c *fakeTSClient) Deleted() []string { - c.Lock() - defer c.Unlock() - return c.deleted +func (m *fakeDevices) Get(_ context.Context, id string) (*tailscale.Device, error) { + if m.devices == nil { + return nil, tailscale.APIError{Status: http.StatusNotFound} + } + + for _, dev := range *m.devices { + if dev.ID == id { + return &dev, nil + } + } + + return nil, tailscale.APIError{Status: http.StatusNotFound} +} + +func (c *fakeTSClient) Keys() tsclient.KeyResource { + return &fakeKeys{ + keyRequests: &c.keyRequests, + } +} + +func (m *fakeKeys) CreateAuthKey(_ context.Context, ckr tailscale.CreateKeyRequest) (*tailscale.Key, error) { + *m.keyRequests = append(*m.keyRequests, ckr.Capabilities) + + return &tailscale.Key{Key: "new-authkey"}, nil +} + +func (m *fakeKeys) List(_ context.Context, _ bool) ([]tailscale.Key, error) { + return nil, nil +} + +func (c *fakeTSClient) LoginURL() string { + return c.loginURL +} + +type fakeTSNetServer struct { + certDomains []string +} + +func (f *fakeTSNetServer) CertDomains() []string { + return f.certDomains } func removeResourceReqs(sts *appsv1.StatefulSet) { @@ -938,53 +1013,3 @@ func removeAuthKeyIfExistsModifier(t *testing.T) func(s *corev1.Secret) { } } } - -func (c *fakeTSClient) GetVIPService(ctx context.Context, name tailcfg.ServiceName) (*tailscale.VIPService, error) { - c.Lock() - defer c.Unlock() - if c.vipServices == nil { - return nil, tailscale.ErrResponse{Status: http.StatusNotFound} - } - svc, ok := c.vipServices[name] - if !ok { - return nil, tailscale.ErrResponse{Status: http.StatusNotFound} - } - return svc, nil -} - -func (c *fakeTSClient) ListVIPServices(ctx context.Context) (*tailscale.VIPServiceList, error) { - c.Lock() - defer c.Unlock() - if c.vipServices == nil { - return nil, &tailscale.ErrResponse{Status: http.StatusNotFound} - } - result := &tailscale.VIPServiceList{} - for _, svc := range c.vipServices { - result.VIPServices = append(result.VIPServices, *svc) - } - return result, nil -} - -func (c *fakeTSClient) CreateOrUpdateVIPService(ctx context.Context, svc *tailscale.VIPService) error { - c.Lock() - defer c.Unlock() - if c.vipServices == nil { - c.vipServices = make(map[tailcfg.ServiceName]*tailscale.VIPService) - } - - if svc.Addrs == nil { - svc.Addrs = []string{vipTestIP} - } - - c.vipServices[svc.Name] = svc - return nil -} - -func (c *fakeTSClient) DeleteVIPService(ctx context.Context, name tailcfg.ServiceName) error { - c.Lock() - defer c.Unlock() - if c.vipServices != nil { - delete(c.vipServices, name) - } - return nil -} diff --git a/cmd/k8s-operator/tsrecorder.go b/cmd/k8s-operator/tsrecorder.go index 3857908f2bc1c..86669d212e738 100644 --- a/cmd/k8s-operator/tsrecorder.go +++ b/cmd/k8s-operator/tsrecorder.go @@ -10,14 +10,15 @@ import ( "encoding/json" "errors" "fmt" - "net/http" "slices" "strconv" "strings" "sync" + "time" "go.uber.org/zap" xslices "golang.org/x/exp/slices" + "golang.org/x/time/rate" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -30,10 +31,11 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale/v2" - "tailscale.com/client/tailscale" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" + "tailscale.com/k8s-operator/tsclient" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" "tailscale.com/tstime" @@ -57,15 +59,15 @@ var gaugeRecorderResources = clientmetric.NewGauge(kubetypes.MetricRecorderCount // Recorder CRs. type RecorderReconciler struct { client.Client - log *zap.SugaredLogger - recorder record.EventRecorder - clock tstime.Clock - tsNamespace string - tsClient tsClient - loginServer string - - mu sync.Mutex // protects following - recorders set.Slice[types.UID] // for recorders gauge + log *zap.SugaredLogger + recorder record.EventRecorder + clock tstime.Clock + clients ClientProvider + tsNamespace string + authKeyRateLimits map[string]*rate.Limiter // per-Recorder rate limiters for auth key re-issuance. + authKeyReissuing map[string]bool + mu sync.Mutex // protects following + recorders set.Slice[types.UID] // for recorders gauge } func (r *RecorderReconciler) logger(name string) *zap.SugaredLogger { @@ -99,7 +101,7 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques return reconcile.Result{}, nil } - tailscaleClient, loginUrl, err := r.getClientAndLoginURL(ctx, tsr.Spec.Tailnet) + tsClient, err := r.clients.For(tsr.Spec.Tailnet) if err != nil { return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderTailnetUnavailable, err.Error()) } @@ -112,7 +114,7 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques return reconcile.Result{}, nil } - if done, err := r.maybeCleanup(ctx, tsr, tailscaleClient); err != nil { + if done, err := r.maybeCleanup(ctx, tsr, tsClient); err != nil { return reconcile.Result{}, err } else if !done { logger.Debugf("Recorder resource cleanup not yet finished, will retry...") @@ -144,7 +146,7 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderInvalid, message) } - if err = r.maybeProvision(ctx, tailscaleClient, loginUrl, tsr); err != nil { + if err = r.maybeProvision(ctx, tsClient, tsr); err != nil { reason := reasonRecorderCreationFailed message := fmt.Sprintf("failed creating Recorder: %s", err) if strings.Contains(err.Error(), optimisticLockErrorMsg) { @@ -162,47 +164,33 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques return setStatusReady(tsr, metav1.ConditionTrue, reasonRecorderCreated, reasonRecorderCreated) } -// getClientAndLoginURL returns the appropriate Tailscale client and resolved login URL -// for the given tailnet name. If no tailnet is specified, returns the default client -// and login server. Applies fallback to the operator's login server if the tailnet -// doesn't specify a custom login URL. -func (r *RecorderReconciler) getClientAndLoginURL(ctx context.Context, tailnetName string) (tsClient, - string, error) { - if tailnetName == "" { - return r.tsClient, r.loginServer, nil - } - - tc, loginUrl, err := clientForTailnet(ctx, r.Client, r.tsNamespace, tailnetName) - if err != nil { - return nil, "", err - } +func (r *RecorderReconciler) maybeProvision(ctx context.Context, tsClient tsclient.Client, tsr *tsapi.Recorder) error { + logger := r.logger(tsr.Name) - // Apply fallback if tailnet doesn't specify custom login URL - if loginUrl == "" { - loginUrl = r.loginServer + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas } - return tc, loginUrl, nil -} - -func (r *RecorderReconciler) maybeProvision(ctx context.Context, tailscaleClient tsClient, loginUrl string, tsr *tsapi.Recorder) error { - logger := r.logger(tsr.Name) - r.mu.Lock() r.recorders.Add(tsr.UID) gaugeRecorderResources.Set(int64(r.recorders.Len())) + if _, ok := r.authKeyRateLimits[tsr.Name]; !ok { + r.authKeyRateLimits[tsr.Name] = rate.NewLimiter(rate.Every(30*time.Second), int(replicas)) + } + for replica := range replicas { + name := fmt.Sprintf("%s-%d", tsr.Name, replica) + if _, ok := r.authKeyReissuing[name]; !ok { + r.authKeyReissuing[name] = false + } + } r.mu.Unlock() - if err := r.ensureAuthSecretsCreated(ctx, tailscaleClient, tsr); err != nil { + if err := r.ensureAuthSecretsCreated(ctx, tsClient, tsr); err != nil { return fmt.Errorf("error creating secrets: %w", err) } // State Secrets are pre-created so we can use the Recorder CR as its owner ref. - var replicas int32 = 1 - if tsr.Spec.Replicas != nil { - replicas = *tsr.Spec.Replicas - } - for replica := range replicas { sec := tsrStateSecret(tsr, r.tsNamespace, replica) _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sec, func(s *corev1.Secret) { @@ -252,7 +240,7 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tailscaleClient return fmt.Errorf("error creating RoleBinding: %w", err) } - ss := tsrStatefulSet(tsr, r.tsNamespace, loginUrl) + ss := tsrStatefulSet(tsr, r.tsNamespace, tsClient.LoginURL()) _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, ss, func(s *appsv1.StatefulSet) { s.ObjectMeta.Labels = ss.ObjectMeta.Labels s.ObjectMeta.Annotations = ss.ObjectMeta.Annotations @@ -271,13 +259,13 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tailscaleClient // If we have scaled the recorder down, we will have dangling state secrets // that we need to clean up. - if err = r.maybeCleanupSecrets(ctx, tailscaleClient, tsr); err != nil { + if err = r.maybeCleanupSecrets(ctx, tsClient, tsr); err != nil { return fmt.Errorf("error cleaning up Secrets: %w", err) } var devices []tsapi.RecorderTailnetDevice for replica := range replicas { - dev, ok, err := r.getDeviceInfo(ctx, tailscaleClient, tsr.Name, replica) + dev, ok, err := r.getDeviceInfo(ctx, tsClient, tsr.Name, replica) switch { case err != nil: return fmt.Errorf("failed to get device info: %w", err) @@ -342,7 +330,7 @@ func (r *RecorderReconciler) maybeCleanupServiceAccounts(ctx context.Context, ts return nil } -func (r *RecorderReconciler) maybeCleanupSecrets(ctx context.Context, tailscaleClient tsClient, tsr *tsapi.Recorder) error { +func (r *RecorderReconciler) maybeCleanupSecrets(ctx context.Context, tsClient tsclient.Client, tsr *tsapi.Recorder) error { options := []client.ListOption{ client.InNamespace(r.tsNamespace), client.MatchingLabels(tsrLabels("recorder", tsr.Name, nil)), @@ -381,14 +369,12 @@ func (r *RecorderReconciler) maybeCleanupSecrets(ctx context.Context, tailscaleC } if ok { - var errResp *tailscale.ErrResponse - r.log.Debugf("deleting device %s", devicePrefs.Config.NodeID) - err = tailscaleClient.DeleteDevice(ctx, string(devicePrefs.Config.NodeID)) + err = tsClient.Devices().Delete(ctx, string(devicePrefs.Config.NodeID)) switch { - case errors.As(err, &errResp) && errResp.Status == http.StatusNotFound: - // This device has possibly already been deleted in the admin console. So we can ignore this - // and move on to removing the secret. + case tailscale.IsNotFound(err): + // This device has possibly already been deleted in the admin console. So we can ignore this + // and move on to removing the secret. case err != nil: return err } @@ -405,7 +391,7 @@ func (r *RecorderReconciler) maybeCleanupSecrets(ctx context.Context, tailscaleC // maybeCleanup just deletes the device from the tailnet. All the kubernetes // resources linked to a Recorder will get cleaned up via owner references // (which we can use because they are all in the same namespace). -func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Recorder, tailscaleClient tsClient) (bool, error) { +func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Recorder, tsClient tsclient.Client) (bool, error) { logger := r.logger(tsr.Name) var replicas int32 = 1 @@ -429,13 +415,12 @@ func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Record nodeID := string(devicePrefs.Config.NodeID) logger.Debugf("deleting device %s from control", nodeID) - if err = tailscaleClient.DeleteDevice(ctx, nodeID); err != nil { - errResp := &tailscale.ErrResponse{} - if errors.As(err, errResp) && errResp.Status == http.StatusNotFound { - logger.Debugf("device %s not found, likely because it has already been deleted from control", nodeID) - continue - } - + err = tsClient.Devices().Delete(ctx, nodeID) + switch { + case tailscale.IsNotFound(err): + logger.Debugf("device %s not found, likely because it has already been deleted from control", nodeID) + continue + case err != nil: return false, fmt.Errorf("error deleting device: %w", err) } @@ -450,12 +435,16 @@ func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Record r.mu.Lock() r.recorders.Remove(tsr.UID) gaugeRecorderResources.Set(int64(r.recorders.Len())) + delete(r.authKeyRateLimits, tsr.Name) + for replica := range replicas { + delete(r.authKeyReissuing, fmt.Sprintf("%s-%d", tsr.Name, replica)) + } r.mu.Unlock() return true, nil } -func (r *RecorderReconciler) ensureAuthSecretsCreated(ctx context.Context, tailscaleClient tsClient, tsr *tsapi.Recorder) error { +func (r *RecorderReconciler) ensureAuthSecretsCreated(ctx context.Context, tsClient tsclient.Client, tsr *tsapi.Recorder) error { var replicas int32 = 1 if tsr.Spec.Replicas != nil { replicas = *tsr.Spec.Replicas @@ -474,25 +463,119 @@ func (r *RecorderReconciler) ensureAuthSecretsCreated(ctx context.Context, tails Name: fmt.Sprintf("%s-auth-%d", tsr.Name, replica), } - err := r.Get(ctx, key, &corev1.Secret{}) + existingSecret := &corev1.Secret{} + err := r.Get(ctx, key, existingSecret) switch { case err == nil: - logger.Debugf("auth Secret %q already exists", key.Name) + reissue, err := r.shouldReissueAuthKey(ctx, tsClient, tsr, replica, existingSecret) + if err != nil { + return fmt.Errorf("error checking auth key reissue for replica %d: %w", replica, err) + } + if !reissue { + logger.Debugf("auth Secret %q already exists, no reissue needed", key.Name) + continue + } + authKey, err := newAuthKey(ctx, tsClient, tags.Stringify()) + if err != nil { + return err + } + existingSecret.Data["authkey"] = []byte(authKey) + if err = r.Update(ctx, existingSecret); err != nil { + return err + } continue - case !apierrors.IsNotFound(err): + case apierrors.IsNotFound(err): + authKey, err := newAuthKey(ctx, tsClient, tags.Stringify()) + if err != nil { + return err + } + if err := r.Create(ctx, tsrAuthSecret(tsr, r.tsNamespace, authKey, replica)); err != nil { + return err + } + default: return fmt.Errorf("failed to get Secret %q: %w", key.Name, err) } + } - authKey, err := newAuthKey(ctx, tailscaleClient, tags.Stringify()) - if err != nil { - return err + return nil +} + +// shouldReissueAuthKey returns true if the proxy needs a new auth key. It +// tracks in-flight reissues via authKeyReissuing to avoid duplicate API calls +// across reconciles. +func (r *RecorderReconciler) shouldReissueAuthKey(ctx context.Context, tsClient tsclient.Client, tsr *tsapi.Recorder, replica int32, authSecret *corev1.Secret) (shouldReissue bool, err error) { + stateSecret, err := r.getStateSecret(ctx, tsr.Name, replica) + if err != nil || stateSecret == nil { + return false, err + } + + stateSecretName := fmt.Sprintf("%s-%d", tsr.Name, replica) + + r.mu.Lock() + reissuing := r.authKeyReissuing[stateSecretName] + r.mu.Unlock() + + if reissuing { + _, requestStillPresent := stateSecret.Data[kubetypes.KeyReissueAuthkey] + if !requestStillPresent { + r.mu.Lock() + r.authKeyReissuing[stateSecretName] = false + r.mu.Unlock() + r.log.Debugf("auth key reissue completed for %q", stateSecretName) + return false, nil } + r.log.Debugf("auth key already in process of re-issuance for %q, waiting", stateSecretName) + return false, nil + } - if err = r.Create(ctx, tsrAuthSecret(tsr, r.tsNamespace, authKey, replica)); err != nil { - return err + defer func() { + r.mu.Lock() + r.authKeyReissuing[stateSecretName] = shouldReissue + r.mu.Unlock() + }() + + brokenAuthkey, ok := stateSecret.Data[kubetypes.KeyReissueAuthkey] + if !ok { + return false, nil + } + + cfgAuthKey := string(authSecret.Data["authkey"]) + empty := cfgAuthKey == "" + broken := cfgAuthKey == string(brokenAuthkey) + + if !empty && !broken { + return false, nil + } + + lim := r.authKeyRateLimits[tsr.Name] + if !lim.Allow() { + r.log.Debugf("auth key re-issuance rate limit exceeded, limit: %.2f, burst: %d, tokens: %.2f", + lim.Limit(), lim.Burst(), lim.Tokens()) + return false, fmt.Errorf("auth key re-issuance rate limit exceeded for Recorder %q, will retry with backoff", tsr.Name) + } + + r.log.Infof("Recorder replica %s failing to auth; attempting cleanup and new key", stateSecretName) + if tsID := stateSecret.Data[kubetypes.KeyDeviceID]; len(tsID) > 0 { + id := tailcfg.StableNodeID(tsID) + if err := r.ensureDeviceDeleted(ctx, tsClient, id, r.log); err != nil { + return false, err } } + return true, nil +} + +func (r *RecorderReconciler) ensureDeviceDeleted(ctx context.Context, tsClient tsclient.Client, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error { + logger.Debugf("deleting device %s from control", string(id)) + err := tsClient.Devices().Delete(ctx, string(id)) + switch { + case tailscale.IsNotFound(err): + logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id)) + case err != nil: + return fmt.Errorf("error deleting device: %w", err) + default: + logger.Debugf("device %s deleted from control", string(id)) + } return nil } @@ -585,7 +668,7 @@ func getDevicePrefs(secret *corev1.Secret) (prefs prefs, ok bool, err error) { return prefs, ok, nil } -func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tailscaleClient tsClient, tsrName string, replica int32) (d tsapi.RecorderTailnetDevice, ok bool, err error) { +func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsClient tsclient.Client, tsrName string, replica int32) (d tsapi.RecorderTailnetDevice, ok bool, err error) { secret, err := r.getStateSecret(ctx, tsrName, replica) if err != nil || secret == nil { return tsapi.RecorderTailnetDevice{}, false, err @@ -599,7 +682,7 @@ func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tailscaleClient // TODO(tomhjp): The profile info doesn't include addresses, which is why we // need the API. Should maybe update tsrecorder to write IPs to the state // Secret like containerboot does. - device, err := tailscaleClient.Device(ctx, string(prefs.Config.NodeID), nil) + device, err := tsClient.Devices().Get(ctx, string(prefs.Config.NodeID)) if err != nil { return tsapi.RecorderTailnetDevice{}, false, fmt.Errorf("failed to get device info from API: %w", err) } diff --git a/go.mod b/go.mod index 352023fefe006..11b6605bfcc5d 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module tailscale.com -go 1.26.1 +go 1.26.3 require ( filippo.io/mkcert v1.4.4 @@ -9,7 +9,6 @@ require ( github.com/akutz/memconn v0.1.0 github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa github.com/andybalholm/brotli v1.1.0 - github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be github.com/atotto/clipboard v0.1.4 github.com/aws/aws-sdk-go-v2 v1.41.0 github.com/aws/aws-sdk-go-v2/config v1.29.5 @@ -18,7 +17,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/ssm v1.44.7 github.com/axiomhq/hyperloglog v0.0.0-20240319100328-84253e514e02 github.com/bradfitz/go-tool-cache v0.0.0-20260216153636-9e5201344fe5 - github.com/bradfitz/monogok v0.0.0-20260208031948-2219c393d032 + github.com/bradfitz/monogok v0.0.0-20260429173803-229ef7981a6b github.com/bramvdbogaerde/go-scp v1.4.0 github.com/cilium/ebpf v0.16.0 github.com/coder/websocket v1.8.12 @@ -45,13 +44,14 @@ require ( github.com/go4org/plan9netshell v0.0.0-20250324183649-788daa080737 github.com/godbus/dbus/v5 v5.1.1-0.20230522191255-76236955d466 github.com/gokrazy/breakglass v0.0.0-20251229072214-9dbc0478d486 - github.com/gokrazy/gokrazy v0.0.0-20260123094004-294c93fa173c + github.com/gokrazy/gokrazy v0.0.0-20260418085648-c38c3134b8a7 + github.com/gokrazy/kernel.arm64 v0.0.0-20260403054012-807489e0272a github.com/gokrazy/serial-busybox v0.0.0-20250119153030-ac58ba7574e7 github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 github.com/golang/snappy v0.0.4 github.com/golangci/golangci-lint v1.57.1 github.com/google/go-cmp v0.7.0 - github.com/google/go-containerregistry v0.20.7 + github.com/google/go-containerregistry v0.21.5 github.com/google/go-tpm v0.9.4 github.com/google/gopacket v1.1.19 github.com/google/nftables v0.2.1-0.20240414091927-5e242ec57806 @@ -68,7 +68,8 @@ require ( github.com/jellydator/ttlcache/v3 v3.1.0 github.com/jsimonetti/rtnetlink v1.4.0 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 - github.com/klauspost/compress v1.18.2 + github.com/kdomanski/iso9660 v0.4.0 + github.com/klauspost/compress v1.18.5 github.com/kortschak/wol v0.0.0-20200729010619-da482cc4850a github.com/mattn/go-colorable v0.1.13 github.com/mattn/go-isatty v0.0.20 @@ -81,26 +82,28 @@ require ( github.com/pires/go-proxyproto v0.8.1 github.com/pkg/errors v0.9.1 github.com/pkg/sftp v1.13.6 - github.com/prometheus-community/pro-bing v0.4.0 github.com/prometheus/client_golang v1.23.0 github.com/prometheus/common v0.65.0 github.com/prometheus/prometheus v0.49.2-0.20240125131847-c3b8ef1694ff + github.com/robert-nix/ansihtml v1.0.1 github.com/safchain/ethtool v0.3.0 github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e github.com/studio-b12/gowebdav v0.9.0 - github.com/tailscale/certstore v0.1.1-0.20231202035212-d3fa0460f47e + github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f + github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89 github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41 github.com/tailscale/gokrazy-kernel v0.0.0-20240728225134-3d23beabda2e github.com/tailscale/golang-x-crypto v0.0.0-20250404221719-a5573b049869 - github.com/tailscale/hujson v0.0.0-20221223112325-20486734a56a + github.com/tailscale/hujson v0.0.0-20260302212456-ecc657c15afd github.com/tailscale/mkctr v0.0.0-20260107121656-ea857e3e500b github.com/tailscale/netlink v1.1.1-0.20240822203006-4d49adab4de7 github.com/tailscale/peercred v0.0.0-20250107143737-35a0c7bd7edc github.com/tailscale/setec v0.0.0-20251203133219-2ab774e4129a + github.com/tailscale/ts-gokrazy v0.0.0-20260429180033-fe741c6deb44 github.com/tailscale/web-client-prebuilt v0.0.0-20250124233751-d4cd19a26976 github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6 - github.com/tailscale/wireguard-go v0.0.0-20250716170648-1d0488a3d7da + github.com/tailscale/wireguard-go v0.0.0-20260427181203-e3ac4a0afb4e github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e github.com/tc-hib/winres v0.2.1 github.com/tcnksm/go-httpstat v0.2.0 @@ -110,22 +113,22 @@ require ( go.uber.org/zap v1.27.0 go4.org/mem v0.0.0-20240501181205-ae6ca9944745 go4.org/netipx v0.0.0-20231129151722-fdeea329fbba - golang.org/x/crypto v0.46.0 + golang.org/x/crypto v0.50.0 golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b - golang.org/x/mod v0.30.0 - golang.org/x/net v0.48.0 - golang.org/x/oauth2 v0.33.0 - golang.org/x/sync v0.19.0 - golang.org/x/sys v0.40.0 - golang.org/x/term v0.38.0 + golang.org/x/mod v0.35.0 + golang.org/x/net v0.53.0 + golang.org/x/oauth2 v0.36.0 + golang.org/x/sync v0.20.0 + golang.org/x/sys v0.43.0 + golang.org/x/term v0.42.0 golang.org/x/time v0.12.0 - golang.org/x/tools v0.39.0 + golang.org/x/tools v0.44.0 golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 golang.zx2c4.com/wireguard/windows v0.5.3 gopkg.in/square/go-jose.v2 v2.6.0 gvisor.dev/gvisor v0.0.0-20260224225140-573d5e7127a8 helm.sh/helm/v3 v3.19.0 - honnef.co/go/tools v0.7.0-0.dev.0.20251022135355-8273271481d0 + honnef.co/go/tools v0.7.0 k8s.io/api v0.34.0 k8s.io/apimachinery v0.34.0 k8s.io/apiserver v0.34.0 @@ -135,6 +138,7 @@ require ( sigs.k8s.io/kind v0.30.0 sigs.k8s.io/yaml v1.6.0 software.sslmate.com/src/go-pkcs12 v0.4.0 + tailscale.com/client/tailscale/v2 v2.9.0 ) require ( @@ -151,6 +155,7 @@ require ( github.com/OpenPeeDeeP/depguard/v2 v2.2.0 // indirect github.com/alecthomas/go-check-sumtype v0.1.4 // indirect github.com/alexkohler/nakedret/v2 v2.0.4 // indirect + github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be // indirect github.com/armon/go-metrics v0.4.1 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beevik/ntp v0.3.0 // indirect @@ -171,7 +176,7 @@ require ( github.com/cyphar/filepath-securejoin v0.6.1 // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc // indirect - github.com/docker/go-connections v0.5.0 // indirect + github.com/docker/go-connections v0.6.0 // indirect github.com/docker/go-events v0.0.0-20250808211157-605354379745 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/evanphx/json-patch v5.9.11+incompatible // indirect @@ -191,7 +196,7 @@ require ( github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-github/v66 v66.0.0 // indirect github.com/google/go-querystring v1.1.0 // indirect - github.com/google/renameio/v2 v2.0.0 // indirect + github.com/google/renameio/v2 v2.0.2 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect @@ -220,6 +225,8 @@ require ( github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/moby/buildkit v0.20.2 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/moby/api v1.54.1 // indirect + github.com/moby/moby/client v0.4.0 // indirect github.com/moby/spdystream v0.5.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect @@ -234,7 +241,7 @@ require ( github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 // indirect github.com/stacklok/frizbee v0.1.7 // indirect - github.com/vishvananda/netlink v1.3.1-0.20240922070040-084abd93d350 // indirect + github.com/vishvananda/netlink v1.3.1 // indirect github.com/xen0n/gosmopolitan v1.2.2 // indirect github.com/xlab/treeprint v1.2.0 // indirect github.com/ykadowak/zerologlint v0.1.5 // indirect @@ -250,7 +257,7 @@ require ( go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto/x509roots/fallback v0.0.0-20260113154411-7d0074ccc6f1 // indirect - golang.org/x/telemetry v0.0.0-20251111182119-bc8e575c7b54 // indirect + golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa // indirect golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated // indirect golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251213004720-97cd9d5aeac2 // indirect @@ -314,28 +321,26 @@ require ( github.com/charithe/durationcheck v0.0.10 // indirect github.com/chavacava/garif v0.1.0 // indirect github.com/cloudflare/circl v1.6.3 // indirect - github.com/containerd/stargz-snapshotter/estargz v0.18.1 // indirect + github.com/containerd/stargz-snapshotter/estargz v0.18.2 // indirect github.com/curioswitch/go-reassign v0.2.0 // indirect github.com/daixiang0/gci v0.12.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/denis-tingaikin/go-header v0.5.0 // indirect - github.com/docker/cli v29.0.3+incompatible // indirect - github.com/docker/distribution v2.8.3+incompatible // indirect - github.com/docker/docker v28.5.2+incompatible // indirect + github.com/docker/cli v29.4.0+incompatible // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/ettle/strcase v0.2.0 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect - github.com/fatih/color v1.18.0 // indirect + github.com/fatih/color v1.18.0 github.com/fatih/structtag v1.2.0 // indirect github.com/firefart/nonamedreturns v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 github.com/fzipp/gocyclo v0.6.0 // indirect github.com/go-critic/go-critic v0.11.2 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect - github.com/go-git/go-billy/v5 v5.6.2 // indirect - github.com/go-git/go-git/v5 v5.16.5 // indirect + github.com/go-git/go-billy/v5 v5.8.0 // indirect + github.com/go-git/go-git/v5 v5.17.1 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.4 // indirect @@ -442,17 +447,17 @@ require ( github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/shazow/go-diff v0.0.0-20160112020656-b6b7b6733b8c // indirect github.com/shopspring/decimal v1.4.0 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect github.com/sivchari/containedctx v1.0.3 // indirect github.com/sivchari/tenv v1.7.1 // indirect github.com/skeema/knownhosts v1.3.1 // indirect github.com/sonatard/noctx v0.0.2 // indirect - github.com/sourcegraph/go-diff v0.7.0 // indirect + github.com/sourcegraph/go-diff v0.7.0 github.com/spf13/afero v1.11.0 // indirect github.com/spf13/cast v1.7.0 // indirect github.com/spf13/cobra v1.10.2 // indirect github.com/spf13/jwalterweatherman v1.1.0 // indirect - github.com/spf13/pflag v1.0.9 // indirect + github.com/spf13/pflag v1.0.10 // indirect github.com/spf13/viper v1.16.0 // indirect github.com/ssgreg/nlreturn/v2 v2.2.1 // indirect github.com/stbenjam/no-sprintf-host-port v0.1.1 // indirect @@ -468,7 +473,7 @@ require ( github.com/tomarrell/wrapcheck/v2 v2.8.3 // indirect github.com/tommy-muehle/go-mnd/v2 v2.5.1 // indirect github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701 // indirect - github.com/ulikunitz/xz v0.5.15 // indirect + github.com/ulikunitz/xz v0.5.15 github.com/ultraware/funlen v0.1.0 // indirect github.com/ultraware/whitespace v0.1.0 // indirect github.com/uudashr/gocognit v1.1.2 // indirect @@ -482,7 +487,7 @@ require ( go.uber.org/multierr v1.11.0 // indirect golang.org/x/exp/typeparams v0.0.0-20240314144324-c7f7c6466f7f // indirect golang.org/x/image v0.27.0 // indirect - golang.org/x/text v0.32.0 // indirect + golang.org/x/text v0.36.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.toolchain.next.rev b/go.toolchain.next.rev index 205355c4745f6..fd5a216978c8b 100644 --- a/go.toolchain.next.rev +++ b/go.toolchain.next.rev @@ -1 +1 @@ -f4de14a515221e27c0d79446b423849a6546e3a6 +e877d973840c91ec9d4bc1921b0845789de359ae diff --git a/go.toolchain.rev b/go.toolchain.rev index 205355c4745f6..fd5a216978c8b 100644 --- a/go.toolchain.rev +++ b/go.toolchain.rev @@ -1 +1 @@ -f4de14a515221e27c0d79446b423849a6546e3a6 +e877d973840c91ec9d4bc1921b0845789de359ae diff --git a/go.toolchain.rev.sri b/go.toolchain.rev.sri deleted file mode 100644 index 86b2083ff8624..0000000000000 --- a/go.toolchain.rev.sri +++ /dev/null @@ -1 +0,0 @@ -sha256-qmX68/Ml/jvf+sD9qykdx9QhSbkYaF8xJMFtd3iLHI8= diff --git a/go.toolchain.version b/go.toolchain.version index dd43a143f0217..f8f7381409642 100644 --- a/go.toolchain.version +++ b/go.toolchain.version @@ -1 +1 @@ -1.26.1 +1.26.3 diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 242b31b4bdbf3..b5a0a353cd785 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -2960,12 +2960,7 @@ func (b *LocalBackend) updateFilterLocked(prefs ipn.PrefsView) { // Log traffic for Tailscale IPs. logNetsB.AddPrefix(tsaddr.CGNATRange()) logNetsB.AddPrefix(tsaddr.TailscaleULARange()) - for _, prefix := range tsaddr.CGNatOverrideRange() { - if prefix.IsValid() { - logNetsB.AddPrefix(prefix) - - } - } + logNetsB.RemovePrefix(tsaddr.ChromeOSVMRange()) if haveNetmap { addrs = netMap.GetAddresses() for i := range addrs.Len() { diff --git a/kube/certs/certs_test.go b/kube/certs/certs_test.go index f3662f6c39ad4..27fe12752001c 100644 --- a/kube/certs/certs_test.go +++ b/kube/certs/certs_test.go @@ -12,7 +12,6 @@ import ( "tailscale.com/ipn" "tailscale.com/kube/localclient" "tailscale.com/tailcfg" - "tailscale.com/types/netmap" ) // TestEnsureCertLoops tests that the certManager correctly starts and stops @@ -201,17 +200,11 @@ func TestEnsureCertLoops(t *testing.T) { notifyChan := make(chan ipn.Notify) go func() { + // SelfChange wakes the cert manager; cert domains are + // then fetched via FakeLocalClient.CertDomainsResult. for { notifyChan <- ipn.Notify{ - NetMap: &netmap.NetworkMap{ - DNS: tailcfg.DNSConfig{ - CertDomains: []string{ - "my-app.tailnetxyz.ts.net", - "my-other-app.tailnetxyz.ts.net", - "my-apiserver.tailnetxyz.ts.net", - }, - }, - }, + SelfChange: &tailcfg.Node{StableID: "test"}, } } }() @@ -220,6 +213,11 @@ func TestEnsureCertLoops(t *testing.T) { FakeIPNBusWatcher: localclient.FakeIPNBusWatcher{ NotifyChan: notifyChan, }, + CertDomainsResult: []string{ + "my-app.tailnetxyz.ts.net", + "my-other-app.tailnetxyz.ts.net", + "my-apiserver.tailnetxyz.ts.net", + }, }, logf: log.Printf, certLoops: make(map[string]context.CancelFunc), diff --git a/net/tsaddr/tsaddr.go b/net/tsaddr/tsaddr.go index 7f01e54911f3c..1eac9eb77cfde 100644 --- a/net/tsaddr/tsaddr.go +++ b/net/tsaddr/tsaddr.go @@ -8,9 +8,7 @@ import ( "encoding/binary" "errors" "net/netip" - "os" "slices" - "strings" "sync" "go4.org/netipx" @@ -18,29 +16,16 @@ import ( "tailscale.com/types/views" ) -// CGNatOverrideRange returns the subset of CGNAT IPv$ range that -// is passed via env to RETURN traffic over the 100.64.0.0/10 DROP. -// Additionally, it returns the CGNAT IPv4 range used by ChromeOS -// to host containers and VMs. -// We avoid allocating Tailscale IPs from it, to avoid conflicts. -func CGNatOverrideRange() []netip.Prefix { - var CGNatOverrideRange []string - chromeOSRange := "100.115.92.0/23" - envRange := os.Getenv("TS_CGNAT_OVERRIDE_RANGE") - if envRange != "" { - CGNatOverrideRange = append(CGNatOverrideRange, strings.Split(envRange, ",")...) - } - CGNatOverrideRange = append(CGNatOverrideRange, chromeOSRange) - - cgNatOverrideRange.Do(func() { - for _, cidr := range CGNatOverrideRange { - mustPrefixSlice(&cgNatOverrideRange.v, strings.TrimSpace(cidr)) - } - }) - - return cgNatOverrideRange.v +// ChromeOSVMRange returns the subset of the CGNAT IPv4 range used by +// ChromeOS to interconnect the host OS to containers and VMs. We +// avoid allocating Tailscale IPs from it, to avoid conflicts. +func ChromeOSVMRange() netip.Prefix { + chromeOSRange.Do(func() { mustPrefix(&chromeOSRange.v, "100.115.92.0/23") }) + return chromeOSRange.v } +var chromeOSRange oncePrefix + // CGNATRange returns the Carrier Grade NAT address range that // is the superset range that Tailscale assigns out of. // See https://tailscale.com/s/cgnat @@ -51,13 +36,12 @@ func CGNATRange() netip.Prefix { } var ( - cgnatRange oncePrefix - tsUlaRange oncePrefix - tsViaRange oncePrefix - ula4To6Range oncePrefix - ulaEph6Range oncePrefix - serviceIPv6 oncePrefix - cgNatOverrideRange oncePrefixSlice + cgnatRange oncePrefix + tsUlaRange oncePrefix + tsViaRange oncePrefix + ula4To6Range oncePrefix + ulaEph6Range oncePrefix + serviceIPv6 oncePrefix ) // TailscaleServiceIP returns the IPv4 listen address of services @@ -94,7 +78,7 @@ func IsTailscaleIP(ip netip.Addr) bool { // IsTailscaleIPv4 reports whether an IPv4 IP is an IP address that // Tailscale assigns from. func IsTailscaleIPv4(ip netip.Addr) bool { - return CGNATRange().Contains(ip) && !PrefixesContainsIP(CGNatOverrideRange(), ip) + return CGNATRange().Contains(ip) && !ChromeOSVMRange().Contains(ip) } // TailscaleULARange returns the IPv6 Unique Local Address range that @@ -177,24 +161,11 @@ func mustPrefix(v *netip.Prefix, prefix string) { } } -func mustPrefixSlice(prefixes *[]netip.Prefix, cidr string) { - prefix, err := netip.ParsePrefix(cidr) - if err != nil { - panic(err) - } - *prefixes = append(*prefixes, prefix) -} - type oncePrefix struct { sync.Once v netip.Prefix } -type oncePrefixSlice struct { - sync.Once - v []netip.Prefix -} - // PrefixesContainsIP reports whether any prefix in ipp contains ip. func PrefixesContainsIP(ipp []netip.Prefix, ip netip.Addr) bool { for _, r := range ipp { diff --git a/util/linuxfw/cgnat_override.go b/util/linuxfw/cgnat_override.go new file mode 100644 index 0000000000000..5f47cc66b6655 --- /dev/null +++ b/util/linuxfw/cgnat_override.go @@ -0,0 +1,42 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build linux + +package linuxfw + +import ( + "net/netip" + "strings" + + "tailscale.com/envknob" +) + +// cgnatReturnRanges returns the set of CGNAT sub-ranges for which inbound +// off-Tailscale traffic should fall out of the Tailscale chain (RETURN) instead +// of being dropped by the CGNAT drop rule. +// +// The ranges are configured via the TS_CGNAT_OVERRIDE_RANGE environment +// variable as a comma-separated list of CIDRs (e.g. "100.96.0.0/11,100.120.0.0/14"). +// Invalid entries are ignored. This is a CoreWeave-specific extension to the +// upstream CGNAT drop behavior; see [iptablesRunner.AddExternalCGNATRules] and +// [nftablesRunner.AddExternalCGNATRules]. +func cgnatReturnRanges() []netip.Prefix { + v := envknob.String("TS_CGNAT_OVERRIDE_RANGE") + if v == "" { + return nil + } + var out []netip.Prefix + for _, s := range strings.Split(v, ",") { + s = strings.TrimSpace(s) + if s == "" { + continue + } + p, err := netip.ParsePrefix(s) + if err != nil || !p.IsValid() { + continue + } + out = append(out, p) + } + return out +} diff --git a/util/linuxfw/cgnat_override_test.go b/util/linuxfw/cgnat_override_test.go new file mode 100644 index 0000000000000..6e1d7f47e3a93 --- /dev/null +++ b/util/linuxfw/cgnat_override_test.go @@ -0,0 +1,73 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build linux + +package linuxfw + +import ( + "net/netip" + "reflect" + "slices" + "testing" + + "tailscale.com/net/tsaddr" +) + +func TestCGNATReturnRanges(t *testing.T) { + tests := []struct { + name string + env string + want []netip.Prefix + }{ + {"empty", "", nil}, + {"single", "100.96.0.0/11", []netip.Prefix{netip.MustParsePrefix("100.96.0.0/11")}}, + {"multiple_with_spaces", " 100.96.0.0/11 , 100.120.0.0/14 ", []netip.Prefix{ + netip.MustParsePrefix("100.96.0.0/11"), + netip.MustParsePrefix("100.120.0.0/14"), + }}, + {"skips_invalid", "garbage,100.96.0.0/11", []netip.Prefix{netip.MustParsePrefix("100.96.0.0/11")}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv("TS_CGNAT_OVERRIDE_RANGE", tt.env) + if got := cgnatReturnRanges(); !slices.Equal(got, tt.want) { + t.Errorf("cgnatReturnRanges() = %v, want %v", got, tt.want) + } + }) + } +} + +// TestBuildExternalCGNATRulesOverride verifies that the TS_CGNAT_OVERRIDE_RANGE +// ranges are emitted as RETURN rules in CGNATModeDrop, after the ChromeOS +// RETURN rule and before the CGNAT DROP rule, and that CGNATModeReturn is +// unaffected. +func TestBuildExternalCGNATRulesOverride(t *testing.T) { + t.Setenv("TS_CGNAT_OVERRIDE_RANGE", "100.96.0.0/11") + tunname := "tun0" + + gotDrop, err := buildExternalCGNATRules(CGNATModeDrop, tunname) + if err != nil { + t.Fatal(err) + } + wantDrop := [][]string{ + {"!", "-i", tunname, "-s", tsaddr.ChromeOSVMRange().String(), "-j", "RETURN"}, + {"!", "-i", tunname, "-s", "100.96.0.0/11", "-j", "RETURN"}, + {"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "DROP"}, + } + if !reflect.DeepEqual(gotDrop, wantDrop) { + t.Errorf("CGNATModeDrop rules =\n %v\nwant\n %v", gotDrop, wantDrop) + } + + // Override ranges only apply to the drop mode; return mode is unchanged. + gotReturn, err := buildExternalCGNATRules(CGNATModeReturn, tunname) + if err != nil { + t.Fatal(err) + } + wantReturn := [][]string{ + {"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "RETURN"}, + } + if !reflect.DeepEqual(gotReturn, wantReturn) { + t.Errorf("CGNATModeReturn rules =\n %v\nwant\n %v", gotReturn, wantReturn) + } +} diff --git a/util/linuxfw/iptables_runner.go b/util/linuxfw/iptables_runner.go index 117df3fc04c7f..3711195cfa75a 100644 --- a/util/linuxfw/iptables_runner.go +++ b/util/linuxfw/iptables_runner.go @@ -214,35 +214,8 @@ func (i *iptablesRunner) AddBase(tunname string) error { // addBase4 adds some basic IPv4 processing rules to be // supplemented by later calls to other helpers. func (i *iptablesRunner) addBase4(tunname string) error { - // Only allow CGNAT range traffic to come from tailscale0. There - // is an exception carved out for ranges used by ChromeOS, for - // which we fall out of the Tailscale chain, - // as well as specified addrsses from the TS_CGNAT_OVERRIDE_RANGE env var - // - // Note, this will definitely break nodes that end up using the - // CGNAT range for other purposes :(. - var args []string - var errs []error - CGNatOverrideRange := tsaddr.CGNatOverrideRange() - for _, prefix := range CGNatOverrideRange { - if prefix.IsValid() { - args = []string{"!", "-i", tunname, "-s", prefix.String(), "-j", "RETURN"} - if err := i.ipt4.Append("filter", "ts-input", args...); err != nil { - errs = append(errs, fmt.Errorf("adding %v in v4/filter/ts-input: %w", args, err)) - continue - } - } - if len(errs) > 0 { - return fmt.Errorf("encountered multiple errors: %v", errs) - } - } - args = []string{"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "DROP"} - if err := i.ipt4.Append("filter", "ts-input", args...); err != nil { - return fmt.Errorf("adding %v in v4/filter/ts-input: %w", args, err) - } - - // Explicitly allow all other inbound traffic to the tun interface - args = []string{"-i", tunname, "-j", "ACCEPT"} + // Explicitly allow all inbound traffic to the tun interface + args := []string{"-i", tunname, "-j", "ACCEPT"} if err := i.ipt4.Append("filter", "ts-input", args...); err != nil { return fmt.Errorf("adding %v in v4/filter/ts-input: %w", args, err) } @@ -694,6 +667,72 @@ func (i *iptablesRunner) DelMagicsockPortRule(port uint16, network string) error return nil } +// buildExternalCGNATRules abstracts out logic for constructing firewall rules +// for handling non-Tailscale CGNAT traffic, since these rules need to be +// identical across [AddExternalCGNATRules] and [DelExternalCGNATRules]. +func buildExternalCGNATRules(mode CGNATMode, tunname string) ([][]string, error) { + switch mode { + case CGNATModeDrop: + // Only allow CGNAT range traffic to come from the Tailscale interface. + // There is an exception carved out for ranges used by ChromeOS, and for + // any ranges configured via TS_CGNAT_OVERRIDE_RANGE (see + // cgnatReturnRanges), for which we fall out of the Tailscale chain. + rules := [][]string{ + {"!", "-i", tunname, "-s", tsaddr.ChromeOSVMRange().String(), "-j", "RETURN"}, + } + for _, p := range cgnatReturnRanges() { + rules = append(rules, []string{"!", "-i", tunname, "-s", p.String(), "-j", "RETURN"}) + } + rules = append(rules, []string{"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "DROP"}) + return rules, nil + case CGNATModeReturn: + // Fall out of the Tailscale chain for CGNAT traffic that doesn't + // originate from the Tailscale interface. + return [][]string{ + {"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "RETURN"}, + }, nil + default: + return nil, fmt.Errorf("unsupported mode %q", mode) + } +} + +// AddExternalCGNATRules adds rules to the ts-input chain to deal with +// traffic from the CGNAT range that arrives on non-Tailscale network +// interfaces. +func (i *iptablesRunner) AddExternalCGNATRules(mode CGNATMode, tunname string) error { + rules, err := buildExternalCGNATRules(mode, tunname) + if err != nil { + return fmt.Errorf("build cgnat mode rule: %v", err) + } + for _, rule := range rules { + if err := i.ipt4.Append("filter", "ts-input", rule...); err != nil { + return fmt.Errorf("adding %v in v4/filter/ts-input: %w", rule, err) + } + } + return nil +} + +// DelExternalCGNATRules removes the rules created by AddExternalCGNATRules, +// if they exist. +func (i *iptablesRunner) DelExternalCGNATRules(mode CGNATMode, tunname string) error { + rules, err := buildExternalCGNATRules(mode, tunname) + if err != nil { + return fmt.Errorf("build cgnat mode rule: %v", err) + } + for _, rule := range rules { + if found, err := i.ipt4.Exists("filter", "ts-input", rule...); err != nil { + return fmt.Errorf("checking for %v in v4/filter/ts-input: %w", rule, err) + } else if !found { + // Don't need to delete a rule that isn't there. + continue + } + if err := i.ipt4.Delete("filter", "ts-input", rule...); err != nil { + return fmt.Errorf("deleting %v in v4/filter/ts-input: %w", rule, err) + } + } + return nil +} + // delTSHook deletes hook in a chain that jumps to a ts-chain. If the hook does not // exist, it's a no-op since the desired state is already achieved but we log the // error because error code from the iptables module resists unwrapping. diff --git a/util/linuxfw/iptables_runner_test.go b/util/linuxfw/iptables_runner_test.go index 77c753004a770..b5a13fdba1bca 100644 --- a/util/linuxfw/iptables_runner_test.go +++ b/util/linuxfw/iptables_runner_test.go @@ -126,8 +126,6 @@ func TestAddAndDeleteBase(t *testing.T) { // Check that the rules were created. tsRulesV4 := []fakeRule{ // table/chain/rule - {"filter", "ts-input", []string{"!", "-i", tunname, "-s", tsaddr.ChromeOSVMRange().String(), "-j", "RETURN"}}, - {"filter", "ts-input", []string{"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "DROP"}}, {"filter", "ts-forward", []string{"-o", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "DROP"}}, } @@ -504,3 +502,56 @@ func TestAddAndDelConnmarkSaveRule(t *testing.T) { } }) } + +func TestAddAndDelCGNATRules(t *testing.T) { + iptr := newFakeIPTablesRunner() + tunname := "tun0" + + // We need the chains to exist so we can add rules into them. + if err := iptr.AddChains(); err != nil { + t.Fatal(err) + } + + tests := []struct { + mode CGNATMode + wantRules []fakeRule + }{ + { + CGNATModeDrop, []fakeRule{ + {"filter", "ts-input", []string{"!", "-i", tunname, "-s", tsaddr.ChromeOSVMRange().String(), "-j", "RETURN"}}, + {"filter", "ts-input", []string{"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "DROP"}}, + }, + }, + { + CGNATModeReturn, []fakeRule{ + {"filter", "ts-input", []string{"!", "-i", tunname, "-s", tsaddr.CGNATRange().String(), "-j", "RETURN"}}, + }, + }, + } + + for _, tt := range tests { + if err := iptr.AddExternalCGNATRules(tt.mode, tunname); err != nil { + t.Fatal(err) + } + + for _, tr := range tt.wantRules { + if exists, err := iptr.ipt4.Exists(tr.table, tr.chain, tr.args...); err != nil { + t.Fatalf("mode %q: error checking for rule: %v", tt.mode, err) + } else if !exists { + t.Errorf("mode %q: rule %s/%s/%s doesn't exist", tt.mode, tr.table, tr.chain, strings.Join(tr.args, " ")) + } + } + + if err := iptr.DelExternalCGNATRules(tt.mode, tunname); err != nil { + t.Fatal(err) + } + + for _, tr := range tt.wantRules { + if exists, err := iptr.ipt4.Exists(tr.table, tr.chain, tr.args...); err != nil { + t.Fatalf("mode %q: error checking for rule: %v", tt.mode, err) + } else if exists { + t.Errorf("mode %q: rule %s/%s/%s not deleted", tt.mode, tr.table, tr.chain, strings.Join(tr.args, " ")) + } + } + } +} diff --git a/util/linuxfw/nftables_runner.go b/util/linuxfw/nftables_runner.go index 7de9b6cb05edd..9f1cb82b563ba 100644 --- a/util/linuxfw/nftables_runner.go +++ b/util/linuxfw/nftables_runner.go @@ -20,7 +20,6 @@ import ( "golang.org/x/sys/unix" "tailscale.com/net/tsaddr" "tailscale.com/types/logger" - "tailscale.com/types/ptr" ) const ( @@ -454,8 +453,13 @@ func getOrCreateChain(c *nftables.Conn, cinfo chainInfo) (*nftables.Chain, error // type/hook/priority, but for "conventional chains" assume they're what // we expect (in case iptables-nft/ufw make minor behavior changes in // the future). - if isTSChain(chain.Name) && (chain.Type != cinfo.chainType || *chain.Hooknum != *cinfo.chainHook || *chain.Priority != *cinfo.chainPriority) { - return nil, fmt.Errorf("chain %s already exists with different type/hook/priority", cinfo.name) + if isTSChain(chain.Name) { + if chain.Hooknum == nil || chain.Priority == nil { + return nil, errors.New("nftables chain has nil hooknum or priority; kernel may lack nftables support (CONFIG_NF_TABLES)") + } + if chain.Type != cinfo.chainType || *chain.Hooknum != *cinfo.chainHook || *chain.Priority != *cinfo.chainPriority { + return nil, fmt.Errorf("chain %s already exists with different type/hook/priority", cinfo.name) + } } return chain, nil } @@ -589,6 +593,15 @@ type NetfilterRunner interface { // DelMagicsockPortRule removes the rule created by AddMagicsockPortRule, // if it exists. DelMagicsockPortRule(port uint16, network string) error + + // AddExternalCGNATRules adds rules to the ts-input chain to deal with + // traffic from the CGNAT range that arrives on non-Tailscale network + // interfaces. + AddExternalCGNATRules(mode CGNATMode, tunname string) error + + // DelExternalCGNATRules removes the rules created by AddExternalCGNATRules, + // if they exist. + DelExternalCGNATRules(mode CGNATMode, tunname string) error } // New creates a NetfilterRunner, auto-detecting whether to use @@ -955,7 +968,7 @@ const ( // via netfilter via nftables, as a last resort measure to detect that nftables // can be used. It cleans up the dummy chains after creation. func (n *nftablesRunner) createDummyPostroutingChains() (retErr error) { - polAccept := ptr.To(nftables.ChainPolicyAccept) + polAccept := new(nftables.ChainPolicyAccept) for _, table := range n.getTables() { nat, err := createTableIfNotExist(n.conn, table.Proto, tsDummyTableName) if err != nil { @@ -1203,28 +1216,78 @@ func createRangeRule( } -// addReturnCGNATOverrideRange adds a rule to return if the source IP -// CGNAT Override range. -func addReturnCGNATOverrideRange(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { - var errs []error - CGNatOverrideRange := tsaddr.CGNatOverrideRange() - for _, prefix := range CGNatOverrideRange { - if prefix.IsValid() { - rule, err := createRangeRule(table, chain, tunname, prefix, expr.VerdictReturn) - if err != nil { - errs = append(errs, fmt.Errorf("create rule for prefix %v: %w", prefix, err)) - continue - } - _ = c.AddRule(rule) - if err = c.Flush(); err != nil { - errs = append(errs, fmt.Errorf("add rule for prefix %v: %w", prefix, err)) - continue - } +// addReturnChromeOSVMRangeRule adds a rule to return if the source IP +// is in the ChromeOS VM range. +func addReturnChromeOSVMRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + rule, err := createRangeRule(table, chain, tunname, tsaddr.ChromeOSVMRange(), expr.VerdictReturn) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + _ = c.AddRule(rule) + if err = c.Flush(); err != nil { + return fmt.Errorf("add rule: %w", err) + } + return nil +} + +// delReturnChromeOSVMRangeRule deletes the rule created by addReturnChromeOSVMRangeRule, +// if it exists. +func delReturnChromeOSVMRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + rule, err := createRangeRule(table, chain, tunname, tsaddr.ChromeOSVMRange(), expr.VerdictReturn) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + rule, err = findRule(c, rule) + if err != nil { + return fmt.Errorf("find rule: %v", err) + } + if rule == nil { + return nil + } + _ = c.DelRule(rule) + if err := c.Flush(); err != nil { + return fmt.Errorf("flush del rule: %w", err) + } + return nil +} + +// addReturnCGNATOverrideRanges adds a RETURN rule for each CGNAT override range +// configured via TS_CGNAT_OVERRIDE_RANGE (see cgnatReturnRanges), so inbound +// off-Tailscale traffic in those ranges falls out of the Tailscale chain rather +// than being dropped by the CGNAT drop rule. +func addReturnCGNATOverrideRanges(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + for _, p := range cgnatReturnRanges() { + rule, err := createRangeRule(table, chain, tunname, p, expr.VerdictReturn) + if err != nil { + return fmt.Errorf("create rule for %v: %w", p, err) + } + _ = c.AddRule(rule) + if err = c.Flush(); err != nil { + return fmt.Errorf("add rule for %v: %w", p, err) } } + return nil +} - if len(errs) > 0 { - return fmt.Errorf("encountered multiple errors: %v", errs) +// delReturnCGNATOverrideRanges deletes the rules created by +// addReturnCGNATOverrideRanges, if they exist. +func delReturnCGNATOverrideRanges(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + for _, p := range cgnatReturnRanges() { + rule, err := createRangeRule(table, chain, tunname, p, expr.VerdictReturn) + if err != nil { + return fmt.Errorf("create rule for %v: %w", p, err) + } + rule, err = findRule(c, rule) + if err != nil { + return fmt.Errorf("find rule for %v: %v", p, err) + } + if rule == nil { + continue + } + _ = c.DelRule(rule) + if err := c.Flush(); err != nil { + return fmt.Errorf("flush del rule for %v: %w", p, err) + } } return nil } @@ -1243,6 +1306,62 @@ func addDropCGNATRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftab return nil } +// delDropCGNATRangeRule deletes the rule created by addDropCGNATRangeRule, +// if it exists. +func delDropCGNATRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + rule, err := createRangeRule(table, chain, tunname, tsaddr.CGNATRange(), expr.VerdictDrop) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + rule, err = findRule(c, rule) + if err != nil { + return fmt.Errorf("find rule: %v", err) + } + if rule == nil { + return nil + } + _ = c.DelRule(rule) + if err := c.Flush(); err != nil { + return fmt.Errorf("flush del rule: %w", err) + } + return nil +} + +// addReturnCGNATRangeRule adds a rule to return if the source IP is in the +// CGNAT range. +func addReturnCGNATRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + rule, err := createRangeRule(table, chain, tunname, tsaddr.CGNATRange(), expr.VerdictReturn) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + _ = c.AddRule(rule) + if err = c.Flush(); err != nil { + return fmt.Errorf("add rule: %w", err) + } + return nil +} + +// delReturnCGNATRangeRule deletes the rule created by addReturnCGNATRangeRule, +// if it exists. +func delReturnCGNATRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error { + rule, err := createRangeRule(table, chain, tunname, tsaddr.CGNATRange(), expr.VerdictReturn) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + rule, err = findRule(c, rule) + if err != nil { + return fmt.Errorf("find rule: %v", err) + } + if rule == nil { + return nil + } + _ = c.DelRule(rule) + if err := c.Flush(); err != nil { + return fmt.Errorf("flush del rule: %w", err) + } + return nil +} + // createSetSubnetRouteMarkRule creates a rule to set the subnet route // mark if the packet is from the given interface. func createSetSubnetRouteMarkRule(table *nftables.Table, chain *nftables.Chain, tunname string) (*nftables.Rule, error) { @@ -1510,6 +1629,73 @@ func (n *nftablesRunner) DelMagicsockPortRule(port uint16, network string) error return nil } +// AddExternalCGNATRules adds rules to the ts-input chain to deal with +// traffic from the CGNAT range that arrives on non-Tailscale network +// interfaces. +func (n *nftablesRunner) AddExternalCGNATRules(mode CGNATMode, tunname string) error { + conn := n.conn + + inputChain, err := getChainFromTable(conn, n.nft4.Filter, chainNameInput) + if err != nil { + return fmt.Errorf("get input chain v4: %v", err) + } + switch mode { + case CGNATModeDrop: + if err = addReturnChromeOSVMRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("add return chromeos vm range rule v4: %w", err) + } + if err = addReturnCGNATOverrideRanges(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("add return cgnat override ranges v4: %w", err) + } + if err = addDropCGNATRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("add drop cgnat range rule v4: %w", err) + } + case CGNATModeReturn: + if err = addReturnCGNATRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("add return cgnat range rule v4: %w", err) + } + default: + return fmt.Errorf("unsupported cgnat mode %q", mode) + } + if err = conn.Flush(); err != nil { + return fmt.Errorf("flush cgnat rules v4: %w", err) + } + return nil +} + +// DelExternalCGNATRules removes the rules created by AddExternalCGNATRules, +// if they exist. +func (n *nftablesRunner) DelExternalCGNATRules(mode CGNATMode, tunname string) error { + conn := n.conn + + inputChain, err := getChainFromTable(conn, n.nft4.Filter, chainNameInput) + if err != nil { + return fmt.Errorf("get input chain v4: %v", err) + } + switch mode { + case CGNATModeDrop: + if err = delReturnChromeOSVMRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("del return chromeos vm range rule v4: %w", err) + } + if err = delReturnCGNATOverrideRanges(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("del return cgnat override ranges v4: %w", err) + } + if err = delDropCGNATRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("del drop cgnat range rule v4: %w", err) + } + case CGNATModeReturn: + if err = delReturnCGNATRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { + return fmt.Errorf("del return cgnat range rule v4: %w", err) + } + default: + return fmt.Errorf("unsupported mode %q", mode) + } + if err = conn.Flush(); err != nil { + return fmt.Errorf("flush cgnat rules v4: %w", err) + } + return nil +} + // createAcceptIncomingPacketRule creates a rule to accept incoming packets to // the given interface. func createAcceptIncomingPacketRule(table *nftables.Table, chain *nftables.Chain, tunname string) *nftables.Rule { @@ -1563,12 +1749,6 @@ func (n *nftablesRunner) addBase4(tunname string) error { if err != nil { return fmt.Errorf("get input chain v4: %v", err) } - if err = addReturnCGNATOverrideRange(conn, n.nft4.Filter, inputChain, tunname); err != nil { - return fmt.Errorf("add return cgnat override range rule v4: %w", err) - } - if err = addDropCGNATRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { - return fmt.Errorf("add drop cgnat range rule v4: %w", err) - } if err = addAcceptIncomingPacketRule(conn, n.nft4.Filter, inputChain, tunname); err != nil { return fmt.Errorf("add accept incoming packet rule v4: %w", err) }