From dfa7f1c9640b2de71499e353f73bd1ce79206133 Mon Sep 17 00:00:00 2001 From: "Scion Agent (skill-bank-rate-limit-dev)" Date: Sun, 28 Jun 2026 09:38:12 +0000 Subject: [PATCH 1/3] feat(agent): add retry with backoff and resolution cache for GitHub skill resolver When agent templates reference many skills via gh:// URIs, provisioning hits GitHub API rate limits causing agents to fail. This adds two mitigations: 1. Retry with exponential backoff: All GitHub API calls in the skill resolver (resolveCommitSHA, listContents, downloadRawFile) now retry on 429, 403 rate-limit, and 5xx responses with exponential backoff. Respects Retry-After and X-RateLimit-Reset headers when present. 2. Resolution cache with TTL: A file-backed TTL cache (5 min default) stores the mapping from skill URI to resolved skill metadata (commit SHA, file listings, hashes). On cache hit, all GitHub API calls are skipped entirely. This works alongside the existing CachingSkillResolver which caches file content by hash for longer-term reuse. Together these form a two-layer cache: resolution cache (short TTL, tracks branch tip movement) and content cache (long-lived, content-addressed). --- pkg/agent/github_resolution_cache.go | 155 +++++++++++++++ pkg/agent/github_resolution_cache_test.go | 151 +++++++++++++++ pkg/agent/github_skill_resolver.go | 155 +++++++++++++-- pkg/agent/github_skill_resolver_test.go | 219 ++++++++++++++++++++++ 4 files changed, 667 insertions(+), 13 deletions(-) create mode 100644 pkg/agent/github_resolution_cache.go create mode 100644 pkg/agent/github_resolution_cache_test.go diff --git a/pkg/agent/github_resolution_cache.go b/pkg/agent/github_resolution_cache.go new file mode 100644 index 000000000..065858a42 --- /dev/null +++ b/pkg/agent/github_resolution_cache.go @@ -0,0 +1,155 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package agent + +import ( + "encoding/json" + "os" + "path/filepath" + "sync" + "time" + + "github.com/GoogleCloudPlatform/scion/pkg/config" + "github.com/GoogleCloudPlatform/scion/pkg/util" +) + +const ( + // DefaultResolutionCacheTTL is how long a cached resolution result is + // considered fresh. GitHub content can change, so this is a balance + // between freshness and avoiding rate limits. + DefaultResolutionCacheTTL = 5 * time.Minute + + resolutionCacheFileName = "github-resolution-cache.json" +) + +// GitHubResolutionCache caches the mapping from skill URI → ResolvedSkill +// to avoid redundant GitHub API calls during repeated provisioning. +// Entries expire after a configurable TTL. +type GitHubResolutionCache struct { + mu sync.RWMutex + dir string + ttl time.Duration + entries map[string]*resolutionCacheEntry + filePath string +} + +type resolutionCacheEntry struct { + Skill ResolvedSkill `json:"skill"` + CachedAt time.Time `json:"cachedAt"` + ExpiresAt time.Time `json:"expiresAt"` +} + +type resolutionCacheFile struct { + Entries map[string]*resolutionCacheEntry `json:"entries"` +} + +// NewGitHubResolutionCache creates or loads a resolution cache at the +// given directory with the specified TTL. +func NewGitHubResolutionCache(dir string, ttl time.Duration) (*GitHubResolutionCache, error) { + if err := os.MkdirAll(dir, 0755); err != nil { + return nil, err + } + c := &GitHubResolutionCache{ + dir: dir, + ttl: ttl, + entries: make(map[string]*resolutionCacheEntry), + filePath: filepath.Join(dir, resolutionCacheFileName), + } + c.load() + return c, nil +} + +// Get returns a cached ResolvedSkill for the given URI if it exists +// and has not expired. +func (c *GitHubResolutionCache) Get(uri string) (ResolvedSkill, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + + entry, ok := c.entries[uri] + if !ok { + return ResolvedSkill{}, false + } + if time.Now().After(entry.ExpiresAt) { + return ResolvedSkill{}, false + } + return entry.Skill, true +} + +// Put stores a resolved skill in the cache. +func (c *GitHubResolutionCache) Put(uri string, skill ResolvedSkill) { + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + c.entries[uri] = &resolutionCacheEntry{ + Skill: skill, + CachedAt: now, + ExpiresAt: now.Add(c.ttl), + } + c.evictExpired() + c.save() +} + +// load reads the cache from disk. Best-effort: errors are silently ignored. +func (c *GitHubResolutionCache) load() { + data, err := os.ReadFile(c.filePath) + if err != nil { + return + } + var f resolutionCacheFile + if err := json.Unmarshal(data, &f); err != nil { + return + } + if f.Entries == nil { + return + } + now := time.Now() + for uri, entry := range f.Entries { + if now.Before(entry.ExpiresAt) { + c.entries[uri] = entry + } + } + util.Debugf("github: loaded %d resolution cache entries from disk", len(c.entries)) +} + +// save persists the cache to disk. Best-effort. +func (c *GitHubResolutionCache) save() { + f := resolutionCacheFile{Entries: c.entries} + data, err := json.MarshalIndent(f, "", " ") + if err != nil { + return + } + _ = os.WriteFile(c.filePath, data, 0644) +} + +// evictExpired removes expired entries. Must be called with lock held. +func (c *GitHubResolutionCache) evictExpired() { + now := time.Now() + for uri, entry := range c.entries { + if now.After(entry.ExpiresAt) { + delete(c.entries, uri) + } + } +} + +// githubResolutionCacheDir returns the directory for storing GitHub +// resolution cache files. +func githubResolutionCacheDir() (string, error) { + globalDir, err := config.GetGlobalDir() + if err != nil { + return "", err + } + return filepath.Join(globalDir, "cache", "github-resolution"), nil +} diff --git a/pkg/agent/github_resolution_cache_test.go b/pkg/agent/github_resolution_cache_test.go new file mode 100644 index 000000000..751c971bc --- /dev/null +++ b/pkg/agent/github_resolution_cache_test.go @@ -0,0 +1,151 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package agent + +import ( + "os" + "path/filepath" + "testing" + "time" +) + +func TestGitHubResolutionCache_PutAndGet(t *testing.T) { + dir := t.TempDir() + cache, err := NewGitHubResolutionCache(dir, 5*time.Minute) + if err != nil { + t.Fatalf("NewGitHubResolutionCache: %v", err) + } + + skill := ResolvedSkill{ + Name: "my-skill", + URI: "gh://owner/repo/my-skill@main", + Version: "abc123def456", + Hash: "sha256:deadbeef", + Files: []ResolvedFile{ + {Path: "SKILL.md", URL: "https://example.com/SKILL.md", Hash: "sha256:abc", Size: 42}, + }, + } + + cache.Put("gh://owner/repo/my-skill@main", skill) + + got, ok := cache.Get("gh://owner/repo/my-skill@main") + if !ok { + t.Fatal("expected cache hit, got miss") + } + if got.Name != "my-skill" { + t.Errorf("expected name my-skill, got %s", got.Name) + } + if got.Hash != "sha256:deadbeef" { + t.Errorf("expected hash sha256:deadbeef, got %s", got.Hash) + } + if len(got.Files) != 1 { + t.Fatalf("expected 1 file, got %d", len(got.Files)) + } +} + +func TestGitHubResolutionCache_Miss(t *testing.T) { + dir := t.TempDir() + cache, err := NewGitHubResolutionCache(dir, 5*time.Minute) + if err != nil { + t.Fatalf("NewGitHubResolutionCache: %v", err) + } + + _, ok := cache.Get("gh://owner/repo/nonexistent@main") + if ok { + t.Fatal("expected cache miss, got hit") + } +} + +func TestGitHubResolutionCache_Expiry(t *testing.T) { + dir := t.TempDir() + cache, err := NewGitHubResolutionCache(dir, 1*time.Millisecond) + if err != nil { + t.Fatalf("NewGitHubResolutionCache: %v", err) + } + + skill := ResolvedSkill{ + Name: "expiring-skill", + URI: "gh://owner/repo/expiring@main", + } + cache.Put("gh://owner/repo/expiring@main", skill) + + // Wait for expiry + time.Sleep(5 * time.Millisecond) + + _, ok := cache.Get("gh://owner/repo/expiring@main") + if ok { + t.Fatal("expected cache miss after expiry, got hit") + } +} + +func TestGitHubResolutionCache_PersistAndReload(t *testing.T) { + dir := t.TempDir() + cache, err := NewGitHubResolutionCache(dir, 5*time.Minute) + if err != nil { + t.Fatalf("NewGitHubResolutionCache: %v", err) + } + + skill := ResolvedSkill{ + Name: "persist-skill", + URI: "gh://owner/repo/persist@main", + Version: "abc123def456", + Hash: "sha256:persist", + } + cache.Put("gh://owner/repo/persist@main", skill) + + // Verify file exists on disk + cacheFile := filepath.Join(dir, resolutionCacheFileName) + if _, err := os.Stat(cacheFile); err != nil { + t.Fatalf("cache file not persisted: %v", err) + } + + // Create a new cache instance from the same directory + cache2, err := NewGitHubResolutionCache(dir, 5*time.Minute) + if err != nil { + t.Fatalf("NewGitHubResolutionCache (reload): %v", err) + } + + got, ok := cache2.Get("gh://owner/repo/persist@main") + if !ok { + t.Fatal("expected cache hit after reload, got miss") + } + if got.Name != "persist-skill" { + t.Errorf("expected name persist-skill, got %s", got.Name) + } +} + +func TestGitHubResolutionCache_ExpiredNotLoaded(t *testing.T) { + dir := t.TempDir() + cache, err := NewGitHubResolutionCache(dir, 1*time.Millisecond) + if err != nil { + t.Fatalf("NewGitHubResolutionCache: %v", err) + } + + skill := ResolvedSkill{Name: "expired-skill", URI: "gh://o/r/s@main"} + cache.Put("gh://o/r/s@main", skill) + + time.Sleep(5 * time.Millisecond) + + // Reload — expired entries should not be loaded + cache2, err := NewGitHubResolutionCache(dir, 5*time.Minute) + if err != nil { + t.Fatalf("NewGitHubResolutionCache (reload): %v", err) + } + + _, ok := cache2.Get("gh://o/r/s@main") + if ok { + t.Fatal("expected expired entry to not be loaded, got hit") + } +} diff --git a/pkg/agent/github_skill_resolver.go b/pkg/agent/github_skill_resolver.go index cb5287e34..ab9dbf48b 100644 --- a/pkg/agent/github_skill_resolver.go +++ b/pkg/agent/github_skill_resolver.go @@ -20,14 +20,17 @@ import ( "encoding/json" "fmt" "io" + "math" "net/http" "net/url" "os" + "strconv" "strings" "time" "github.com/GoogleCloudPlatform/scion/pkg/api" "github.com/GoogleCloudPlatform/scion/pkg/transfer" + "github.com/GoogleCloudPlatform/scion/pkg/util" ) const ( @@ -35,25 +38,38 @@ const ( githubRawBase = "https://raw.githubusercontent.com" githubAPITimeout = 30 * time.Second githubMaxFileSize = 10 * 1024 * 1024 // 10MB per file + + githubMaxRetries = 4 + githubBaseBackoff = 1 * time.Second + githubMaxBackoff = 30 * time.Second + githubBackoffFactor = 2.0 ) // GitHubSkillResolver resolves skills from GitHub repositories // using the GitHub Contents API. type GitHubSkillResolver struct { - httpClient *http.Client - token string // GITHUB_TOKEN for authenticated requests - apiBase string // Default: githubAPIBase, override in tests - rawBase string // Default: githubRawBase, override in tests + httpClient *http.Client + token string // GITHUB_TOKEN for authenticated requests + apiBase string // Default: githubAPIBase, override in tests + rawBase string // Default: githubRawBase, override in tests + resolutionCache *GitHubResolutionCache } // NewGitHubSkillResolver creates a resolver for gh:// and GitHub URL skills. // Reads GITHUB_TOKEN from environment for authenticated API access. +// If a resolution cache directory is available, cached resolution results +// are reused to avoid redundant GitHub API calls. func NewGitHubSkillResolver() *GitHubSkillResolver { + var cache *GitHubResolutionCache + if cacheDir, err := githubResolutionCacheDir(); err == nil { + cache, _ = NewGitHubResolutionCache(cacheDir, DefaultResolutionCacheTTL) + } return &GitHubSkillResolver{ - httpClient: &http.Client{Timeout: githubAPITimeout}, - token: os.Getenv("GITHUB_TOKEN"), - apiBase: githubAPIBase, - rawBase: githubRawBase, + httpClient: &http.Client{Timeout: githubAPITimeout}, + token: os.Getenv("GITHUB_TOKEN"), + apiBase: githubAPIBase, + rawBase: githubRawBase, + resolutionCache: cache, } } @@ -85,6 +101,16 @@ func (r *GitHubSkillResolver) Resolve(ctx context.Context, refs []api.SkillRefer } func (r *GitHubSkillResolver) resolveOne(ctx context.Context, ghRef *GitHubSkillRef, ref api.SkillReference) (*ResolvedSkill, error) { + // Check resolution cache first + if r.resolutionCache != nil { + if cached, ok := r.resolutionCache.Get(ref.URI); ok { + util.Debugf("github: resolution cache hit for %s", ref.URI) + result := cached + result.As = ref.As + return &result, nil + } + } + commitSHA, err := r.resolveCommitSHA(ctx, ghRef) if err != nil { return nil, fmt.Errorf("failed to resolve ref for %s: %w", ghRef.Raw, err) @@ -136,14 +162,21 @@ func (r *GitHubSkillResolver) resolveOne(ctx context.Context, ghRef *GitHubSkill bundleHash := transfer.ComputeContentHash(fileInfos) - return &ResolvedSkill{ + resolved := &ResolvedSkill{ Name: ghRef.SkillName, URI: ghRef.Raw, As: ref.As, Version: commitSHA[:12], Hash: bundleHash, Files: resolvedFiles, - }, nil + } + + // Store in resolution cache + if r.resolutionCache != nil { + r.resolutionCache.Put(ref.URI, *resolved) + } + + return resolved, nil } // githubContentEntry is the JSON structure returned by the GitHub Contents API. @@ -170,7 +203,7 @@ func (r *GitHubSkillResolver) resolveCommitSHA(ctx context.Context, ghRef *GitHu req.Header.Set("Accept", "application/vnd.github.v3.sha") r.setAuthHeader(req) - resp, err := r.httpClient.Do(req) + resp, err := r.doWithRetry(ctx, req) if err != nil { return "", fmt.Errorf("GitHub API request failed: %w", err) } @@ -206,7 +239,7 @@ func (r *GitHubSkillResolver) listContents(ctx context.Context, ghRef *GitHubSki req.Header.Set("Accept", "application/vnd.github.v3+json") r.setAuthHeader(req) - resp, err := r.httpClient.Do(req) + resp, err := r.doWithRetry(ctx, req) if err != nil { return nil, fmt.Errorf("GitHub API request failed: %w", err) } @@ -237,7 +270,7 @@ func (r *GitHubSkillResolver) downloadRawFile(ctx context.Context, ghRef *GitHub } r.setAuthHeader(req) - resp, err := r.httpClient.Do(req) + resp, err := r.doWithRetry(ctx, req) if err != nil { return nil, fmt.Errorf("download failed: %w", err) } @@ -276,6 +309,102 @@ func (r *GitHubSkillResolver) setAuthHeader(req *http.Request) { } } +// doWithRetry executes an HTTP request with retry and exponential backoff +// for rate-limited (403 with X-RateLimit-Remaining: 0, 429) and transient +// server errors (5xx). On retryable responses it respects the Retry-After +// header when present. +func (r *GitHubSkillResolver) doWithRetry(ctx context.Context, req *http.Request) (*http.Response, error) { + var lastResp *http.Response + var lastErr error + + for attempt := 0; attempt <= githubMaxRetries; attempt++ { + if attempt > 0 { + delay := retryDelay(lastResp, attempt) + util.Debugf("github: retrying request (attempt %d/%d) after %v: %s %s", + attempt, githubMaxRetries, delay, req.Method, req.URL.Path) + + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(delay): + } + } + + // Clone the request for retries (body is nil for GET requests) + cloned := req.Clone(ctx) + resp, err := r.httpClient.Do(cloned) + if err != nil { + lastErr = err + // Network errors are retryable + continue + } + + if !isRetryableResponse(resp) { + return resp, nil + } + + // Drain and close body so the connection can be reused + _, _ = io.Copy(io.Discard, resp.Body) + resp.Body.Close() + lastResp = resp + lastErr = nil + } + + if lastErr != nil { + return nil, lastErr + } + // All retries exhausted — re-execute to return the final response for + // the caller to handle (e.g. apiError extraction). + return r.httpClient.Do(req.Clone(ctx)) +} + +// isRetryableResponse returns true for HTTP responses that should be retried: +// 429 (Too Many Requests), 403 with rate-limit exhaustion, and 5xx server errors. +func isRetryableResponse(resp *http.Response) bool { + if resp.StatusCode == http.StatusTooManyRequests { + return true + } + if resp.StatusCode == http.StatusForbidden && resp.Header.Get("X-RateLimit-Remaining") == "0" { + return true + } + if resp.StatusCode >= 500 { + return true + } + return false +} + +// retryDelay calculates the backoff duration for a retry attempt. +// Uses the Retry-After header when present, otherwise exponential backoff. +func retryDelay(resp *http.Response, attempt int) time.Duration { + if resp != nil { + if ra := resp.Header.Get("Retry-After"); ra != "" { + if seconds, err := strconv.Atoi(ra); err == nil && seconds >= 0 { + d := time.Duration(seconds) * time.Second + if d > githubMaxBackoff { + d = githubMaxBackoff + } + return d + } + } + // For rate limits, check X-RateLimit-Reset (Unix timestamp) + if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusTooManyRequests { + if resetStr := resp.Header.Get("X-RateLimit-Reset"); resetStr != "" { + if resetUnix, err := strconv.ParseInt(resetStr, 10, 64); err == nil { + wait := time.Until(time.Unix(resetUnix, 0)) + if wait > 0 && wait <= githubMaxBackoff { + return wait + } + } + } + } + } + backoff := time.Duration(float64(githubBaseBackoff) * math.Pow(githubBackoffFactor, float64(attempt-1))) + if backoff > githubMaxBackoff { + backoff = githubMaxBackoff + } + return backoff +} + func (r *GitHubSkillResolver) apiError(resp *http.Response, action string) error { body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) if resp.StatusCode == http.StatusForbidden && resp.Header.Get("X-RateLimit-Remaining") == "0" { diff --git a/pkg/agent/github_skill_resolver_test.go b/pkg/agent/github_skill_resolver_test.go index a8ee5cc34..4c50768a4 100644 --- a/pkg/agent/github_skill_resolver_test.go +++ b/pkg/agent/github_skill_resolver_test.go @@ -23,6 +23,7 @@ import ( "net/http/httptest" "strings" "testing" + "time" "github.com/GoogleCloudPlatform/scion/pkg/api" "github.com/GoogleCloudPlatform/scion/pkg/transfer" @@ -216,9 +217,12 @@ func TestGitHubSkillResolver_NotFound_SkillDir(t *testing.T) { func TestGitHubSkillResolver_RateLimit(t *testing.T) { server, mux := newTestGitHubServer(t) + attempts := 0 mux.HandleFunc("/repos/owner/repo/commits/main", func(w http.ResponseWriter, _ *http.Request) { + attempts++ w.Header().Set("X-RateLimit-Remaining", "0") w.Header().Set("X-RateLimit-Reset", "1700000000") + w.Header().Set("Retry-After", "0") w.WriteHeader(http.StatusForbidden) }) @@ -240,6 +244,146 @@ func TestGitHubSkillResolver_RateLimit(t *testing.T) { if !strings.Contains(result.Errors[0].Message, "GITHUB_TOKEN") { t.Errorf("expected error to mention GITHUB_TOKEN, got %s", result.Errors[0].Message) } + // Verify retries happened before the final rate-limit error + if attempts > 1 { + t.Logf("retried %d times before giving up (expected with backoff)", attempts-1) + } +} + +func TestGitHubSkillResolver_RetryOn429(t *testing.T) { + server, mux := newTestGitHubServer(t) + + attempts := 0 + mux.HandleFunc("/repos/owner/repo/commits/main", func(w http.ResponseWriter, r *http.Request) { + attempts++ + if attempts <= 2 { + w.Header().Set("Retry-After", "0") + w.WriteHeader(http.StatusTooManyRequests) + return + } + w.Write([]byte(testCommitSHA)) + }) + mux.HandleFunc("/repos/owner/repo/contents/skills/my-skill", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode([]githubContentEntry{ + {Name: "SKILL.md", Path: "skills/my-skill/SKILL.md", Type: "file", Size: 5}, + }) + }) + mux.HandleFunc("/raw/owner/repo/"+testCommitSHA+"/skills/my-skill/SKILL.md", func(w http.ResponseWriter, _ *http.Request) { + w.Write([]byte("hello")) + }) + + resolver := newTestGitHubResolver(server) + + result, err := resolver.Resolve(context.Background(), []api.SkillReference{ + {URI: "gh://owner/repo/my-skill@main"}, + }, ResolveOpts{}) + + if err != nil { + t.Fatalf("Resolve failed: %v", err) + } + if len(result.Errors) != 0 { + t.Fatalf("unexpected errors: %v", result.Errors) + } + if len(result.Resolved) != 1 { + t.Fatalf("expected 1 resolved skill, got %d", len(result.Resolved)) + } + if attempts < 3 { + t.Errorf("expected at least 3 attempts, got %d", attempts) + } +} + +func TestGitHubSkillResolver_RetryOn5xx(t *testing.T) { + server, mux := newTestGitHubServer(t) + + attempts := 0 + mux.HandleFunc("/repos/owner/repo/commits/main", func(w http.ResponseWriter, _ *http.Request) { + attempts++ + if attempts == 1 { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + w.Write([]byte(testCommitSHA)) + }) + mux.HandleFunc("/repos/owner/repo/contents/skills/my-skill", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode([]githubContentEntry{ + {Name: "SKILL.md", Path: "skills/my-skill/SKILL.md", Type: "file", Size: 5}, + }) + }) + mux.HandleFunc("/raw/owner/repo/"+testCommitSHA+"/skills/my-skill/SKILL.md", func(w http.ResponseWriter, _ *http.Request) { + w.Write([]byte("hello")) + }) + + resolver := newTestGitHubResolver(server) + + result, err := resolver.Resolve(context.Background(), []api.SkillReference{ + {URI: "gh://owner/repo/my-skill@main"}, + }, ResolveOpts{}) + + if err != nil { + t.Fatalf("Resolve failed: %v", err) + } + if len(result.Errors) != 0 { + t.Fatalf("unexpected errors: %v", result.Errors) + } + if attempts < 2 { + t.Errorf("expected at least 2 attempts, got %d", attempts) + } +} + +func TestGitHubSkillResolver_ResolutionCacheHit(t *testing.T) { + server, mux := newTestGitHubServer(t) + apiCalls := 0 + + mux.HandleFunc("/repos/owner/repo/commits/main", func(w http.ResponseWriter, _ *http.Request) { + apiCalls++ + w.Write([]byte(testCommitSHA)) + }) + mux.HandleFunc("/repos/owner/repo/contents/skills/my-skill", func(w http.ResponseWriter, _ *http.Request) { + apiCalls++ + json.NewEncoder(w).Encode([]githubContentEntry{ + {Name: "SKILL.md", Path: "skills/my-skill/SKILL.md", Type: "file", Size: 5}, + }) + }) + mux.HandleFunc("/raw/owner/repo/"+testCommitSHA+"/skills/my-skill/SKILL.md", func(w http.ResponseWriter, _ *http.Request) { + apiCalls++ + w.Write([]byte("hello")) + }) + + resolver := newTestGitHubResolver(server) + cache, err := NewGitHubResolutionCache(t.TempDir(), 5*time.Minute) + if err != nil { + t.Fatalf("cache creation failed: %v", err) + } + resolver.resolutionCache = cache + + // First call — should hit the API + result1, err := resolver.Resolve(context.Background(), []api.SkillReference{ + {URI: "gh://owner/repo/my-skill@main"}, + }, ResolveOpts{}) + if err != nil { + t.Fatalf("first Resolve failed: %v", err) + } + if len(result1.Resolved) != 1 { + t.Fatalf("expected 1 resolved skill, got %d", len(result1.Resolved)) + } + firstCallAPICalls := apiCalls + + // Second call — should use cache, no new API calls + result2, err := resolver.Resolve(context.Background(), []api.SkillReference{ + {URI: "gh://owner/repo/my-skill@main"}, + }, ResolveOpts{}) + if err != nil { + t.Fatalf("second Resolve failed: %v", err) + } + if len(result2.Resolved) != 1 { + t.Fatalf("expected 1 resolved skill on second call, got %d", len(result2.Resolved)) + } + if apiCalls != firstCallAPICalls { + t.Errorf("expected no new API calls on cache hit, but got %d additional calls", apiCalls-firstCallAPICalls) + } + if result2.Resolved[0].Name != result1.Resolved[0].Name { + t.Errorf("cached result name mismatch: %s vs %s", result2.Resolved[0].Name, result1.Resolved[0].Name) + } } func TestGitHubSkillResolver_InvalidURI(t *testing.T) { @@ -356,6 +500,81 @@ func TestGitHubSkillResolver_MixedBatch(t *testing.T) { } } +func TestIsRetryableResponse(t *testing.T) { + tests := []struct { + name string + statusCode int + headers map[string]string + want bool + }{ + {"429 is retryable", 429, nil, true}, + {"403 with rate limit is retryable", 403, map[string]string{"X-RateLimit-Remaining": "0"}, true}, + {"403 without rate limit is not retryable", 403, nil, false}, + {"500 is retryable", 500, nil, true}, + {"502 is retryable", 502, nil, true}, + {"503 is retryable", 503, nil, true}, + {"200 is not retryable", 200, nil, false}, + {"404 is not retryable", 404, nil, false}, + {"401 is not retryable", 401, nil, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + resp := &http.Response{ + StatusCode: tt.statusCode, + Header: make(http.Header), + } + for k, v := range tt.headers { + resp.Header.Set(k, v) + } + if got := isRetryableResponse(resp); got != tt.want { + t.Errorf("isRetryableResponse() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestRetryDelay(t *testing.T) { + t.Run("uses Retry-After header", func(t *testing.T) { + resp := &http.Response{ + StatusCode: 429, + Header: make(http.Header), + } + resp.Header.Set("Retry-After", "3") + got := retryDelay(resp, 1) + if got != 3*time.Second { + t.Errorf("expected 3s, got %v", got) + } + }) + + t.Run("caps Retry-After at max backoff", func(t *testing.T) { + resp := &http.Response{ + StatusCode: 429, + Header: make(http.Header), + } + resp.Header.Set("Retry-After", "120") + got := retryDelay(resp, 1) + if got != githubMaxBackoff { + t.Errorf("expected %v, got %v", githubMaxBackoff, got) + } + }) + + t.Run("exponential backoff without headers", func(t *testing.T) { + d1 := retryDelay(nil, 1) + d2 := retryDelay(nil, 2) + d3 := retryDelay(nil, 3) + if d1 != 1*time.Second { + t.Errorf("attempt 1: expected 1s, got %v", d1) + } + if d2 != 2*time.Second { + t.Errorf("attempt 2: expected 2s, got %v", d2) + } + if d3 != 4*time.Second { + t.Errorf("attempt 3: expected 4s, got %v", d3) + } + }) +} + type stubSkillResolver struct { result *ResolveResult } From 3f3e3dc54b8068ff7e853409bb8efdc3b0085ae9 Mon Sep 17 00:00:00 2001 From: "Scion Agent (skill-bank-rate-limit-fix)" Date: Sun, 28 Jun 2026 11:11:46 +0000 Subject: [PATCH 2/3] skill-bank: address Gemini review findings for retry and cache - Release lock before disk I/O in Put() by copying entries under lock - Fix extra HTTP request after retries exhausted in doWithRetry() - Deep copy Files slice in Get() to prevent data races - Use atomic write (tmp + rename) in save() - Cap rate limit reset wait at maxBackoff instead of falling back to shorter exponential backoff --- pkg/agent/github_resolution_cache.go | 32 ++++++++++++++++++++-------- pkg/agent/github_skill_resolver.go | 20 ++++++++--------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/pkg/agent/github_resolution_cache.go b/pkg/agent/github_resolution_cache.go index 065858a42..3016c9bae 100644 --- a/pkg/agent/github_resolution_cache.go +++ b/pkg/agent/github_resolution_cache.go @@ -72,7 +72,8 @@ func NewGitHubResolutionCache(dir string, ttl time.Duration) (*GitHubResolutionC } // Get returns a cached ResolvedSkill for the given URI if it exists -// and has not expired. +// and has not expired. The returned value is a deep copy safe for +// concurrent use. func (c *GitHubResolutionCache) Get(uri string) (ResolvedSkill, bool) { c.mu.RLock() defer c.mu.RUnlock() @@ -84,14 +85,17 @@ func (c *GitHubResolutionCache) Get(uri string) (ResolvedSkill, bool) { if time.Now().After(entry.ExpiresAt) { return ResolvedSkill{}, false } - return entry.Skill, true + skill := entry.Skill + if len(entry.Skill.Files) > 0 { + skill.Files = make([]ResolvedFile, len(entry.Skill.Files)) + copy(skill.Files, entry.Skill.Files) + } + return skill, true } // Put stores a resolved skill in the cache. func (c *GitHubResolutionCache) Put(uri string, skill ResolvedSkill) { c.mu.Lock() - defer c.mu.Unlock() - now := time.Now() c.entries[uri] = &resolutionCacheEntry{ Skill: skill, @@ -99,7 +103,13 @@ func (c *GitHubResolutionCache) Put(uri string, skill ResolvedSkill) { ExpiresAt: now.Add(c.ttl), } c.evictExpired() - c.save() + snapshot := make(map[string]*resolutionCacheEntry, len(c.entries)) + for k, v := range c.entries { + snapshot[k] = v + } + c.mu.Unlock() + + c.save(snapshot) } // load reads the cache from disk. Best-effort: errors are silently ignored. @@ -124,14 +134,18 @@ func (c *GitHubResolutionCache) load() { util.Debugf("github: loaded %d resolution cache entries from disk", len(c.entries)) } -// save persists the cache to disk. Best-effort. -func (c *GitHubResolutionCache) save() { - f := resolutionCacheFile{Entries: c.entries} +// save persists the given entries snapshot to disk atomically. Best-effort. +func (c *GitHubResolutionCache) save(entries map[string]*resolutionCacheEntry) { + f := resolutionCacheFile{Entries: entries} data, err := json.MarshalIndent(f, "", " ") if err != nil { return } - _ = os.WriteFile(c.filePath, data, 0644) + tmpPath := c.filePath + ".tmp" + if err := os.WriteFile(tmpPath, data, 0644); err != nil { + return + } + _ = os.Rename(tmpPath, c.filePath) } // evictExpired removes expired entries. Must be called with lock held. diff --git a/pkg/agent/github_skill_resolver.go b/pkg/agent/github_skill_resolver.go index ab9dbf48b..7d4d3ef17 100644 --- a/pkg/agent/github_skill_resolver.go +++ b/pkg/agent/github_skill_resolver.go @@ -330,12 +330,11 @@ func (r *GitHubSkillResolver) doWithRetry(ctx context.Context, req *http.Request } } - // Clone the request for retries (body is nil for GET requests) cloned := req.Clone(ctx) resp, err := r.httpClient.Do(cloned) if err != nil { lastErr = err - // Network errors are retryable + lastResp = nil continue } @@ -343,19 +342,17 @@ func (r *GitHubSkillResolver) doWithRetry(ctx context.Context, req *http.Request return resp, nil } - // Drain and close body so the connection can be reused + if attempt == githubMaxRetries { + return resp, nil + } + _, _ = io.Copy(io.Discard, resp.Body) resp.Body.Close() lastResp = resp lastErr = nil } - if lastErr != nil { - return nil, lastErr - } - // All retries exhausted — re-execute to return the final response for - // the caller to handle (e.g. apiError extraction). - return r.httpClient.Do(req.Clone(ctx)) + return nil, lastErr } // isRetryableResponse returns true for HTTP responses that should be retried: @@ -391,7 +388,10 @@ func retryDelay(resp *http.Response, attempt int) time.Duration { if resetStr := resp.Header.Get("X-RateLimit-Reset"); resetStr != "" { if resetUnix, err := strconv.ParseInt(resetStr, 10, 64); err == nil { wait := time.Until(time.Unix(resetUnix, 0)) - if wait > 0 && wait <= githubMaxBackoff { + if wait > 0 { + if wait > githubMaxBackoff { + return githubMaxBackoff + } return wait } } From dd16febd9dba7ddbb3dcebb91ccf5e665a6f8af6 Mon Sep 17 00:00:00 2001 From: "Scion Agent (ci-fix-impl)" Date: Sun, 28 Jun 2026 11:25:49 +0000 Subject: [PATCH 3/3] fix: address errcheck linter findings in github_skill_resolver Properly handle return values from resp.Body.Close(), w.Write(), and json.NewEncoder().Encode() calls. --- pkg/agent/github_skill_resolver.go | 2 +- pkg/agent/github_skill_resolver_test.go | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/agent/github_skill_resolver.go b/pkg/agent/github_skill_resolver.go index 7d4d3ef17..2e1dedba7 100644 --- a/pkg/agent/github_skill_resolver.go +++ b/pkg/agent/github_skill_resolver.go @@ -347,7 +347,7 @@ func (r *GitHubSkillResolver) doWithRetry(ctx context.Context, req *http.Request } _, _ = io.Copy(io.Discard, resp.Body) - resp.Body.Close() + _ = resp.Body.Close() lastResp = resp lastErr = nil } diff --git a/pkg/agent/github_skill_resolver_test.go b/pkg/agent/github_skill_resolver_test.go index 4c50768a4..381e390e0 100644 --- a/pkg/agent/github_skill_resolver_test.go +++ b/pkg/agent/github_skill_resolver_test.go @@ -261,15 +261,15 @@ func TestGitHubSkillResolver_RetryOn429(t *testing.T) { w.WriteHeader(http.StatusTooManyRequests) return } - w.Write([]byte(testCommitSHA)) + _, _ = w.Write([]byte(testCommitSHA)) }) mux.HandleFunc("/repos/owner/repo/contents/skills/my-skill", func(w http.ResponseWriter, _ *http.Request) { - json.NewEncoder(w).Encode([]githubContentEntry{ + _ = json.NewEncoder(w).Encode([]githubContentEntry{ {Name: "SKILL.md", Path: "skills/my-skill/SKILL.md", Type: "file", Size: 5}, }) }) mux.HandleFunc("/raw/owner/repo/"+testCommitSHA+"/skills/my-skill/SKILL.md", func(w http.ResponseWriter, _ *http.Request) { - w.Write([]byte("hello")) + _, _ = w.Write([]byte("hello")) }) resolver := newTestGitHubResolver(server) @@ -302,15 +302,15 @@ func TestGitHubSkillResolver_RetryOn5xx(t *testing.T) { w.WriteHeader(http.StatusServiceUnavailable) return } - w.Write([]byte(testCommitSHA)) + _, _ = w.Write([]byte(testCommitSHA)) }) mux.HandleFunc("/repos/owner/repo/contents/skills/my-skill", func(w http.ResponseWriter, _ *http.Request) { - json.NewEncoder(w).Encode([]githubContentEntry{ + _ = json.NewEncoder(w).Encode([]githubContentEntry{ {Name: "SKILL.md", Path: "skills/my-skill/SKILL.md", Type: "file", Size: 5}, }) }) mux.HandleFunc("/raw/owner/repo/"+testCommitSHA+"/skills/my-skill/SKILL.md", func(w http.ResponseWriter, _ *http.Request) { - w.Write([]byte("hello")) + _, _ = w.Write([]byte("hello")) }) resolver := newTestGitHubResolver(server) @@ -336,17 +336,17 @@ func TestGitHubSkillResolver_ResolutionCacheHit(t *testing.T) { mux.HandleFunc("/repos/owner/repo/commits/main", func(w http.ResponseWriter, _ *http.Request) { apiCalls++ - w.Write([]byte(testCommitSHA)) + _, _ = w.Write([]byte(testCommitSHA)) }) mux.HandleFunc("/repos/owner/repo/contents/skills/my-skill", func(w http.ResponseWriter, _ *http.Request) { apiCalls++ - json.NewEncoder(w).Encode([]githubContentEntry{ + _ = json.NewEncoder(w).Encode([]githubContentEntry{ {Name: "SKILL.md", Path: "skills/my-skill/SKILL.md", Type: "file", Size: 5}, }) }) mux.HandleFunc("/raw/owner/repo/"+testCommitSHA+"/skills/my-skill/SKILL.md", func(w http.ResponseWriter, _ *http.Request) { apiCalls++ - w.Write([]byte("hello")) + _, _ = w.Write([]byte("hello")) }) resolver := newTestGitHubResolver(server)