From c3dba74c288e7139db7e6ebdbeb37d7757c04dec Mon Sep 17 00:00:00 2001 From: "liqiankun.1111" Date: Thu, 25 Jun 2026 12:14:59 +0800 Subject: [PATCH 1/4] feat(llm): multi-model routing with availability fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an ordered model pool so a review falls over to another provider/model when the primary is rate-limited, down, or timing out — instead of failing the file. - config: new `routing` namespace — `routing.models` ([{provider, model}], priority order, reusing the existing `providers` map for credentials) and `routing.policy` (only "priority" today; reserved for future policies, an unknown value is rejected rather than silently ignored). Namespacing under `routing` keeps it distinct from providers..models (a provider's model catalog) and gives future routing knobs a home. - LLMRouter implements LLMClient: tries members in order, advances on availability errors (429/5xx/network), short-circuits on client-side errors (400/413/422) and context cancellation. A per-run shared cooldown parks a throttled model so concurrent per-file subtasks skip it. - router members use a low SDK retry budget so a rate-limited model fails fast to the next instead of burning the full backoff (MaxRetries now configurable; default 5 preserved). - docs: README.md / README.zh-CN.md config reference + Multi-model fallback. No `routing.models` keeps the current single-model behavior; `--model` pins a single endpoint. Tests cover fallover / short-circuit / exhaustion / cooldown, error classification, config chain resolution, and policy validation. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 28 ++++++ README.zh-CN.md | 28 ++++++ cmd/opencodereview/shared.go | 6 +- internal/llm/client.go | 132 +++++++++++++++++++++++++- internal/llm/resolver.go | 106 +++++++++++++++++++-- internal/llm/router_test.go | 176 +++++++++++++++++++++++++++++++++++ 6 files changed, 465 insertions(+), 11 deletions(-) create mode 100644 internal/llm/router_test.go diff --git a/README.md b/README.md index 770df031..ef49c076 100644 --- a/README.md +++ b/README.md @@ -559,6 +559,8 @@ Config file: `~/.opencodereview/config.json` | `providers..models` | array | Optional provider model list for interactive selection | | `providers..auth_header` | string | `x-api-key` \| `authorization` | | `custom_providers..*` | — | Same fields as `providers..*`, including optional `models` | +| `routing.models` | array | Ordered model pool for failover: `[{provider, model}]` (see [Multi-model fallback](#multi-model-fallback)) | +| `routing.policy` | string | Selection policy; `priority` (default, only value today) | | `llm.url` | string | `https://api.openai.com/v1/chat/completions` | | `llm.auth_token` | string | `sk-xxxxxxx` | | `llm.auth_header` | string | Anthropic only: `x-api-key` \| `authorization` | @@ -582,6 +584,32 @@ Environment variables take precedence over the config file. | `OCR_LLM_MODEL` | Model name | | `OCR_USE_ANTHROPIC` | `true` = Anthropic, `false` = OpenAI | +### Multi-model fallback + +By default a review uses a single model (`provider` + `model`). To survive rate limits and provider outages, configure an ordered `routing.models` pool — the reviewer tries each in order and falls over to the next when one is rate-limited, returns a server error, or times out: + +```json +{ + "providers": { + "anthropic": { "api_key": "sk-ant-...", "model": "claude-opus-4-6" }, + "deepseek": { "api_key": "sk-...", "model": "deepseek-v3" } + }, + "routing": { + "models": [ + { "provider": "anthropic", "model": "claude-opus-4-6" }, + { "provider": "deepseek", "model": "deepseek-v3" } + ], + "policy": "priority" + } +} +``` + +- Each entry references a configured provider (for credentials / endpoint) and a model; an omitted `model` uses the provider's default. +- `routing.policy` selects how the pool is ordered. Only `priority` is supported today (first entry is primary); the field is reserved for future policies (e.g. weighted), and an unknown value is rejected rather than silently ignored. +- A rate-limited or unavailable model is briefly parked so concurrent per-file reviews skip it instead of each re-hitting it. +- Failover triggers on availability errors (rate limit, 5xx, network/timeout). Client-side errors (bad request, payload too large) do **not** trigger failover, since another model would fail identically. +- Without `routing.models`, behavior is unchanged. `--model` pins a single model and bypasses the pool. + ## Telemetry diff --git a/README.zh-CN.md b/README.zh-CN.md index 0fb720e4..b98fda50 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -544,6 +544,8 @@ OCR 通过四层优先级链解析评审规则。每层采用首次匹配原则 | `providers..models` | array | 用于交互式选择的可选供应商模型列表 | | `providers..auth_header` | string | `x-api-key` \| `authorization` | | `custom_providers..*` | — | 与 `providers..*` 相同的字段,包括可选的 `models` | +| `routing.models` | array | 用于故障转移的有序模型池:`[{provider, model}]`(见[多模型故障转移](#多模型故障转移)) | +| `routing.policy` | string | 选择策略;`priority`(默认,目前唯一取值) | | `llm.url` | string | `https://api.openai.com/v1/chat/completions` | | `llm.auth_token` | string | `sk-xxxxxxx` | | `llm.auth_header` | string | 仅 Anthropic:`x-api-key` \| `authorization` | @@ -567,6 +569,32 @@ OCR 通过四层优先级链解析评审规则。每层采用首次匹配原则 | `OCR_LLM_MODEL` | 模型名称 | | `OCR_USE_ANTHROPIC` | `true` = Anthropic,`false` = OpenAI | +### 多模型故障转移 + +默认评审使用单一模型(`provider` + `model`)。为应对限流与供应商故障,可配置有序的 `routing.models` 池——评审按顺序尝试,当某个模型被限流、返回服务端错误或超时时,自动转移到下一个: + +```json +{ + "providers": { + "anthropic": { "api_key": "sk-ant-...", "model": "claude-opus-4-6" }, + "deepseek": { "api_key": "sk-...", "model": "deepseek-v3" } + }, + "routing": { + "models": [ + { "provider": "anthropic", "model": "claude-opus-4-6" }, + { "provider": "deepseek", "model": "deepseek-v3" } + ], + "policy": "priority" + } +} +``` + +- 每个条目引用一个已配置的供应商(提供凭据 / 端点)及一个模型;省略 `model` 时使用该供应商的默认模型。 +- `routing.policy` 决定池的排序方式。目前仅支持 `priority`(第一个为主模型);该字段为未来策略(如 weighted)预留,填入未知值会报错而非被静默忽略。 +- 被限流或不可用的模型会被短暂搁置,使并发的逐文件评审跳过它,而非各自重复命中。 +- 仅在可用性错误(限流、5xx、网络 / 超时)时转移。客户端错误(请求错误、负载过大)**不**触发转移,因为换个模型同样会失败。 +- 不配置 `routing.models` 时行为不变。`--model` 固定单一模型并绕过该池。 + ## 遥测 diff --git a/cmd/opencodereview/shared.go b/cmd/opencodereview/shared.go index d75dba1b..65d1c9bb 100644 --- a/cmd/opencodereview/shared.go +++ b/cmd/opencodereview/shared.go @@ -145,14 +145,14 @@ func loadLLMRuntime(tpl *template.Template, toolConfigPath, modelOverride string } tpl.ApplyLanguage(lang) - ep, err := llm.ResolveEndpointWithModelOverride(cfgPath, modelOverride) + eps, err := llm.ResolveModelsWithModelOverride(cfgPath, modelOverride) if err != nil { return nil, fmt.Errorf("resolve LLM endpoint: %w", err) } return &llmRuntime{ - Client: llm.NewLLMClient(ep), - Model: ep.Model, + Client: llm.NewLLMRouter(eps), + Model: eps[0].Model, PlanToolDefs: planToolDefs, MainToolDefs: mainToolDefs, Collector: tool.NewCommentCollector(), diff --git a/internal/llm/client.go b/internal/llm/client.go index 1773282e..eb887ecd 100644 --- a/internal/llm/client.go +++ b/internal/llm/client.go @@ -5,7 +5,9 @@ package llm import ( "context" "encoding/json" + "errors" "fmt" + "log" "strings" "sync" "time" @@ -185,6 +187,8 @@ type ClientConfig struct { AuthHeader string // Auth header name: "x-api-key", "authorization", or empty for protocol default Timeout time.Duration // Request timeout ExtraBody map[string]any // Vendor-specific fields merged into every request body + MaxRetries int // SDK in-provider retry budget; 0 → default. Lowered for router members so a + // rate-limited model fails fast to the next instead of burning the full backoff. } // --- Factory --- @@ -198,6 +202,7 @@ func NewLLMClient(ep ResolvedEndpoint) LLMClient { Model: ep.Model, AuthHeader: ep.AuthHeader, ExtraBody: ep.ExtraBody, + MaxRetries: ep.MaxRetries, } if ep.Protocol == "anthropic" { return NewAnthropicClient(cfg) @@ -205,6 +210,129 @@ func NewLLMClient(ep ResolvedEndpoint) LLMClient { return NewOpenAIClient(cfg) } +func maxRetriesOrDefault(n int) int { + if n > 0 { + return n + } + return 5 // SDK default budget when caller doesn't constrain it +} + +// --- Multi-model router --- + +// Tunables for LLMRouter. A router member that returns a fallover-worthy error is +// parked for routerCooldown so concurrent subtasks skip it instead of each re-hitting +// a model that's down/throttled. Members get a low retry budget so a rate-limited +// model fails fast to the next rather than burning the full SDK backoff. +const ( + routerMemberRetries = 2 + routerCooldown = 30 * time.Second +) + +type routerMember struct { + client LLMClient + label string // "protocol/model" for logs +} + +// LLMRouter is an LLMClient over an ordered pool of models. On a fallover-worthy +// failure (rate limit / 5xx / network) it advances to the next member; client-side +// errors (bad request / payload too large) short-circuit since another model would +// fail identically. Cooldown state is shared across concurrent CompletionsWithCtx +// calls (one ocr run's per-file subtasks), so a throttled model is skipped fleet-wide. +// Selection is strict priority order today; the order() seam is where a weighted / +// capability policy would plug in. +type LLMRouter struct { + members []routerMember + mu sync.Mutex + cooldown map[int]time.Time // member index → parked-until +} + +// NewLLMRouter builds an LLMClient from an ordered pool. A pool of one returns a +// plain client (no router overhead, unchanged single-model behavior). +func NewLLMRouter(eps []ResolvedEndpoint) LLMClient { + if len(eps) == 1 { + return NewLLMClient(eps[0]) + } + members := make([]routerMember, len(eps)) + for i, ep := range eps { + if ep.MaxRetries == 0 { + ep.MaxRetries = routerMemberRetries + } + members[i] = routerMember{client: NewLLMClient(ep), label: ep.Protocol + "/" + ep.Model} + } + return &LLMRouter{members: members, cooldown: make(map[int]time.Time)} +} + +func (r *LLMRouter) CompletionsWithCtx(ctx context.Context, req ChatRequest) (*ChatResponse, error) { + var lastErr error + for _, i := range r.order() { + resp, err := r.members[i].client.CompletionsWithCtx(ctx, req) + if err == nil { + return resp, nil + } + lastErr = err + if !shouldFallover(err) { + return nil, err + } + r.park(i) + log.Printf("[llm-router] %s failed (%v) — trying next model", r.members[i].label, err) + } + return nil, fmt.Errorf("all %d models exhausted; last error: %w", len(r.members), lastErr) +} + +// order returns member indices in priority order with non-parked first; parked ones +// are appended (not dropped) so an all-parked pool is still attempted as last resort. +func (r *LLMRouter) order() []int { + r.mu.Lock() + defer r.mu.Unlock() + now := time.Now() + live := make([]int, 0, len(r.members)) + parked := make([]int, 0) + for i := range r.members { + if t, ok := r.cooldown[i]; ok && now.Before(t) { + parked = append(parked, i) + } else { + live = append(live, i) + } + } + return append(live, parked...) +} + +func (r *LLMRouter) park(i int) { + r.mu.Lock() + r.cooldown[i] = time.Now().Add(routerCooldown) + r.mu.Unlock() +} + +// shouldFallover reports whether err warrants trying the next model. Availability +// failures (rate limit, server, network) → yes; a caller-cancelled context or a +// client-side request error (same payload fails on every model) → no. +func shouldFallover(err error) bool { + if err == nil { + return false + } + if errors.Is(err, context.Canceled) { + return false + } + var aerr *anthropic.Error + if errors.As(err, &aerr) { + return falloverStatus(aerr.StatusCode) + } + var oerr *openai.Error + if errors.As(err, &oerr) { + return falloverStatus(oerr.StatusCode) + } + return true // unknown (network blip / timeout / parse) → next model may succeed +} + +func falloverStatus(code int) bool { + switch code { + case 400, 413, 422: + return false // bad request / payload too large / unprocessable: deterministic across models + default: + return true // 401/403/404/408/409/429/5xx: a different provider/key/capacity may differ + } +} + // --- Token counting with tiktoken --- // modelTokenizerCache caches initialized tiktoken encoders keyed by encoding name. @@ -296,7 +424,7 @@ func NewOpenAIClient(cfg ClientConfig) *OpenAIClient { sdk: openai.NewClient( openaiopt.WithAPIKey(cfg.APIKey), openaiopt.WithBaseURL(sdkBaseURL), - openaiopt.WithMaxRetries(5), + openaiopt.WithMaxRetries(maxRetriesOrDefault(cfg.MaxRetries)), openaiopt.WithHeader("User-Agent", userAgent("")), openaiopt.WithRequestTimeout(cfg.Timeout), ), @@ -492,7 +620,7 @@ func NewAnthropicClient(cfg ClientConfig) *AnthropicClient { opts := []option.RequestOption{ option.WithBaseURL(sdkBaseURL), - option.WithMaxRetries(5), + option.WithMaxRetries(maxRetriesOrDefault(cfg.MaxRetries)), option.WithHeader("User-Agent", userAgent("claude")), option.WithRequestTimeout(cfg.Timeout), } diff --git a/internal/llm/resolver.go b/internal/llm/resolver.go index 5e2432c3..f23bb83b 100644 --- a/internal/llm/resolver.go +++ b/internal/llm/resolver.go @@ -18,6 +18,7 @@ type ResolvedEndpoint struct { AuthHeader string // Anthropic auth header: "x-api-key" or "authorization" Source string // human-readable config source label ExtraBody map[string]any // vendor-specific request body fields + MaxRetries int // optional per-endpoint SDK retry budget (0 = default); set low for router members } // Environment variable names for OCR-specific configuration. @@ -135,36 +136,129 @@ type providerEntryConfig struct { ExtraBody map[string]any `json:"extra_body,omitempty"` } +// modelRef is one entry of the ordered routing pool: which configured provider to +// use and (optionally) which of its models. An empty Model falls back to the +// provider's default model. +type modelRef struct { + Provider string `json:"provider"` + Model string `json:"model,omitempty"` +} + +// routingConfig is the multi-model namespace: an ordered pool plus a selection +// policy. Grouping these under `routing` (vs a bare top-level list) gives the policy +// and future per-pool knobs a stable home, and avoids colliding with +// providers..models (which is a provider's model catalog, not a routing pool). +type routingConfig struct { + Models []modelRef `json:"models,omitempty"` // ordered pool; index 0 is primary + Policy string `json:"policy,omitempty"` // selection policy; only "priority" supported today +} + type configFile struct { Provider string `json:"provider,omitempty"` Model string `json:"model,omitempty"` + Routing routingConfig `json:"routing,omitempty"` Providers map[string]providerEntryConfig `json:"providers,omitempty"` CustomProviders map[string]providerEntryConfig `json:"custom_providers,omitempty"` Llm llmFileConfig `json:"llm,omitempty"` } -// tryOCRConfig reads the OCR config file. -func tryOCRConfig(path, modelOverride string) (ResolvedEndpoint, bool, error) { +// loadConfigFile reads + parses the OCR config file. ok=false (nil err) when absent. +func loadConfigFile(path string) (configFile, bool, error) { data, err := os.ReadFile(path) if err != nil { if os.IsNotExist(err) { - return ResolvedEndpoint{}, false, nil + return configFile{}, false, nil } - return ResolvedEndpoint{}, false, err + return configFile{}, false, err } - var cfg configFile if err := json.Unmarshal(data, &cfg); err != nil { - return ResolvedEndpoint{}, false, fmt.Errorf("parse config: %w", err) + return configFile{}, false, fmt.Errorf("parse config: %w", err) + } + return cfg, true, nil +} + +// tryOCRConfig resolves a single endpoint from the config file (the primary, for +// callers that want one client). `provider` wins when set; otherwise `models[0]` +// (the pool's primary); otherwise the legacy `llm` block. +func tryOCRConfig(path, modelOverride string) (ResolvedEndpoint, bool, error) { + cfg, ok, err := loadConfigFile(path) + if err != nil || !ok { + return ResolvedEndpoint{}, false, err } if cfg.Provider != "" { return tryProviderConfig(cfg, modelOverride) } + if len(cfg.Routing.Models) > 0 { + ep, err := resolveModelRef(cfg, cfg.Routing.Models[0]) + if err != nil { + return ResolvedEndpoint{}, false, err + } + return ep, true, nil + } return tryLegacyLlmConfig(cfg, modelOverride) } +// resolveModelRef resolves one pool entry, reusing tryProviderConfig by pinning it to +// this entry's provider. ref.Model is passed as the model override so it wins over the +// provider's default and is validated against the provider's available models. +func resolveModelRef(cfg configFile, ref modelRef) (ResolvedEndpoint, error) { + if ref.Provider == "" { + return ResolvedEndpoint{}, fmt.Errorf("models[] entry missing required 'provider' field") + } + sub := cfg + sub.Provider = ref.Provider + sub.Routing = routingConfig{} + ep, ok, err := tryProviderConfig(sub, ref.Model) + if err != nil { + return ResolvedEndpoint{}, err + } + if !ok || ep.URL == "" || ep.Token == "" || ep.Model == "" { + return ResolvedEndpoint{}, fmt.Errorf("models[] entry {provider:%q model:%q} did not resolve to a complete endpoint", ref.Provider, ref.Model) + } + ep.Model = stripModelSuffix(ep.Model) + return ep, nil +} + +// ResolveModels resolves the full ordered model pool for the router. +func ResolveModels(configPath string) ([]ResolvedEndpoint, error) { + return ResolveModelsWithModelOverride(configPath, "") +} + +// ResolveModelsWithModelOverride returns the ordered pool of endpoints. An explicit +// modelOverride (--model) bypasses the pool and pins a single endpoint. Without it, a +// config `models` list resolves to the whole chain; otherwise it falls back to the +// single-endpoint resolution (env / single provider / legacy / shell), wrapped as a +// one-element pool — so existing configs behave exactly as before. +func ResolveModelsWithModelOverride(configPath, modelOverride string) ([]ResolvedEndpoint, error) { + if strings.TrimSpace(modelOverride) == "" { + if cfg, ok, err := loadConfigFile(configPath); err != nil { + return nil, err + } else if ok && len(cfg.Routing.Models) > 0 { + if pol := strings.TrimSpace(cfg.Routing.Policy); pol != "" && pol != "priority" { + return nil, fmt.Errorf("unsupported routing.policy %q (only \"priority\" is supported)", pol) + } + eps := make([]ResolvedEndpoint, 0, len(cfg.Routing.Models)) + for _, ref := range cfg.Routing.Models { + ep, err := resolveModelRef(cfg, ref) + if err != nil { + return nil, err + } + eps = append(eps, ep) + } + return eps, nil + } + } + + ep, err := ResolveEndpointWithModelOverride(configPath, modelOverride) + if err != nil { + return nil, err + } + return []ResolvedEndpoint{ep}, nil +} + // tryProviderConfig resolves an endpoint from the provider-based configuration. func tryProviderConfig(cfg configFile, modelOverride string) (ResolvedEndpoint, bool, error) { preset, isPreset := LookupProvider(cfg.Provider) diff --git a/internal/llm/router_test.go b/internal/llm/router_test.go new file mode 100644 index 00000000..339612d0 --- /dev/null +++ b/internal/llm/router_test.go @@ -0,0 +1,176 @@ +package llm + +import ( + "context" + "encoding/json" + "errors" + "os" + "path/filepath" + "testing" + "time" + + anthropic "github.com/anthropics/anthropic-sdk-go" + openai "github.com/openai/openai-go/v3" +) + +// fakeClient is a programmable LLMClient for router tests: it records call count and +// returns a fixed resp/err. +type fakeClient struct { + calls int + resp *ChatResponse + err error +} + +func (f *fakeClient) CompletionsWithCtx(context.Context, ChatRequest) (*ChatResponse, error) { + f.calls++ + return f.resp, f.err +} + +func newRouter(members ...routerMember) *LLMRouter { + return &LLMRouter{members: members, cooldown: make(map[int]time.Time)} +} + +func TestLLMRouter_FalloverThenSuccess(t *testing.T) { + c0 := &fakeClient{err: errors.New("network blip")} // unknown error → fallover + c1 := &fakeClient{resp: &ChatResponse{ID: "ok"}} + r := newRouter(routerMember{c0, "a"}, routerMember{c1, "b"}) + + resp, err := r.CompletionsWithCtx(context.Background(), ChatRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if resp.ID != "ok" { + t.Fatalf("expected fallback response, got %+v", resp) + } + if c0.calls != 1 || c1.calls != 1 { + t.Fatalf("calls c0=%d c1=%d, want 1/1", c0.calls, c1.calls) + } + + // member 0 is now parked: a second call should skip it and hit c1 directly. + if _, err := r.CompletionsWithCtx(context.Background(), ChatRequest{}); err != nil { + t.Fatalf("second call error: %v", err) + } + if c0.calls != 1 { + t.Fatalf("parked member retried: c0.calls=%d, want 1", c0.calls) + } + if c1.calls != 2 { + t.Fatalf("c1.calls=%d, want 2", c1.calls) + } +} + +func TestLLMRouter_ClientErrorShortCircuits(t *testing.T) { + c0 := &fakeClient{err: &openai.Error{StatusCode: 400}} // bad request → no fallover + c1 := &fakeClient{resp: &ChatResponse{ID: "ok"}} + r := newRouter(routerMember{c0, "a"}, routerMember{c1, "b"}) + + if _, err := r.CompletionsWithCtx(context.Background(), ChatRequest{}); err == nil { + t.Fatal("expected error to propagate, got nil") + } + if c1.calls != 0 { + t.Fatalf("next model tried on client-side error: c1.calls=%d, want 0", c1.calls) + } +} + +func TestLLMRouter_AllExhausted(t *testing.T) { + c0 := &fakeClient{err: errors.New("down")} + c1 := &fakeClient{err: errors.New("down")} + r := newRouter(routerMember{c0, "a"}, routerMember{c1, "b"}) + + _, err := r.CompletionsWithCtx(context.Background(), ChatRequest{}) + if err == nil { + t.Fatal("expected exhausted error, got nil") + } + if c0.calls != 1 || c1.calls != 1 { + t.Fatalf("calls c0=%d c1=%d, want both 1", c0.calls, c1.calls) + } +} + +func TestShouldFallover(t *testing.T) { + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"unknown", errors.New("boom"), true}, + {"canceled", context.Canceled, false}, + {"openai 400", &openai.Error{StatusCode: 400}, false}, + {"openai 429", &openai.Error{StatusCode: 429}, true}, + {"anthropic 503", &anthropic.Error{StatusCode: 503}, true}, + {"anthropic 401", &anthropic.Error{StatusCode: 401}, true}, + } + for _, c := range cases { + if got := shouldFallover(c.err); got != c.want { + t.Errorf("%s: shouldFallover=%v, want %v", c.name, got, c.want) + } + } +} + +func TestNewLLMRouter_SinglePoolNoRouter(t *testing.T) { + ep := ResolvedEndpoint{URL: "https://x.example.com", Protocol: "openai", Model: "m", Token: "t"} + if _, isRouter := NewLLMRouter([]ResolvedEndpoint{ep}).(*LLMRouter); isRouter { + t.Fatal("single-model pool should not be wrapped in a router") + } + two := NewLLMRouter([]ResolvedEndpoint{ep, ep}) + if _, isRouter := two.(*LLMRouter); !isRouter { + t.Fatal("multi-model pool should be a router") + } +} + +func TestResolveModels_Chain(t *testing.T) { + for _, k := range []string{"OCR_LLM_URL", "OCR_LLM_TOKEN", "OCR_LLM_MODEL", "ANTHROPIC_BASE_URL", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_MODEL"} { + t.Setenv(k, "") + } + cfg := configFile{ + Routing: routingConfig{ + Models: []modelRef{{Provider: "p1"}, {Provider: "p2", Model: "m2b"}}, + Policy: "priority", + }, + CustomProviders: map[string]providerEntryConfig{ + "p1": {URL: "https://a.example.com", Protocol: "openai", APIKey: "k1", Model: "m1"}, + "p2": {URL: "https://b.example.com", Protocol: "openai", APIKey: "k2", Model: "m2"}, + }, + } + data, _ := json.Marshal(cfg) + cfgPath := filepath.Join(t.TempDir(), "config.json") + if err := os.WriteFile(cfgPath, data, 0o644); err != nil { + t.Fatal(err) + } + + eps, err := ResolveModels(cfgPath) + if err != nil { + t.Fatalf("ResolveModels: %v", err) + } + if len(eps) != 2 { + t.Fatalf("pool size=%d, want 2", len(eps)) + } + if eps[0].Model != "m1" { + t.Errorf("eps[0].Model=%q, want m1", eps[0].Model) + } + if eps[1].Model != "m2b" { // explicit ref.Model wins over the provider's default m2 + t.Errorf("eps[1].Model=%q, want m2b (ref.Model overrides provider default)", eps[1].Model) + } +} + +func TestResolveModels_RejectsUnknownPolicy(t *testing.T) { + for _, k := range []string{"OCR_LLM_URL", "OCR_LLM_TOKEN", "OCR_LLM_MODEL", "ANTHROPIC_BASE_URL", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_MODEL"} { + t.Setenv(k, "") + } + cfg := configFile{ + Routing: routingConfig{ + Models: []modelRef{{Provider: "p1"}}, + Policy: "weighted", // not supported yet → reserved, must error rather than silently ignore + }, + CustomProviders: map[string]providerEntryConfig{ + "p1": {URL: "https://a.example.com", Protocol: "openai", APIKey: "k1", Model: "m1"}, + }, + } + data, _ := json.Marshal(cfg) + cfgPath := filepath.Join(t.TempDir(), "config.json") + if err := os.WriteFile(cfgPath, data, 0o644); err != nil { + t.Fatal(err) + } + if _, err := ResolveModels(cfgPath); err == nil { + t.Fatal("expected error for unsupported routing.policy, got nil") + } +} From ac4a9142cc4e89c714d0741439c94e4ebf768615 Mon Sep 17 00:00:00 2001 From: "liqiankun.1111" Date: Thu, 25 Jun 2026 14:03:24 +0800 Subject: [PATCH 2/4] fix(llm): address self-review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - resolveModelRef: clear sub.Model so a top-level `model` cannot leak into a routing entry that omits its own model (model now comes only from ref.Model or the provider default). - LLMRouter: when a call fails, stop and return ctx.Err() if the shared context is canceled or past its deadline — every member uses that ctx, so none can succeed; avoids wasted fallover attempts and misleading logs. A per-request timeout (ctx still live) still falls over. - order(): delete expired cooldown entries so the map stays bounded. - ResolvedEndpoint.MaxRetries: clarify it is internal/router-set, not read from config. Adds a router test for the context-done short-circuit. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/llm/client.go | 18 ++++++++++++++---- internal/llm/resolver.go | 3 ++- internal/llm/router_test.go | 16 ++++++++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/internal/llm/client.go b/internal/llm/client.go index eb887ecd..f964162c 100644 --- a/internal/llm/client.go +++ b/internal/llm/client.go @@ -270,6 +270,13 @@ func (r *LLMRouter) CompletionsWithCtx(ctx context.Context, req ChatRequest) (*C return resp, nil } lastErr = err + if ctx.Err() != nil { + // The shared ctx is canceled or past its deadline: the overall budget is + // exhausted and no other member can succeed (they all use this ctx). Stop + // here rather than burning fallover attempts. A per-request timeout (ctx + // still live) is NOT caught here and still falls over below. + return nil, ctx.Err() + } if !shouldFallover(err) { return nil, err } @@ -288,11 +295,14 @@ func (r *LLMRouter) order() []int { live := make([]int, 0, len(r.members)) parked := make([]int, 0) for i := range r.members { - if t, ok := r.cooldown[i]; ok && now.Before(t) { - parked = append(parked, i) - } else { - live = append(live, i) + if t, ok := r.cooldown[i]; ok { + if now.Before(t) { + parked = append(parked, i) + continue + } + delete(r.cooldown, i) // expired: drop the entry so the map stays bounded } + live = append(live, i) } return append(live, parked...) } diff --git a/internal/llm/resolver.go b/internal/llm/resolver.go index f23bb83b..17e636de 100644 --- a/internal/llm/resolver.go +++ b/internal/llm/resolver.go @@ -18,7 +18,7 @@ type ResolvedEndpoint struct { AuthHeader string // Anthropic auth header: "x-api-key" or "authorization" Source string // human-readable config source label ExtraBody map[string]any // vendor-specific request body fields - MaxRetries int // optional per-endpoint SDK retry budget (0 = default); set low for router members + MaxRetries int // internal SDK retry budget (0 = SDK default); not read from config — set by NewLLMRouter, low for pool members so a throttled one fails fast to the next } // Environment variable names for OCR-specific configuration. @@ -210,6 +210,7 @@ func resolveModelRef(cfg configFile, ref modelRef) (ResolvedEndpoint, error) { } sub := cfg sub.Provider = ref.Provider + sub.Model = "" // don't let a top-level `model` leak into routing entries; model comes from ref.Model or the provider default sub.Routing = routingConfig{} ep, ok, err := tryProviderConfig(sub, ref.Model) if err != nil { diff --git a/internal/llm/router_test.go b/internal/llm/router_test.go index 339612d0..c917fb7d 100644 --- a/internal/llm/router_test.go +++ b/internal/llm/router_test.go @@ -85,6 +85,22 @@ func TestLLMRouter_AllExhausted(t *testing.T) { } } +func TestLLMRouter_StopsWhenContextDone(t *testing.T) { + c0 := &fakeClient{err: errors.New("boom")} + c1 := &fakeClient{resp: &ChatResponse{ID: "ok"}} + r := newRouter(routerMember{c0, "a"}, routerMember{c1, "b"}) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // shared budget exhausted — no member can succeed + + if _, err := r.CompletionsWithCtx(ctx, ChatRequest{}); err == nil { + t.Fatal("expected error when ctx is done, got nil") + } + if c1.calls != 0 { + t.Fatalf("fell over despite done ctx: c1.calls=%d, want 0", c1.calls) + } +} + func TestShouldFallover(t *testing.T) { cases := []struct { name string From 5bbe6e9e67dcd71bc620ee453a0f59d14b7e8936 Mon Sep 17 00:00:00 2001 From: qiankunli Date: Thu, 25 Jun 2026 14:24:02 +0800 Subject: [PATCH 3/4] Update internal/llm/client.go Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- internal/llm/client.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/internal/llm/client.go b/internal/llm/client.go index f964162c..8b033535 100644 --- a/internal/llm/client.go +++ b/internal/llm/client.go @@ -294,15 +294,20 @@ func (r *LLMRouter) order() []int { now := time.Now() live := make([]int, 0, len(r.members)) parked := make([]int, 0) + for i := range r.members { + if t, ok := r.cooldown[i]; ok { + if now.Before(t) { for i := range r.members { if t, ok := r.cooldown[i]; ok { if now.Before(t) { parked = append(parked, i) - continue + } else { + delete(r.cooldown, i) + live = append(live, i) } - delete(r.cooldown, i) // expired: drop the entry so the map stays bounded + } else { + live = append(live, i) } - live = append(live, i) } return append(live, parked...) } From c8987293b3cf63654f26ae8f643d76a0256da072 Mon Sep 17 00:00:00 2001 From: "liqiankun.1111" Date: Thu, 25 Jun 2026 16:02:33 +0800 Subject: [PATCH 4/4] fix(llm): drop duplicated loop header in order() A web edit (5bbe6e9) accidentally pasted the for/if/if header twice in LLMRouter.order(), leaving unbalanced braces that broke the build. Remove the duplicate; the intended if/else cooldown handling is preserved. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/llm/client.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/internal/llm/client.go b/internal/llm/client.go index 8b033535..b597ad59 100644 --- a/internal/llm/client.go +++ b/internal/llm/client.go @@ -294,9 +294,6 @@ func (r *LLMRouter) order() []int { now := time.Now() live := make([]int, 0, len(r.members)) parked := make([]int, 0) - for i := range r.members { - if t, ok := r.cooldown[i]; ok { - if now.Before(t) { for i := range r.members { if t, ok := r.cooldown[i]; ok { if now.Before(t) {