alibaba · qiankunli · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/README.md b/README.md
@@ -559,6 +559,8 @@ Config file: `~/.opencodereview/config.json`
 | `providers.<name>.models` | array | Optional provider model list for interactive selection |
 | `providers.<name>.auth_header` | string | `x-api-key` \| `authorization` |
 | `custom_providers.<name>.*` | — | Same fields as `providers.<name>.*`, including optional `models` |
+| `routing.models` | array | Ordered model pool for failover: `[{provider, model}]` (see [Multi-model fallback](#multi-model-fallback)) |
+| `routing.policy` | string | Selection policy; `priority` (default, only value today) |
 | `llm.url` | string | `https://api.openai.com/v1/chat/completions` |
 | `llm.auth_token` | string | `sk-xxxxxxx` |
 | `llm.auth_header` | string | Anthropic only: `x-api-key` \| `authorization` |
@@ -582,6 +584,32 @@ Environment variables take precedence over the config file.
 | `OCR_LLM_MODEL` | Model name |
 | `OCR_USE_ANTHROPIC` | `true` = Anthropic, `false` = OpenAI |
 
+### Multi-model fallback
+
+By default, a review uses a single model (`provider` + `model`). To survive rate limits and provider outages, configure an ordered `routing.models` pool — the reviewer tries each entry in order and fails over to the next when one is rate-limited, returns a server error, or times out:
+
+```json
+{
+  "providers": {
+    "anthropic": { "api_key": "sk-ant-...", "model": "claude-opus-4-6" },
+    "deepseek":  { "api_key": "sk-...",     "model": "deepseek-v3" }
+  },
+  "routing": {
+    "models": [
+      { "provider": "anthropic", "model": "claude-opus-4-6" },
+      { "provider": "deepseek",  "model": "deepseek-v3" }
+    ],
+    "policy": "priority"
+  }
+}
+```
+
+- Each entry references a configured provider (for credentials / endpoint) and a model; an omitted `model` uses the provider's default.
+- `routing.policy` selects how the pool is ordered. Only `priority` is supported today (first entry is primary); the field is reserved for future policies (e.g. weighted), and an unknown value is rejected rather than silently ignored.
+- A model that fails with an availability error is parked for 30 seconds: concurrent per-file reviews try the other models first instead of each re-hitting it, and fall back to a parked model only as a last resort.
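+- Failover triggers on availability errors (rate limit, 5xx, network/timeout) and on other provider-specific rejections (e.g. 401/403/404), since a different provider or key may not share them. Request errors that are deterministic across models (HTTP 400 bad request, 413 payload too large, 422 unprocessable) do **not** trigger failover, since another model would fail identically.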
+- Without `routing.models`, behavior is unchanged. `--model` pins a single model and bypasses the pool.
+
 
 ## Telemetry
 

diff --git a/README.zh-CN.md b/README.zh-CN.md
@@ -544,6 +544,8 @@ OCR 通过四层优先级链解析评审规则。每层采用首次匹配原则
 | `providers.<name>.models` | array | 用于交互式选择的可选供应商模型列表 |
 | `providers.<name>.auth_header` | string | `x-api-key` \| `authorization` |
 | `custom_providers.<name>.*` | — | 与 `providers.<name>.*` 相同的字段，包括可选的 `models` |
+| `routing.models` | array | 用于故障转移的有序模型池：`[{provider, model}]`（见[多模型故障转移](#多模型故障转移)） |
+| `routing.policy` | string | 选择策略；`priority`（默认，目前唯一取值） |
 | `llm.url` | string | `https://api.openai.com/v1/chat/completions` |
 | `llm.auth_token` | string | `sk-xxxxxxx` |
 | `llm.auth_header` | string | 仅 Anthropic：`x-api-key` \| `authorization` |
@@ -567,6 +569,32 @@ OCR 通过四层优先级链解析评审规则。每层采用首次匹配原则
 | `OCR_LLM_MODEL` | 模型名称 |
 | `OCR_USE_ANTHROPIC` | `true` = Anthropic，`false` = OpenAI |
 
+### 多模型故障转移
+
+默认评审使用单一模型（`provider` + `model`）。为应对限流与供应商故障，可配置有序的 `routing.models` 池——评审按顺序尝试，当某个模型被限流、返回服务端错误或超时时，自动转移到下一个：
+
+```json
+{
+  "providers": {
+    "anthropic": { "api_key": "sk-ant-...", "model": "claude-opus-4-6" },
+    "deepseek":  { "api_key": "sk-...",     "model": "deepseek-v3" }
+  },
+  "routing": {
+    "models": [
+      { "provider": "anthropic", "model": "claude-opus-4-6" },
+      { "provider": "deepseek",  "model": "deepseek-v3" }
+    ],
+    "policy": "priority"
+  }
+}
+```
+
+- 每个条目引用一个已配置的供应商（提供凭据 / 端点）及一个模型；省略 `model` 时使用该供应商的默认模型。
+- `routing.policy` 决定池的排序方式。目前仅支持 `priority`（第一个为主模型）；该字段为未来策略（如 weighted）预留，填入未知值会报错而非被静默忽略。
+- 因可用性错误而失败的模型会被搁置 30 秒：并发的逐文件评审会优先尝试其他模型，而非各自重复命中它；只有在没有其他可用模型时，才会把被搁置的模型作为最后手段再次尝试。
+- 在可用性错误（限流、5xx、网络 / 超时）以及其他与供应商相关的拒绝（如 401/403/404）时转移，因为换一个供应商或密钥可能不会遇到同样的问题。对所有模型结果都相同的请求错误（HTTP 400 请求错误、413 负载过大、422 无法处理）**不**触发转移，因为换个模型同样会失败。
+- 不配置 `routing.models` 时行为不变。`--model` 固定单一模型并绕过该池。
+
 
 ## 遥测
 

diff --git a/cmd/opencodereview/shared.go b/cmd/opencodereview/shared.go
@@ -145,14 +145,14 @@ func loadLLMRuntime(tpl *template.Template, toolConfigPath, modelOverride string
 	}
 	tpl.ApplyLanguage(lang)
 
-	ep, err := llm.ResolveEndpointWithModelOverride(cfgPath, modelOverride)
+	eps, err := llm.ResolveModelsWithModelOverride(cfgPath, modelOverride)
 	if err != nil {
 		return nil, fmt.Errorf("resolve LLM endpoint: %w", err)
 	}
 
 	return &llmRuntime{
-		Client:       llm.NewLLMClient(ep),
-		Model:        ep.Model,
+		Client:       llm.NewLLMRouter(eps),
+		Model:        eps[0].Model,
 		PlanToolDefs: planToolDefs,
 		MainToolDefs: mainToolDefs,
 		Collector:    tool.NewCommentCollector(),

diff --git a/internal/llm/client.go b/internal/llm/client.go
@@ -5,7 +5,9 @@ package llm
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
+	"log"
 	"strings"
 	"sync"
 	"time"
@@ -185,6 +187,8 @@ type ClientConfig struct {
 	AuthHeader string         // Auth header name: "x-api-key", "authorization", or empty for protocol default
 	Timeout    time.Duration  // Request timeout
 	ExtraBody  map[string]any // Vendor-specific fields merged into every request body
+	MaxRetries int            // SDK in-provider retry budget; 0 → default. Lowered for router members so a
+	// rate-limited model fails fast to the next instead of burning the full backoff.
 }
 
 // --- Factory ---
@@ -198,13 +202,149 @@ func NewLLMClient(ep ResolvedEndpoint) LLMClient {
 		Model:      ep.Model,
 		AuthHeader: ep.AuthHeader,
 		ExtraBody:  ep.ExtraBody,
+		MaxRetries: ep.MaxRetries,
 	}
 	if ep.Protocol == "anthropic" {
 		return NewAnthropicClient(cfg)
 	}
 	return NewOpenAIClient(cfg)
 }
 
+func maxRetriesOrDefault(n int) int { // resolves a ClientConfig.MaxRetries value to the retry budget handed to the SDK client
+	if n > 0 {
+		return n // a positive budget set by the caller is used unchanged
+	}
+	return 5 // zero or negative counts as unset: use 5, the budget both SDK clients hard-coded before MaxRetries existed
+}
+
+// --- Multi-model router ---
+
+// Tunables for LLMRouter. A router member that returns a failover-worthy error is
+// parked for routerCooldown so concurrent subtasks try the other members first instead
+// of each re-hitting a model that's down/throttled. Members get a low retry budget so a
+// rate-limited model fails fast to the next rather than burning the full SDK backoff.
+const (
+	routerMemberRetries = 2                // MaxRetries that NewLLMRouter applies to members that leave it unset
+	routerCooldown      = 30 * time.Second // how long park keeps a failed member behind the live ones in order()
+)
+
+type routerMember struct { // one entry of the LLMRouter pool: a client plus the label used for it in logs
+	client LLMClient // client that NewLLMRouter builds for this entry via NewLLMClient
+	label  string    // "protocol/model" for logs
+}
+
+// LLMRouter is an LLMClient over an ordered pool of models. On a failover-worthy
+// failure (rate limit / 5xx / network) it advances to the next member; request errors
+// that every model would reject (bad request / payload too large) short-circuit instead.
+// Cooldown state is shared across concurrent CompletionsWithCtx calls (one ocr run's
+// per-file subtasks), so a throttled model is moved to the back of the order for all of them.
+// Selection is strict priority order today; the order() seam is where a weighted /
+// capability policy would plug in.
+type LLMRouter struct {
+	members  []routerMember    // pool in priority order; index 0 is tried first
+	mu       sync.Mutex        // guards cooldown (taken by order and park)
+	cooldown map[int]time.Time // member index → parked-until
+}
+
+// NewLLMRouter builds an LLMClient from an ordered pool (eps[0] is tried first). A pool of
+// one returns a plain client (no router overhead, unchanged single-model behavior).
+func NewLLMRouter(eps []ResolvedEndpoint) LLMClient {
+	if len(eps) == 1 {
+		return NewLLMClient(eps[0])
+	}
+	members := make([]routerMember, len(eps))
+	for i, ep := range eps { // ep is a copy, so the caller's slice is not modified below
+		if ep.MaxRetries <= 0 { // same "unset" test as maxRetriesOrDefault: a negative value must not restore the full backoff
+			ep.MaxRetries = routerMemberRetries
+		}
+		members[i] = routerMember{client: NewLLMClient(ep), label: ep.Protocol + "/" + ep.Model}
+	}
+	return &LLMRouter{members: members, cooldown: make(map[int]time.Time)}
+}
+
+func (r *LLMRouter) CompletionsWithCtx(ctx context.Context, req ChatRequest) (*ChatResponse, error) { // sends req to members in r.order() until one succeeds
+	var lastErr error
+	for _, i := range r.order() {
+		resp, err := r.members[i].client.CompletionsWithCtx(ctx, req)
+		if err == nil {
+			return resp, nil // first member that answers wins
+		}
+		lastErr = err
+		if ctx.Err() != nil {
+			// The shared ctx is canceled or past its deadline: no other member can succeed
+			// (they all use this ctx), so stop rather than burning failover attempts. The
+			// member's own err is dropped in favor of ctx.Err(). A per-request timeout (ctx
+			// still live) is NOT caught here and still fails over below.
+			return nil, ctx.Err()
+		}
+		if !shouldFallover(err) {
+			return nil, err // returned as-is; the member is not parked for this kind of error
+		}
+		r.park(i)
+		log.Printf("[llm-router] %s failed (%v) — trying next model", r.members[i].label, err) // also logged for the last candidate
+	}
+	return nil, fmt.Errorf("all %d models exhausted; last error: %w", len(r.members), lastErr)
+}
+
+// order returns the member indices to try for one call: members not in cooldown first, in
+// pool order, then the parked ones (appended, not dropped) so an all-parked pool is still attempted.
+func (r *LLMRouter) order() []int {
+	r.mu.Lock() // cooldown is read and pruned below
+	defer r.mu.Unlock()
+	now := time.Now()
+	live := make([]int, 0, len(r.members))
+	parked := make([]int, 0)
+	for i := range r.members {
+		if t, ok := r.cooldown[i]; ok {
+			if now.Before(t) { // still inside its cooldown window
+				parked = append(parked, i)
+			} else { // cooldown has elapsed: forget the entry and treat the member as live again
+				delete(r.cooldown, i)
+				live = append(live, i)
+			}
+		} else { // no cooldown entry for this member
+			live = append(live, i)
+		}
+	}
+	return append(live, parked...)
+}
+
+func (r *LLMRouter) park(i int) { // records that member i is in cooldown until routerCooldown from now
+	r.mu.Lock()
+	r.cooldown[i] = time.Now().Add(routerCooldown) // overwrites any earlier deadline for i, restarting the window
+	r.mu.Unlock()
+}
+
+// shouldFallover reports whether err warrants trying the next model. Availability
+// failures (rate limit, server, network) → yes; a canceled context or a request error
+// that falloverStatus marks as deterministic (same payload fails on every model) → no.
+func shouldFallover(err error) bool {
+	if err == nil {
+		return false
+	}
+	if errors.Is(err, context.Canceled) { // only Canceled is excluded; DeadlineExceeded is not special-cased here
+		return false
+	}
+	var aerr *anthropic.Error
+	if errors.As(err, &aerr) { // Anthropic SDK error: decide from its HTTP status
+		return falloverStatus(aerr.StatusCode)
+	}
+	var oerr *openai.Error
+	if errors.As(err, &oerr) { // OpenAI SDK error: decide from its HTTP status
+		return falloverStatus(oerr.StatusCode)
+	}
+	return true // unknown (network blip / timeout / parse) → next model may succeed
+}
+
+func falloverStatus(code int) bool { // reports whether an HTTP status from a provider allows moving on to the next model
+	switch code {
+	case 400, 413, 422:
+		return false // bad request / payload too large / unprocessable: deterministic across models
+	default:
+		return true // 401/403/404/408/409/429/5xx: a different provider/key/capacity may differ
+	}
+}
+
 // --- Token counting with tiktoken ---
 
 // modelTokenizerCache caches initialized tiktoken encoders keyed by encoding name.
@@ -296,7 +436,7 @@ func NewOpenAIClient(cfg ClientConfig) *OpenAIClient {
 		sdk: openai.NewClient(
 			openaiopt.WithAPIKey(cfg.APIKey),
 			openaiopt.WithBaseURL(sdkBaseURL),
-			openaiopt.WithMaxRetries(5),
+			openaiopt.WithMaxRetries(maxRetriesOrDefault(cfg.MaxRetries)),
 			openaiopt.WithHeader("User-Agent", userAgent("")),
 			openaiopt.WithRequestTimeout(cfg.Timeout),
 		),
@@ -492,7 +632,7 @@ func NewAnthropicClient(cfg ClientConfig) *AnthropicClient {
 
 	opts := []option.RequestOption{
 		option.WithBaseURL(sdkBaseURL),
-		option.WithMaxRetries(5),
+		option.WithMaxRetries(maxRetriesOrDefault(cfg.MaxRetries)),
 		option.WithHeader("User-Agent", userAgent("claude")),
 		option.WithRequestTimeout(cfg.Timeout),
 	}