Skip to content

Commit 8d7eedd

Browse files
committed
unhealthy session management
1 parent 5f71d9d commit 8d7eedd

2 files changed

Lines changed: 68 additions & 8 deletions

File tree

internal/config/config.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ type Config struct {
2323
CommandParallelism int `env:"COMMAND_PARALLELISM" envDefault:"16"`
2424
// WSSilenceTimeout triggers a session restart if no gateway messages are received.
2525
WSSilenceTimeout time.Duration `env:"WS_SILENCE_TIMEOUT" envDefault:"2m"`
26+
27+
// DiscordUnhealthyMode controls what happens when watchdogs/API probe decide the session is unhealthy.
28+
// Supported: restart|invalidate-only|ignore. An empty value is treated as restart; any other value logs a warning and falls back to restart.
29+
DiscordUnhealthyMode string `env:"DISCORD_UNHEALTHY_MODE" envDefault:"restart"`
30+
// DiscordUnhealthyGrace allows ignoring the first N unhealthy signals within DiscordUnhealthyWindow
31+
// (still invalidating sinks), before triggering a session restart. Applies to mode=restart only.
32+
DiscordUnhealthyGrace int `env:"DISCORD_UNHEALTHY_GRACE" envDefault:"0"`
33+
// DiscordUnhealthyWindow is the counting window for DiscordUnhealthyGrace; a non-positive value is replaced by one minute.
34+
DiscordUnhealthyWindow time.Duration `env:"DISCORD_UNHEALTHY_WINDOW" envDefault:"1m"`
2635
}
2736

2837
// IsDeveloper reports whether userID is the configured developer (avoids discord import in middleware).

internal/discord/bot_session.go

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,65 @@ func (b *Bot) RunSession(ctx context.Context) error {
7070
// and similar events; treating every Disconnect as fatal caused dg.Close() to race with that
7171
// reconnect and wiped in-memory voice/queue state.
7272
disconnected := make(chan struct{})
73-
var disconnectOnce sync.Once
74-
notifyDisconnect := func() {
75-
disconnectOnce.Do(func() {
73+
var restartOnce sync.Once
74+
var unhealthyMu sync.Mutex
75+
var unhealthyCount int
76+
var unhealthyWindowStart time.Time
77+
78+
invalidateSinks := func() {
79+
if b.voice != nil {
80+
b.voice.InvalidateAllSinks()
81+
}
82+
}
83+
84+
notifyUnhealthy := func() {
85+
mode := b.cfg.DiscordUnhealthyMode
86+
switch mode {
87+
case "ignore":
88+
return
89+
case "invalidate-only":
90+
invalidateSinks()
91+
return
92+
case "restart", "":
93+
// Nothing to do here: leave the switch and continue with the restart logic below.
94+
default:
95+
log.Printf("[WARN] Unknown DISCORD_UNHEALTHY_MODE=%q, falling back to restart", mode)
96+
}
97+
98+
// mode=restart: optionally ignore first N signals within a window (still invalidating sinks).
99+
grace := b.cfg.DiscordUnhealthyGrace
100+
if grace < 0 {
101+
grace = 0
102+
}
103+
window := b.cfg.DiscordUnhealthyWindow
104+
if window <= 0 {
105+
window = time.Minute
106+
}
107+
108+
shouldRestart := true
109+
if grace > 0 {
110+
now := time.Now()
111+
unhealthyMu.Lock()
112+
if unhealthyWindowStart.IsZero() || now.Sub(unhealthyWindowStart) > window {
113+
unhealthyWindowStart = now
114+
unhealthyCount = 0
115+
}
116+
unhealthyCount++
117+
if unhealthyCount <= grace {
118+
shouldRestart = false
119+
}
120+
unhealthyMu.Unlock()
121+
}
122+
123+
if !shouldRestart {
124+
invalidateSinks()
125+
return
126+
}
127+
128+
restartOnce.Do(func() {
76129
log.Println("[WARN] Discord session unhealthy — will restart session")
77130
// Soft-restart path: keep players/queues, but invalidate transport so they recover fast.
78-
if b.voice != nil {
79-
b.voice.InvalidateAllSinks()
80-
}
131+
invalidateSinks()
81132
close(disconnected)
82133
})
83134
}
@@ -149,7 +200,7 @@ func (b *Bot) RunSession(ctx context.Context) error {
149200
dg.HeartbeatLatency,
150201
func(meta watchdog.WSSilenceMeta) {
151202
log.Printf("[WARN] Gateway silent for %v (timeout=%v, heartbeat=%v) — reconnecting", meta.SinceLastWS, meta.Timeout, meta.HeartbeatLatency)
152-
notifyDisconnect()
203+
notifyUnhealthy()
153204
},
154205
watchdog.WSSilenceOptions{SettleDelay: 15 * time.Second, Tick: 10 * time.Second},
155206
).Run(sessionCtx)
@@ -186,7 +237,7 @@ func (b *Bot) RunSession(ctx context.Context) error {
186237
log.Printf("[WARN] API probe failed (%d/3): %v", fails, err)
187238
if fails >= 3 {
188239
log.Println("[WARN] 3 consecutive API probe failures — reconnecting")
189-
notifyDisconnect()
240+
notifyUnhealthy()
190241
return
191242
}
192243
} else {

0 commit comments

Comments
 (0)