diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bc63536..92f864c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,33 @@ ## Unreleased +### Added + +- **`llm_error_message` / `llmErrorMessage` — opt-in spoken fallback when the + pipeline-mode LLM stream raises before any text is spoken.** Agent-runtime + providers (HermesLLM, OpenClawLLM, OpenAICompatibleLLM) may take 30-90 s to + complete a turn (tools, memory, skills); on gateway-down or timeout the caller + previously heard silence then a silent turn-end. Set + `llm_error_message="Sorry, I'm having trouble right now."` (Python) / + `llmErrorMessage: "Sorry, I'm having trouble right now."` (TypeScript) on the + agent to speak that line through the normal TTS path (subject to barge-in). + Four-part trigger condition: (1) a real LLM error (not a clean barge-in + abort), (2) zero assistant *audio* emitted this turn — gated on whether a PCM + chunk actually reached the carrier (`first_tts_chunk` / `ttsFirstByteSent`), + not on whether tokens were received, so a provider that streams partial tokens + ("Let me check…") and then times out before a sentence boundary (the chunker + buffered them, TTS never ran, the caller heard silence) still triggers the + fallback, while a turn that already spoke a full sentence does not double-speak, + (3) agent still owns the floor, (4) the field is set to a non-empty string. + `undefined` / `None` (default) preserves today's silence-on-error behaviour — + fully backward compatible. Pipeline mode only; Realtime and ConvAI surface + provider errors on their own audio path. 
+ `libraries/python/getpatter/models.py` (`Agent.llm_error_message`), + `libraries/python/getpatter/client.py` (`Patter.agent(llm_error_message=...)`), + `libraries/python/getpatter/stream_handler.py` + (`PipelineStreamHandler._process_streaming_response` error branch), + `libraries/typescript/src/types.ts` (`AgentOptions.llmErrorMessage`), + `libraries/typescript/src/stream-handler.ts` (`runPipelineLlm` catch branch). + ## 0.6.4 (2026-06-05) ### Security @@ -28,6 +56,87 @@ ### Added +- **Patter as a voice shell in front of an agent runtime — three new + pipeline-mode LLM providers (`OpenAICompatibleLLM`, `HermesLLM`, + `OpenClawLLM`).** Drive a phone call where the LLM is an external, + OpenAI-compatible agent runtime reached at `POST {base_url}/chat/completions`: + Patter owns the carrier leg, STT, turn-taking / VAD / barge-in, and TTS, and + each conversation turn is answered by the runtime (which can run its own + tools, memory, and skills before replying). Used like any other pipeline LLM + — `phone.agent(llm=HermesLLM())` (Python) / `phone.agent({ llm: new + HermesLLM() })` (TypeScript). + - **`OpenAICompatibleLLM`** (Python `getpatter/llm/openai_compatible.py`, + TypeScript `src/llm/openai-compatible.ts`) — the generic provider for *any* + OpenAI-compatible chat endpoint: Hermes, OpenClaw, Ollama, vLLM, LM Studio, + or a custom gateway. `base_url` and `model` are required; `timeout` defaults + to **60 s** (configurable) so a runtime that runs tools mid-turn isn't cut + off — the base OpenAI provider's shorter ceiling is unchanged for raw + inference. Keyless local gateways (Ollama / vLLM / LM Studio) are supported: + pass no key and the request goes out without an `Authorization` header. + `extra_headers` / `extraHeaders` merge after the `getpatter/` + User-Agent so it can't be silently clobbered. 
+ - **`HermesLLM`** (Python `getpatter/llm/hermes.py`, TypeScript + `src/llm/hermes.ts`) — thin preset over `OpenAICompatibleLLM` for the Hermes + agent runtime: `base_url` defaults to `http://127.0.0.1:8642/v1`, `model` to + `hermes-agent` (env `API_SERVER_MODEL_NAME` fallback), api key from env + `API_SERVER_KEY`, `timeout` **120 s**. + - **`OpenClawLLM`** (Python `getpatter/llm/openclaw.py`, TypeScript + `src/llm/openclaw.ts`) — thin preset over `OpenAICompatibleLLM` for the + OpenClaw agent runtime: takes an `agent` id (e.g. `receptionist`), validated + and mapped to `model="openclaw/"` using the **same** charset rule and + namespaced pass-through as the shipped `consult` OpenClaw preset; `base_url` + defaults to `http://127.0.0.1:18789/v1`, api key from env `OPENCLAW_API_KEY`, + `timeout` **120 s**. +- **Per-call session continuity for the agent-runtime providers (opt-in), now + with three decoupled signals.** Each runtime keys session continuity + differently, so `OpenAICompatibleLLM` exposes three independent, optional + signals — emit any subset: + - **`session_user_prefix` / `sessionUserPrefix`** → OpenAI `user` field as a + stable `{prefix}{call_id}` (value `patter-call-` for the presets). + OpenClaw's gateway derives its session from `user`; Hermes uses it only for + upstream-log correlation. + - **`session_id_header` + `session_id_prefix` / `sessionIdHeader` + + `sessionIdPrefix`** → a per-call request header carrying + `{session_id_prefix}{call_id}`, for runtimes that key session/transcript + continuity off a header rather than `user`. + - **`session_key_header` + `session_key` / `sessionKeyHeader` + + `sessionKey`** → a *static* request header for long-term memory scoping + (value is the configured key, not the call id). Omitted unless the value is + set. + + Each signal is gated independently and merged into the per-request headers; + when none are configured the request is byte-identical to the base provider + (no `user`, no extra headers). 
Preset wiring: + - **`HermesLLM`** — Hermes is stateless and keys continuity off **headers**: + `session_id_header="X-Hermes-Session-Id"`, `session_id_prefix="patter-call-"` + (so each call sends `X-Hermes-Session-Id: patter-call-<call_id>` by default). + A new optional `session_key` (Python) / `sessionKey` (TypeScript) constructor + arg, default `None` / `undefined`, opts into long-term memory scoping by + emitting `X-Hermes-Session-Key: <session_key>`. `session_user_prefix` is kept at + `patter-call-` for upstream-log correlation but does **not** drive the Hermes + session. + - **`OpenClawLLM`** — wire-identical to before: `session_user_prefix= + "patter-call-"` (OpenClaw's gateway keys off `user`) plus + `session_id_header="x-openclaw-session-key"` with `session_id_prefix=""` so + the header still carries the raw `call_id`. No memory-scope header. + + The presets enable continuity by default; the generic provider leaves it + **off** unless opted in. To make `call_id` reach the provider, `LLMLoop.run` + threads it through to `provider.stream()` — Python via an optional `call_id` + keyword on the `LLMProvider.stream` protocol (the loop now introspects each + provider's `stream` signature and only passes `call_id` to providers that + accept it or take `**kwargs`, so a minimal custom provider that declares + neither is **not** broken), TypeScript via an optional `callId` field on + `LLMStreamOptions` (an extra options-object property a provider ignores is a + no-op). Both are additive and optional, so every existing provider is + unaffected. `libraries/python/getpatter/llm/openai_compatible.py`, + `libraries/python/getpatter/llm/hermes.py`, + `libraries/python/getpatter/llm/openclaw.py`, + `libraries/python/getpatter/services/llm_loop.py`, + `libraries/typescript/src/llm/openai-compatible.ts`, + `libraries/typescript/src/llm/hermes.ts`, + `libraries/typescript/src/llm/openclaw.ts`, + `libraries/typescript/src/llm-loop.ts`. 
- **`allow_insecure_dashboard` / `allowInsecureDashboard` escape hatch (opt-in, default off).** New optional config on `Patter(...)` (Python) and `serve(...)` `ServeOptions` (TypeScript), defaulting to `False` / `false`. When the diff --git a/docs/integrations/hermes.mdx b/docs/integrations/hermes.mdx index 49a8666a..7b03ede1 100644 --- a/docs/integrations/hermes.mdx +++ b/docs/integrations/hermes.mdx @@ -1,21 +1,275 @@ --- title: "Hermes Agent" -description: "Give your Hermes Agent a phone with Patter — connect the Patter MCP server so Hermes can place calls, or let an in-call Patter agent consult Hermes as its brain." +description: "Give your Hermes Agent a phone with Patter — make Hermes the voice the caller talks to, connect the Patter MCP server so Hermes can place calls, or let an in-call Patter agent consult Hermes as its brain." icon: "phone" --- [Hermes Agent](https://github.com/NousResearch/hermes-agent) is a self-hosted AI assistant that grows with you through modular skills and persistent memory. If you -already run Hermes, Patter adds phone calling in two directions: - -- **Direction A — Hermes places calls.** Connect [Patter](https://github.com/PatterAI/Patter) +already run Hermes, Patter adds phone calling in three directions: + +- **Direction A — Patter is the voice shell, Hermes is the brain.** Patter answers the + phone — carrier, speech-to-text, turn-taking, barge-in, text-to-speech — and routes + every conversation turn to your Hermes agent as the LLM, using the new + [`HermesLLM`](#what-if-hermes-could-pick-up-the-phone) pipeline provider. **This is + the headline integration**; the rest of this page covers the other two directions. +- **Direction B — Hermes places calls.** Connect [Patter](https://github.com/PatterAI/Patter) as an MCP server and Hermes gains tools to dial out, read transcripts, end calls, and inspect metrics. 
-- **Direction B — Patter consults Hermes.** During a live call, a Patter agent can - reach back to Hermes for deeper reasoning, fresh information, or a decision the +- **Direction C — Patter consults Hermes on demand.** During a live call, a Patter agent + can reach back to Hermes for deeper reasoning, fresh information, or a decision the in-call prompt can't make on its own. -Pick one direction or wire up both — they are independent. +Pick one direction or wire up several — they are independent. + +## What if Hermes could pick up the phone? + +You already have a Hermes agent with memory, skills, and tools. Direction A puts it on +the end of a phone line: a caller dials your number, and they are *talking to Hermes* — +not to a separate voice bot that occasionally asks Hermes for help. Patter is the **voice +shell**: it owns everything that has to happen in real time (the carrier leg, transcribing +the caller, deciding when a turn ends, handling barge-in, and speaking the reply), and for +the one thing that requires intelligence — *what to say next* — it calls your Hermes agent. + +This is the same shape as wiring a custom LLM into a hosted voice platform, except **you +own the voice layer** and run it next to Hermes. Because Hermes is reached over its +OpenAI-compatible HTTP API (`POST /v1/chat/completions`), the new `HermesLLM` provider +plugs straight into Patter's **pipeline mode** as the LLM stage. 
+ +### Architecture + +``` +Caller (any number) + │ + ▼ +Twilio / Telnyx DID ── Patter verifies the carrier signature at the public boundary + │ + ▼ +PATTER VOICE SHELL (pipeline mode) + · STT — transcribes the caller + · turn-taking / VAD / barge-in — decides when the caller is done + · TTS — speaks Hermes's reply + │ + │ each turn → HermesLLM → POST http://127.0.0.1:8642/v1/chat/completions + ▼ +HERMES AGENT RUNTIME (tools · memory · skills run here, before it replies) + │ + ▼ +reply text → Patter speaks it → back to the caller +``` + +Hermes runtimes execute tools, recall memory, and run skills *before* they answer, so a +single turn can take **30–90 s**. That is why `HermesLLM` defaults to a **120 s** request +timeout (the generic provider's 60 s, raised for the preset) instead of the short ceiling +used for raw inference providers — a turn that runs a tool isn't cut off mid-thought. + + + **Where the session lives.** Hermes is **stateless** and keys continuity off + **HTTP headers**, not the OpenAI `user` field. Each phone call maps to **one** Hermes + session: Patter sends `X-Hermes-Session-Id: patter-call-<call_id>` on every turn, so + multi-turn session and transcript continuity inside a call works without any extra + wiring. This is on by default for `HermesLLM`. + + For **long-term memory scoping** across calls, set `session_key` (Python) / + `sessionKey` (TypeScript) on `HermesLLM` — Patter then also sends + `X-Hermes-Session-Key: <session_key>`, which tells Hermes which memory scope this + caller belongs to. It is **off by default** (no header sent) and is opt-in: + + ```python + llm = HermesLLM(session_key="customer-42") # scopes long-term memory to this caller + ``` + + ```ts + const llm = new HermesLLM({ sessionKey: 'customer-42' }); + ``` + + (Patter also still sends `user=patter-call-<call_id>` for upstream-log correlation, + but that field is **not** what drives the Hermes session — the headers are.) 
+ + +### Prerequisites for the voice shell + +```bash +# Python SDK +pip install getpatter + +# TypeScript SDK +npm install getpatter +``` + +You also need: + +- A running **Hermes agent runtime** with its OpenAI-compatible API server enabled + (next section). +- A **phone number** from Twilio or Telnyx, and the matching carrier credentials. +- An **STT** and a **TTS** provider for the pipeline (e.g. Deepgram STT + ElevenLabs TTS). + Patter handles the audio; Hermes only ever sees text. + +### Setting up the Hermes gateway + +Hermes exposes an OpenAI-compatible HTTP API. Enable it and bind it to loopback so only +processes on the same machine — including Patter — can reach it: + +```bash +export API_SERVER_ENABLED=true +export API_SERVER_HOST=127.0.0.1 # loopback only — never expose to the internet +export API_SERVER_PORT=8642 # the HermesLLM default base_url targets :8642 +export API_SERVER_KEY="choose-a-strong-key" +export API_SERVER_MODEL_NAME="hermes-agent" # the model id HermesLLM sends; optional +``` + +Start the gateway (per your Hermes install), then confirm it answers on loopback before +wiring Patter to it: + +```bash +curl -s http://127.0.0.1:8642/v1/models \ + -H "Authorization: Bearer $API_SERVER_KEY" +``` + +A JSON list of models (including your `hermes-agent`) means the gateway is up. If `curl` +hangs or refuses the connection, fix that before going further — Patter can't reach a +gateway that isn't listening. + + + **`HermesLLM` reads these env vars for you.** With `API_SERVER_KEY` and + (optionally) `API_SERVER_MODEL_NAME` set, you can construct `HermesLLM()` with no + arguments — it defaults `base_url` to `http://127.0.0.1:8642/v1`, the api key from + `API_SERVER_KEY`, and the model from `API_SERVER_MODEL_NAME` (falling back to + `hermes-agent`). + + +### Running Patter locally + +Build a pipeline-mode agent whose LLM is `HermesLLM`. Patter wraps the carrier, STT, and +TTS around it; Hermes is the brain for every turn. 
+ +#### Python example + +```python +from getpatter import Patter, HermesLLM, DeepgramSTT, ElevenLabsTTS, Twilio + +phone = Patter( + carrier=Twilio(), # reads TWILIO_ACCOUNT_SID / TWILIO_AUTH_TOKEN + phone_number="+15550001234", +) + +# Passing stt + tts (and no engine=) selects pipeline mode: STT -> LLM -> TTS. +agent = phone.agent( + system_prompt=( + "You are the front desk. Keep replies short and speakable; " + "you have phone callers, not chat users." + ), + stt=DeepgramSTT(), # reads DEEPGRAM_API_KEY + llm=HermesLLM(), # base_url http://127.0.0.1:8642/v1, 120 s timeout + tts=ElevenLabsTTS(), # reads ELEVENLABS_API_KEY +) + +phone.serve(agent) # listens for inbound calls +``` + +`HermesLLM()` with no arguments is the common case — every default comes from the env. To +target a remote Hermes or pin a model explicitly: + +```python +llm = HermesLLM( + base_url="http://hermes-host.internal:8642/v1", + model="hermes-agent", + api_key="...", # or leave unset to read API_SERVER_KEY + timeout=120.0, # raise further if your turns run long +) +``` + +#### TypeScript example + +```ts +import { Patter, HermesLLM, DeepgramSTT, ElevenLabsTTS, Twilio } from 'getpatter'; + +const phone = new Patter({ + carrier: new Twilio(), // reads TWILIO_ACCOUNT_SID / TWILIO_AUTH_TOKEN + phoneNumber: '+15550001234', +}); + +// Passing stt + tts (and no engine) selects pipeline mode: STT -> LLM -> TTS. +const agent = phone.agent({ + systemPrompt: + 'You are the front desk. 
Keep replies short and speakable; ' + + 'you have phone callers, not chat users.', + stt: new DeepgramSTT(), // reads DEEPGRAM_API_KEY + llm: new HermesLLM(), // baseUrl http://127.0.0.1:8642/v1, 120 s timeout + tts: new ElevenLabsTTS(), // reads ELEVENLABS_API_KEY +}); + +await phone.serve(agent); +``` + +To target a remote Hermes or pin a model explicitly: + +```ts +const llm = new HermesLLM({ + baseUrl: 'http://hermes-host.internal:8642/v1', + model: 'hermes-agent', + apiKey: process.env.API_SERVER_KEY, // or omit to read it automatically + timeout: 120, // seconds; raise if turns run long +}); +``` + +### Connecting Twilio or Telnyx + +The voice shell answers a real phone number. `phone.serve(agent)` exposes a webhook that +the carrier calls when a number rings; point your number's voice webhook at Patter's +public URL. + +- **Local dev:** run Patter behind a tunnel (a stable `cloudflared` hostname is best — an + ephemeral quick-tunnel rotates its URL on every restart) and set that URL as the number's + voice webhook. +- **Twilio:** in the [Twilio console](https://console.twilio.com/us1/develop/phone-numbers/manage/incoming), + set the number's "A call comes in" webhook to your Patter URL. Patter verifies + `X-Twilio-Signature` on every inbound request. +- **Telnyx:** in the [Telnyx portal](https://portal.telnyx.com/#/app/numbers/my-numbers), + point the number's voice connection at your Patter URL. Patter verifies the Ed25519 + signature. + +Only **Patter** is exposed to the carrier. Hermes stays on loopback (see Security notes). + +### Production deployment + +For an always-on line, run Patter and Hermes **on the same machine** and keep the gateway +private: + +- Bind the Hermes gateway to `127.0.0.1:8642` (`API_SERVER_HOST=127.0.0.1`) — Patter + reaches it over loopback, so it never needs to be tunnelled or ngrok-exposed. 
+- Expose **only** Patter's carrier webhook to the internet — via a stable production + webhook URL or a persistent `cloudflared` tunnel with a fixed hostname. +- Set a strong `API_SERVER_KEY` even on loopback; defence in depth. +- Pin the SDK version and your provider/carrier keys via environment variables, never in + source. + +### Security notes + +- **Hermes does not need to be on the public internet.** When Patter and Hermes run on the + same box, Patter reaches Hermes on `127.0.0.1:8642` — only Patter is exposed to the + carrier. This is **safer than the hosted custom-LLM path**, where your brain endpoint has + to be publicly reachable for the platform's cloud to call it. Here the brain stays + loopback-only and the only public surface is the carrier webhook, which verifies the + carrier signature at the boundary. +- **Use a strong `API_SERVER_KEY`.** Even on loopback, set a real key; `HermesLLM` sends it + as a bearer token and Patter never logs it. +- **Keep the prompt voice-safe.** Phone replies are spoken, not read — ask the agent for + short, plain sentences and no markdown, lists, or code blocks. Long replies bloat the + voice context and slow the next turn. +- **PII and recording consent.** Calls may carry personal information; if you record or + store transcripts, follow your jurisdiction's consent rules (two-party-consent states, + GDPR, etc.). Treat transcripts as PII and don't log full caller numbers (Patter logs only + the last four digits by default). + + + **Generic OpenAI-compatible runtimes.** `HermesLLM` is a thin preset over the generic + `OpenAICompatibleLLM` provider, which drives *any* OpenAI-compatible chat endpoint — + Hermes, OpenClaw, Ollama, vLLM, LM Studio, or a custom gateway. To point the voice shell + at one of those instead, use `OpenAICompatibleLLM(base_url=..., model=...)` directly. See + the [OpenClaw page](/integrations/openclaw#openclaw-as-the-primary-llm-voice-shell) for + the OpenClaw preset and the generic-runtime note. 
+ ## Prerequisites @@ -37,7 +291,7 @@ See the [patter-mcp README](https://github.com/PatterAI/patter-mcp#readme) for f --- -## Direction A — Hermes places calls via Patter +## Direction B — Hermes places calls via Patter Hermes is a native [Model Context Protocol](https://modelcontextprotocol.io) client. Register the Patter MCP server under the top-level `mcp_servers` key in @@ -133,11 +387,13 @@ Hermes can summarise without polling `get_transcript` separately: --- -## Direction B — Patter consults Hermes as the brain +## Direction C — Patter consults Hermes on demand -Here the roles flip: Patter runs the call (speech-to-text, the voice agent, -text-to-speech, and the carrier) and, when the in-call agent needs help, it -**consults Hermes** over HTTP. This is the *dispatch + consult* pattern described +Here the roles are split differently: Patter runs the call with a *local* in-call agent and +only reaches Hermes when that agent decides it needs help — it **consults Hermes** over +HTTP. Use this when most turns are simple and only a few need Hermes's full reasoning; +Direction A (above) routes *every* turn to Hermes. This is the *dispatch + consult* pattern +described on the [Python consult page](/python-sdk/consult) and the [TypeScript consult page](/typescript-sdk/consult) — Hermes stays off the per-turn path and is only consulted when the in-call agent decides it needs to. @@ -264,9 +520,13 @@ the adapter forwards that to Hermes, and the answer is spoken back to the caller ## What's next -- **Read the [Concepts](/concepts) page** to understand Patter's three modes - (Realtime / ConvAI / Pipeline) and which one matches your latency / cost - trade-off. +- **Read the [Concepts](/concepts) page** to understand Patter's voice modes + (Realtime end-to-end vs. Pipeline composed STT + LLM + TTS) — the voice shell + (Direction A) is a pipeline-mode agent with `HermesLLM` as the LLM stage. 
+- **OpenClaw** runs the same way — see + [OpenClaw as the primary LLM](/integrations/openclaw#openclaw-as-the-primary-llm-voice-shell) + for the `OpenClawLLM` preset and the generic `OpenAICompatibleLLM` provider + (Ollama / vLLM / LM Studio). - **Consult feature reference**: [Python](/python-sdk/consult) · [TypeScript](/typescript-sdk/consult). - **Get a phone number**: [Twilio console](https://console.twilio.com/us1/develop/phone-numbers/manage/incoming) diff --git a/docs/integrations/openclaw.mdx b/docs/integrations/openclaw.mdx index 4682e14d..be8b24c1 100644 --- a/docs/integrations/openclaw.mdx +++ b/docs/integrations/openclaw.mdx @@ -26,7 +26,7 @@ Ed25519 for Telnyx) at the public boundary. The caller reaches the receptionist the trusted operator-side gateway path instead. If today you forward calls around OpenClaw's allowlist with carrier tricks, that hack goes away. -There are two directions, and they compose: +There are three directions, and they compose: - **Direction A — OpenClaw places calls via Patter.** Connect the `patter-mcp` server so your OpenClaw agent can dial, wait for a call to finish, and read back the @@ -35,9 +35,13 @@ There are two directions, and they compose: feature at a small adapter so an in-call Patter agent can escalate a hard turn to one specific OpenClaw agent over HTTP. This call must stay **fast** — it runs while the caller waits on the line. +- **Direction C — OpenClaw *is* the in-call LLM (voice shell).** Patter answers the + carrier leg and drives speech, and routes **every** turn to one scoped OpenClaw agent as + the LLM, using the new [`OpenClawLLM`](#openclaw-as-the-primary-llm-voice-shell) pipeline + provider. No adapter; the caller is talking to OpenClaw the whole time. -Pick either, or wire both for a full loop: OpenClaw starts the call, and the in-call -agent calls back into OpenClaw when it needs deeper reasoning. 
+Pick one, or wire several for a full loop: OpenClaw starts the call, and an in-call agent +calls back into OpenClaw when it needs deeper reasoning. **Which direction for what.** Direction A (`make_call` / `call_third_party` with @@ -66,6 +70,149 @@ Provider and carrier keys (`OPENAI_API_KEY`, `TWILIO_ACCOUNT_SID`, `TWILIO_AUTH_TOKEN`, `TELNYX_API_KEY`, …) are read from the server's environment. See the [patter-mcp README](https://github.com/PatterAI/patter-mcp#readme) for full setup. +For the voice-shell direction (Direction C) you only need the SDK — no MCP server: + +```bash +pip install getpatter # Python +npm install getpatter # TypeScript +``` + +## OpenClaw as the primary LLM (voice shell) + +In Direction C, OpenClaw is not consulted *on demand* — it **is** the in-call LLM. Patter +owns the carrier leg and speech (STT, turn-taking, barge-in, TTS) and routes every turn to +one scoped OpenClaw agent through the new `OpenClawLLM` pipeline provider. The caller is +talking to OpenClaw the whole call. Use this when most turns need the agent's full +reasoning; use [Direction B](#direction-b-patter-consults-openclaw-mid-call) instead when a +*local* in-call agent handles ordinary turns and only escalates the hard ones. + +`OpenClawLLM` aligns byte-for-byte with the shipped `consult` OpenClaw preset +([`ConsultConfig.openclaw`](#native-target-recommended-no-adapter)): same gateway base url +(`http://127.0.0.1:18789/v1`), same `OPENCLAW_API_KEY` env var, same +`model="openclaw/"` agent-target convention and id charset rule, and the same +`x-openclaw-session-key` session header. The only difference is the timeout default: the +LLM provider defaults to **120 s** (the runtime *is* the per-turn brain) where the consult +preset defaults to a phone-safe 30 s (the runtime is an on-demand escalation). 
+ +### Enable the gateway and pick the agent + +Enable OpenClaw's OpenAI-compatible endpoint (it is **disabled by default**) in +`~/.openclaw/openclaw.json`, and bind the gateway to loopback with a strong auth credential +— exactly as in [Direction B](#direction-b-patter-consults-openclaw-mid-call): + +```json5 +{ + gateway: { + http: { + endpoints: { + chatCompletions: { + enabled: true, + }, + }, + }, + }, +} +``` + +Then point `OpenClawLLM` at **one explicit, least-privileged receptionist agent** — never +the default / master agent. The `agent` id is validated and mapped to +`model="openclaw/"`; an already-namespaced id (`openclaw/x`, `openclaw:x`, +`agent:x`) is passed through unchanged, identical to the consult preset. + +#### Python example + +```python +from getpatter import Patter, OpenClawLLM, DeepgramSTT, ElevenLabsTTS, Twilio + +phone = Patter(carrier=Twilio(), phone_number="+15550001234") + +# stt + tts (and no engine=) selects pipeline mode: STT -> LLM -> TTS. +agent = phone.agent( + system_prompt="You are the after-hours receptionist. Keep replies short and spoken.", + stt=DeepgramSTT(), + llm=OpenClawLLM(agent="receptionist"), # -> model "openclaw/receptionist" + tts=ElevenLabsTTS(), +) + +phone.serve(agent) +``` + +`OpenClawLLM(agent="receptionist")` reads the operator-grade bearer from `OPENCLAW_API_KEY` +(never logged), targets `http://127.0.0.1:18789/v1`, and sends the call id as both the +OpenAI `user` field (`patter-call-`) and the `x-openclaw-session-key` header — one +OpenClaw session per phone call. 
Override any default explicitly: + +```python +llm = OpenClawLLM( + agent="openclaw/receptionist-roofing-ca", # already-namespaced id passes through + base_url="http://127.0.0.1:18789/v1", + api_key="...", # or leave unset to read OPENCLAW_API_KEY + timeout=120.0, +) +``` + +#### TypeScript example + +```ts +import { Patter, OpenClawLLM, DeepgramSTT, ElevenLabsTTS, Twilio } from 'getpatter'; + +const phone = new Patter({ carrier: new Twilio(), phoneNumber: '+15550001234' }); + +// stt + tts (and no engine) selects pipeline mode: STT -> LLM -> TTS. +const agent = phone.agent({ + systemPrompt: 'You are the after-hours receptionist. Keep replies short and spoken.', + stt: new DeepgramSTT(), + llm: new OpenClawLLM({ agent: 'receptionist' }), // -> model "openclaw/receptionist" + tts: new ElevenLabsTTS(), +}); + +await phone.serve(agent); +``` + + + **Target one scoped agent, never the default.** The gateway credential authorises the + *whole* gateway — choosing `agent: "receptionist"` picks the persona, not a security + boundary. Model the receptionist as a **top-level**, sandboxed, least-privileged agent + with its own tool allow/deny, bind the gateway to loopback with a strong credential, and + keep one gateway per client. The full isolation model is in + [Route the call to a SPECIFIC OpenClaw agent](#route-the-call-to-a-specific-openclaw-agent-security-isolation) + below — it applies identically whether OpenClaw is reached as the primary LLM + (Direction C) or via `consult` (Direction B). + + + + **Generic OpenAI-compatible runtimes (Ollama / vLLM / LM Studio).** `OpenClawLLM` is a + thin preset over the generic `OpenAICompatibleLLM` provider, which drives *any* + OpenAI-compatible chat endpoint. 
To put a local model server on the phone instead of + OpenClaw, use it directly: + + ```python + from getpatter import OpenAICompatibleLLM + + # Ollama (keyless local gateway — no api key needed) + llm = OpenAICompatibleLLM( + base_url="http://127.0.0.1:11434/v1", + model="llama3.1", + ) + ``` + + ```ts + import { OpenAICompatibleLLM } from 'getpatter'; + + // vLLM / LM Studio (keyless local gateway) + const llm = new OpenAICompatibleLLM({ + baseUrl: 'http://127.0.0.1:8000/v1', + model: 'my-model', + }); + ``` + + `base_url` and `model` are required; `timeout` defaults to **60 s**. Keyless gateways + send no `Authorization` header. For a remote endpoint that needs a key, pass `api_key` or + set `api_key_env` to the env var holding it. See the [Hermes page](/integrations/hermes) + for the full voice-shell walkthrough (carrier setup, security notes, production + deployment) — it applies to any of these runtimes. + + ## Direction A — OpenClaw places calls via Patter OpenClaw reads MCP servers from `~/.openclaw/openclaw.json` (JSON5) under diff --git a/libraries/python/getpatter/__init__.py b/libraries/python/getpatter/__init__.py index cd8f5ee5..9869b242 100644 --- a/libraries/python/getpatter/__init__.py +++ b/libraries/python/getpatter/__init__.py @@ -122,6 +122,12 @@ from getpatter.llm.groq import LLM as GroqLLM from getpatter.llm.cerebras import LLM as CerebrasLLM from getpatter.llm.google import LLM as GoogleLLM +from getpatter.llm.openai_compatible import ( + LLM as OpenAICompatibleLLM, + OpenAICompatibleLLMProvider, +) +from getpatter.llm.hermes import LLM as HermesLLM +from getpatter.llm.openclaw import LLM as OpenClawLLM # Telephony adapters — surface for tests + advanced integrations that need # direct access to provider-specific APIs (e.g. custom webhook wiring). 
@@ -464,6 +470,10 @@ def mix_pcm(agent: bytes, bg: bytes, ratio: float) -> bytes: "GroqLLM", "CerebrasLLM", "GoogleLLM", + "OpenAICompatibleLLM", + "OpenAICompatibleLLMProvider", + "HermesLLM", + "OpenClawLLM", "TwilioAdapter", "TelnyxAdapter", "PlivoAdapter", diff --git a/libraries/python/getpatter/client.py b/libraries/python/getpatter/client.py index 6d3a7da5..a3ce6865 100644 --- a/libraries/python/getpatter/client.py +++ b/libraries/python/getpatter/client.py @@ -1440,6 +1440,7 @@ def agent( model: str = "gpt-realtime-mini", language: str = "en", first_message: str = "", + llm_error_message: str | None = None, tools: list[Tool] | None = None, stt: STTProvider | None = None, tts: TTSProvider | None = None, @@ -1657,6 +1658,7 @@ def agent( model=model, language=language, first_message=first_message, + llm_error_message=llm_error_message, tools=tuple(tools_out) if tools_out is not None else None, provider=provider, stt=stt_resolved, diff --git a/libraries/python/getpatter/llm/__init__.py b/libraries/python/getpatter/llm/__init__.py index bb18e15f..e6b892f2 100644 --- a/libraries/python/getpatter/llm/__init__.py +++ b/libraries/python/getpatter/llm/__init__.py @@ -20,4 +20,7 @@ "groq", "cerebras", "google", + "openai_compatible", + "hermes", + "openclaw", ] diff --git a/libraries/python/getpatter/llm/hermes.py b/libraries/python/getpatter/llm/hermes.py new file mode 100644 index 00000000..0ebfa695 --- /dev/null +++ b/libraries/python/getpatter/llm/hermes.py @@ -0,0 +1,96 @@ +"""Hermes agent-runtime LLM for Patter pipeline mode. + +Thin preset over :class:`getpatter.llm.openai_compatible.OpenAICompatibleLLMProvider` +that defaults the base URL, model, env-key name, and timeout for the Hermes +agent runtime. Patter is the voice shell (carrier + STT + turn-taking + TTS); +Hermes is the brain on the line — each turn is one +``POST {base_url}/chat/completions`` against the local Hermes gateway. 
+ +Hermes is stateless and keys continuity off HEADERS, not the OpenAI ``user`` +field. Patter sends ``X-Hermes-Session-Id: patter-call-{call_id}`` on every +turn so one phone call maps to one Hermes session / transcript (on by default). +For long-term memory scoping, set ``session_key`` to emit a static +``X-Hermes-Session-Key`` header (off by default). The OpenAI ``user`` field is +still sent (``patter-call-{call_id}``) as a harmless upstream-log correlation +id, but it is not what drives the session. +""" + +from __future__ import annotations + +import os +from typing import ClassVar + +from getpatter.llm.openai_compatible import OpenAICompatibleLLMProvider + +__all__ = ["LLM"] + +# Hermes gateway default (loopback; operator-co-located deployment). +_BASE_URL = "http://127.0.0.1:8642/v1" +_DEFAULT_MODEL = "hermes-agent" + +# Hermes is stateless — continuity is carried in headers. +_SESSION_USER_PREFIX = "patter-call-" +_SESSION_ID_HEADER = "X-Hermes-Session-Id" +_SESSION_ID_PREFIX = "patter-call-" +_SESSION_KEY_HEADER = "X-Hermes-Session-Key" + + +class LLM(OpenAICompatibleLLMProvider): + """Hermes agent-runtime LLM provider. + + Example:: + + from getpatter.llm import hermes + + llm = hermes.LLM() # all env-defaulted + llm = hermes.LLM(model="hermes-7b") # explicit model override + llm = hermes.LLM(api_key="...", base_url="http://host:8642/v1") + + Defaults: + + * ``base_url`` → ``http://127.0.0.1:8642/v1`` + * ``model`` → ``API_SERVER_MODEL_NAME`` env if set, else ``"hermes-agent"`` + * ``api_key`` → ``api_key`` arg or ``API_SERVER_KEY`` env (may be absent for + a keyless local Hermes) + * ``timeout`` → ``120.0`` s (runtimes run tools / memory / skills before + replying, so turns can take 30-90 s) + * per-call continuity → ``X-Hermes-Session-Id: patter-call-{call_id}`` + (always sent with a call id — the primary mechanism) + * long-term memory → ``X-Hermes-Session-Key: {session_key}`` (only sent + when ``session_key`` is configured) + + Args: + session_key: Optional long-term memory scope.
When set, every turn + emits ``X-Hermes-Session-Key: {session_key}`` so Hermes namespaces + persistent memory across calls. Credential-grade — never logged. + ``None`` (default) means the header is not sent. + """ + + provider_key: ClassVar[str] = "hermes" + + def __init__( + self, + api_key: str | None = None, + *, + base_url: str = _BASE_URL, + model: str | None = None, + timeout: float = 120.0, + session_key: str | None = None, + **kwargs, + ) -> None: + resolved_model = model or os.environ.get( + "API_SERVER_MODEL_NAME", _DEFAULT_MODEL + ) + super().__init__( + api_key=api_key, + base_url=base_url, + model=resolved_model, + api_key_env="API_SERVER_KEY", + timeout=timeout, + session_user_prefix=_SESSION_USER_PREFIX, + session_id_header=_SESSION_ID_HEADER, + session_id_prefix=_SESSION_ID_PREFIX, + session_key_header=_SESSION_KEY_HEADER, + session_key=session_key, + **kwargs, + ) diff --git a/libraries/python/getpatter/llm/openai_compatible.py b/libraries/python/getpatter/llm/openai_compatible.py new file mode 100644 index 00000000..b3d02c6c --- /dev/null +++ b/libraries/python/getpatter/llm/openai_compatible.py @@ -0,0 +1,319 @@ +"""Generic OpenAI-compatible LLM provider for Patter's pipeline mode. + +Drives *any* OpenAI-compatible ``/chat/completions`` endpoint — an agent +runtime (Hermes, OpenClaw) or a local inference gateway (Ollama, vLLM, +LM Studio). Patter owns the carrier + STT + turn-taking + TTS; this +provider turns each conversation turn into a single +``POST {base_url}/chat/completions`` request and speaks the response. + +It subclasses :class:`getpatter.services.llm_loop.OpenAILLMProvider` exactly +like :mod:`getpatter.providers.groq_llm` / :mod:`getpatter.providers.cerebras_llm`: +``super().__init__()`` initialises the inherited SSE streaming loop, the +sampling kwargs, and ``_user_agent``; then ``self._client`` is replaced with +an ``AsyncOpenAI`` pointed at ``base_url`` with the long configurable timeout +the parent does not set today.
+ +Two additions over the parent: + +* **Long timeout.** Agent runtimes execute tools / memory / skills before + replying, so a turn can take 30-90 s. The default is 60 s here (the presets + raise it to 120 s); the base provider keeps no timeout, which is correct for + raw inference. +* **Session continuity.** Three independent, opt-in signals let a runtime + scope one session per phone call (and, optionally, a long-term memory + namespace). Each is decoupled from the others: + + - ``session_user_prefix`` → emits the OpenAI ``user`` field as + ``f"{prefix}{call_id}"`` (some gateways derive a session from ``user``). + - ``session_id_header`` + ``session_id_prefix`` → emits a per-call header + ``f"{session_id_prefix}{call_id}"`` for per-call session / transcript + continuity (the mechanism stateless runtimes such as Hermes key off). + - ``session_key_header`` + ``session_key`` → emits a *static* header for + long-term memory scoping. The value is credential-grade and is never + logged. + + All three are OFF by default — when none is configured the emitted request + is byte-identical to the parent (no ``user``, no extra headers). + +Keyless gateways (Ollama / vLLM / LM Studio accept no key) are supported: the +conventional ``"EMPTY"`` sentinel is passed to ``AsyncOpenAI`` (whose +constructor rejects ``None``). +""" + +from __future__ import annotations + +import asyncio +import logging +import os +from typing import Any, AsyncIterator, ClassVar + +from getpatter.services.llm_loop import OpenAILLMProvider + +__all__ = ["OpenAICompatibleLLMProvider", "LLM"] + +logger = logging.getLogger("getpatter.llm.openai_compatible") + +# AsyncOpenAI rejects ``api_key=None`` — keyless gateways (Ollama / vLLM / +# LM Studio) accept any bearer (or none), so we pass this conventional sentinel. +_EMPTY_KEY_SENTINEL = "EMPTY" + + +class OpenAICompatibleLLMProvider(OpenAILLMProvider): + """LLM provider for any OpenAI-compatible ``/chat/completions`` endpoint. 
+ + Streams in the same ``{"type": "text" | "tool_call" | "usage"}`` chunk + format as :class:`OpenAILLMProvider`. All OpenAI-spec sampling kwargs + accepted by the parent (``response_format``, ``parallel_tool_calls``, + ``tool_choice``, ``seed``, ``top_p``, ``frequency_penalty``, + ``presence_penalty``, ``stop``, ``temperature``, ``max_tokens``) are + forwarded transparently. + + Args: + api_key: Bearer token. If omitted and ``api_key_env`` is given, read + from that environment variable. May resolve to ``None`` for + keyless local gateways — the ``"EMPTY"`` sentinel is sent so the + ``AsyncOpenAI`` constructor (which rejects ``None``) is satisfied. + base_url: OpenAI-compatible base URL ending in ``/v1`` — the whole + point of this provider, so it is **required**. Operator-controlled + config, never derived from caller / transcript input. + model: Model / agent target — **required**. + api_key_env: Environment variable to read the bearer from when + ``api_key`` is not given (e.g. ``"OPENCLAW_API_KEY"``). + timeout: Per-request timeout in seconds. Default ``60.0`` (the base + OpenAI provider sets no timeout — raised here because agent + runtimes run tools before replying). + extra_headers: Extra headers merged into ``default_headers`` *after* + the ``User-Agent`` so the SDK attribution is not silently + clobbered (a caller can still override ``User-Agent`` explicitly). + session_user_prefix: When set, emits the OpenAI ``user`` field as + ``f"{prefix}{call_id}"`` for per-call session continuity. ``None`` + (default) means no ``user`` field is sent. + session_id_header: Optional header NAME carrying the per-call session + id, e.g. ``"X-Hermes-Session-Id"`` / ``"x-openclaw-session-key"``. + When set (and a ``call_id`` is available) the header value is + ``f"{session_id_prefix}{call_id}"``. ``None`` (default) means off. + session_id_prefix: Prefix for the ``session_id_header`` VALUE. Defaults + to ``""`` (the raw call id). Independent of ``session_user_prefix``. 
+ session_key_header: Optional header NAME for long-term memory scope, + e.g. ``"X-Hermes-Session-Key"``. The value is static (does not vary + per call). ``None`` (default) means off. + session_key: Static value emitted in ``session_key_header``. It is a + credential-grade memory scope and is NEVER logged. ``None`` + (default) means the header is omitted even if + ``session_key_header`` is set. + **kwargs: Sampling kwargs forwarded to :class:`OpenAILLMProvider`. + """ + + #: Stable pricing/dashboard key — read by stream-handler/metrics. + provider_key: ClassVar[str] = "openai_compatible" + + def __init__( + self, + api_key: str | None = None, + *, + base_url: str, + model: str, + api_key_env: str | None = None, + timeout: float = 60.0, + extra_headers: dict[str, str] | None = None, + session_user_prefix: str | None = None, + session_id_header: str | None = None, + session_id_prefix: str = "", + session_key_header: str | None = None, + session_key: str | None = None, + **kwargs, + ) -> None: + try: + from openai import AsyncOpenAI + except ImportError as e: + raise RuntimeError( + "The 'openai' package is required for " + "OpenAICompatibleLLMProvider. Install it with: pip install openai" + ) from e + + # Resolve the bearer: explicit api_key wins, then api_key_env, else + # None (keyless local gateway). Never logged. + key = api_key or (os.environ.get(api_key_env) if api_key_env else None) + + # Initialise parent state (model, sampling kwargs, _user_agent) without + # using its OpenAI-pointed client. We swap in a base_url-pointed client + # below with the same User-Agent the parent computed plus the long + # configurable timeout the parent does not set. 
+ super().__init__(api_key=key or _EMPTY_KEY_SENTINEL, model=model, **kwargs) + + default_headers = {"User-Agent": self._user_agent, **(extra_headers or {})} + self._client: Any = AsyncOpenAI( + api_key=key or _EMPTY_KEY_SENTINEL, + base_url=base_url, + timeout=timeout, + default_headers=default_headers, + ) + + self._session_user_prefix = session_user_prefix + self._session_id_header = session_id_header + self._session_id_prefix = session_id_prefix + self._session_key_header = session_key_header + # Credential-grade memory scope — never logged. + self._session_key = session_key + + async def warmup(self) -> None: + """Pre-call DNS / TLS warmup that omits ``Authorization`` for keyless gateways. + + Overrides :meth:`OpenAILLMProvider.warmup`, which sends + ``Authorization: Bearer {api_key}`` unconditionally — for keyless + gateways (Ollama / vLLM / LM Studio) that becomes + ``Bearer EMPTY``, which some gateways reject. Here the header is sent + only when a real key is present, matching the TS provider's warmup. + Best-effort: 5 s timeout, all exceptions swallowed at DEBUG. 
+ """ + try: + base_url = str(getattr(self._client, "base_url", "") or "").rstrip("/") + if not base_url: + return + import httpx + + headers: dict[str, str] = {"User-Agent": self._user_agent} + key = getattr(self._client, "api_key", None) + if key and key != _EMPTY_KEY_SENTINEL: + headers["Authorization"] = f"Bearer {key}" + async with httpx.AsyncClient(timeout=5.0) as http: + await http.get(f"{base_url}/models", headers=headers) + except Exception as exc: # noqa: BLE001 - best-effort + logger.debug("OpenAI-compatible LLM warmup failed (best-effort): %s", exc) + + def _record_completion_cost( + self, *, prompt_tokens: int, completion_tokens: int + ) -> None: + """Stamp ``patter.cost.llm_*_tokens`` with the provider key tag.""" + try: + from getpatter.observability.attributes import record_patter_attrs + + record_patter_attrs( + { + "patter.cost.llm_input_tokens": prompt_tokens, + "patter.cost.llm_output_tokens": completion_tokens, + "patter.llm.provider": self.provider_key, + } + ) + except Exception: # pragma: no cover — defense in depth + logger.debug("_record_completion_cost failed", exc_info=True) + + def _build_completion_kwargs( + self, + messages: list[dict], + tools: list[dict] | None, + *, + call_id: str | None = None, + ) -> dict[str, Any]: + """Assemble ``chat.completions.create`` kwargs, adding session continuity. + + Extends the parent builder with up to three INDEPENDENT, opt-in + session signals — the OpenAI ``user`` field, a per-call session-id + header, and a static memory-scope header. Each is gated separately, so + e.g. a runtime can take the per-call header without the ``user`` field. + Per-call signals require a ``call_id``; the memory-scope header does + not. When none applies the result is byte-identical to the parent + (no ``user``, no ``extra_headers``). 
+ """ + kwargs = super()._build_completion_kwargs(messages, tools) + extra: dict[str, str] = {} + if self._session_user_prefix is not None and call_id: + kwargs["user"] = f"{self._session_user_prefix}{call_id}" + if self._session_id_header is not None and call_id: + extra[self._session_id_header] = f"{self._session_id_prefix}{call_id}" + if self._session_key_header is not None and self._session_key: + # Truthy check (not ``is not None``): an empty-string session key is + # not a meaningful memory scope — treat it as unset rather than + # emitting a confusing empty header on the wire. + extra[self._session_key_header] = self._session_key + if extra: + # Merge over any pre-existing extra_headers (the parent never sets + # this today, but the spread keeps it future-safe and clobber-free). + kwargs["extra_headers"] = {**kwargs.get("extra_headers", {}), **extra} + return kwargs + + async def stream( + self, + messages: list[dict], + tools: list[dict] | None = None, + *, + cancel_event: asyncio.Event | None = None, + call_id: str | None = None, + ) -> AsyncIterator[dict]: + """Stream chunks, threading ``call_id`` into the session continuity fields. + + Mirrors :meth:`OpenAILLMProvider.stream` but routes ``call_id`` into + ``_build_completion_kwargs`` so the per-call ``user`` / session header + are emitted. ``call_id`` is optional — unset means the parent-identical + no-session path. 
+ """ + kwargs = self._build_completion_kwargs(messages, tools, call_id=call_id) + response = await self._client.chat.completions.create(**kwargs) + + last_usage = None + async for chunk in response: + if cancel_event is not None and cancel_event.is_set(): + try: + await response.close() + except Exception: # noqa: BLE001 - best-effort cleanup + pass + return + usage = getattr(chunk, "usage", None) + if usage is not None: + last_usage = usage + + delta = chunk.choices[0].delta if chunk.choices else None + if delta is None: + continue + + if delta.content: + yield {"type": "text", "content": delta.content} + + if delta.tool_calls: + for tc in delta.tool_calls: + yield { + "type": "tool_call", + "index": tc.index, + "id": tc.id, + "name": tc.function.name if tc.function else None, + "arguments": tc.function.arguments if tc.function else None, + } + + if last_usage is not None: + cache_read = 0 + details = getattr(last_usage, "prompt_tokens_details", None) + if details is not None: + cache_read = getattr(details, "cached_tokens", 0) or 0 + # Mirror OpenAILLMProvider.stream exactly: prompt_tokens is the + # TOTAL input (uncached + cached); subtract cached so input_tokens + # is the uncached portion and cost isn't double-billed. + prompt_tokens = getattr(last_usage, "prompt_tokens", 0) or 0 + uncached_input = max(0, prompt_tokens - cache_read) + completion_tokens = getattr(last_usage, "completion_tokens", 0) or 0 + self._record_completion_cost( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + yield { + "type": "usage", + "input_tokens": uncached_input, + "output_tokens": completion_tokens, + "cache_read_tokens": cache_read, + } + + +class LLM(OpenAICompatibleLLMProvider): + """Public alias of :class:`OpenAICompatibleLLMProvider` for the + ``from getpatter.llm import openai_compatible`` namespace. 
+ + Example:: + + from getpatter.llm import openai_compatible + + # Ollama / vLLM / LM Studio (keyless local gateway): + llm = openai_compatible.LLM( + base_url="http://127.0.0.1:11434/v1", model="llama3.1", + ) + """ + + provider_key: ClassVar[str] = "openai_compatible" diff --git a/libraries/python/getpatter/llm/openclaw.py b/libraries/python/getpatter/llm/openclaw.py new file mode 100644 index 00000000..a11c5caf --- /dev/null +++ b/libraries/python/getpatter/llm/openclaw.py @@ -0,0 +1,94 @@ +"""OpenClaw agent-runtime LLM for Patter pipeline mode. + +Thin preset over :class:`getpatter.llm.openai_compatible.OpenAICompatibleLLMProvider` +that targets a specific OpenClaw agent directly. Patter is the voice shell +(carrier + STT + turn-taking + TTS); the OpenClaw agent is the brain on the +line — each turn is one ``POST {base_url}/chat/completions`` against the local +OpenClaw gateway with ``model="openclaw/{agent}"``. + +Naming and agent-target semantics are aligned byte-for-byte with the shipped +consult preset (:meth:`getpatter.models.ConsultConfig.openclaw`): same +``:18789/v1`` base URL, same ``OPENCLAW_API_KEY`` env, same agent-id charset and +``openclaw/`` namespacing rule, same ``x-openclaw-session-key`` session +header. The constants are imported from :mod:`getpatter.models` so the two paths +can never drift — an agent id valid for consult is valid here. + +Unlike the consult preset (whose ``timeout_s=30`` is a phone-safe filler +default for an on-demand escalation), the LLM-provider default is ``120`` s +because here the runtime *is* the per-turn brain. +""" + +from __future__ import annotations + +from typing import ClassVar + +from getpatter.llm.openai_compatible import OpenAICompatibleLLMProvider +from getpatter.models import ( + _OPENCLAW_AGENT_RE, + _OPENCLAW_API_KEY_ENV, + _OPENCLAW_DEFAULT_BASE_URL, + _OPENCLAW_SESSION_HEADER, +) + +__all__ = ["LLM"] + + +class LLM(OpenAICompatibleLLMProvider): + """OpenClaw agent-runtime LLM provider.
+ + Example:: + + from getpatter.llm import openclaw + + llm = openclaw.LLM(agent="receptionist") # → model "openclaw/receptionist" + llm = openclaw.LLM(agent="openclaw/custom") # already-namespaced, passed through + llm = openclaw.LLM(agent="receptionist", api_key="...") + + Args: + agent: OpenClaw agent id (e.g. ``"receptionist"``) → targets + ``model="openclaw/{agent}"``. An already-namespaced target + (``"openclaw/x"``, ``"openclaw:x"``, ``"agent:x"``) is passed + through unchanged. Validated against the same charset rule the + shipped consult preset uses (``^[A-Za-z0-9._:/-]+$``). + base_url: OpenClaw gateway base URL. Defaults to + ``http://127.0.0.1:18789/v1``. + api_key: OPERATOR-grade bearer (never logged). Defaults to the + ``OPENCLAW_API_KEY`` env var. + timeout: Per-request timeout in seconds. Default ``120.0``. + + Defaults: + + * ``session_user_prefix`` → ``"patter-call-"`` + * ``session_id_header`` → ``"x-openclaw-session-key"`` carrying the raw + ``call_id`` (``session_id_prefix=""``). OpenClaw keys sessions off both + the ``user`` field and this header.
+ """ + + provider_key: ClassVar[str] = "openclaw" + + def __init__( + self, + agent: str, + *, + base_url: str = _OPENCLAW_DEFAULT_BASE_URL, + api_key: str | None = None, + timeout: float = 120.0, + **kwargs, + ) -> None: + if not agent or not _OPENCLAW_AGENT_RE.fullmatch(agent): + raise ValueError( + "OpenClaw agent must be a non-empty id of letters, digits, and " + "._:/- only" + ) + model = agent if (":" in agent or "/" in agent) else f"openclaw/{agent}" + super().__init__( + api_key=api_key, + base_url=base_url, + model=model, + api_key_env=_OPENCLAW_API_KEY_ENV, + timeout=timeout, + session_user_prefix="patter-call-", + session_id_header=_OPENCLAW_SESSION_HEADER, + session_id_prefix="", + **kwargs, + ) diff --git a/libraries/python/getpatter/models.py b/libraries/python/getpatter/models.py index 560e1616..f22c21b4 100644 --- a/libraries/python/getpatter/models.py +++ b/libraries/python/getpatter/models.py @@ -419,6 +419,16 @@ class Agent: model: str = "gpt-realtime-mini" language: str = "en" first_message: str = "" + # Opt-in spoken fallback for pipeline mode when the per-turn LLM stream + # raises (gateway-down / 120 s timeout) BEFORE any assistant text was + # spoken. Agent-runtime providers (Hermes / OpenClaw) run tools+memory + # internally so a turn can take 30-90 s; on failure the caller currently + # hears SILENCE then a silent turn-end. When set to a non-empty string, + # the SDK synthesizes and speaks this line through the normal TTS turn + # lifecycle (subject to barge-in). ``None`` (default) preserves today's + # behaviour: nothing is spoken on LLM error. Pipeline mode only — + # Realtime / ConvAI surface provider errors on their own audio path. + llm_error_message: str | None = None tools: tuple[dict, ...] 
| None = None provider: ProviderMode = "openai_realtime" stt: STTConfig | None = None # which STT provider to use in pipeline mode diff --git a/libraries/python/getpatter/providers/anthropic_llm.py b/libraries/python/getpatter/providers/anthropic_llm.py index 08849455..dac94a3e 100644 --- a/libraries/python/getpatter/providers/anthropic_llm.py +++ b/libraries/python/getpatter/providers/anthropic_llm.py @@ -165,9 +165,13 @@ async def stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict]: """Stream chunks from Anthropic's Messages API. + ``call_id`` is accepted for protocol parity with session-aware + providers but ignored — Anthropic has no per-call session model. + Translates OpenAI-style ``messages``/``tools`` to Anthropic's shape, then normalises the event stream back into the Patter chunk protocol. ``cancel_event`` (set on barge-in by the stream diff --git a/libraries/python/getpatter/providers/cerebras_llm.py b/libraries/python/getpatter/providers/cerebras_llm.py index 91769c35..1972ac26 100644 --- a/libraries/python/getpatter/providers/cerebras_llm.py +++ b/libraries/python/getpatter/providers/cerebras_llm.py @@ -250,6 +250,7 @@ async def stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict]: """Stream from Cerebras, delegating SSE consumption to the parent. @@ -259,10 +260,14 @@ async def stream( the generator returns silently — voice pipelines treat LLM provider failures as recoverable (the call continues; the user just hears no LLM response), so raising would be a behavioural change. + + ``call_id`` is accepted for protocol parity and forwarded to the + parent (which ignores it — Cerebras is a raw-inference provider with + no per-call session model). 
""" try: async for chunk in super().stream( - messages, tools, cancel_event=cancel_event + messages, tools, cancel_event=cancel_event, call_id=call_id ): yield chunk except Exception as exc: diff --git a/libraries/python/getpatter/providers/google_llm.py b/libraries/python/getpatter/providers/google_llm.py index 8f0524ec..623981cf 100644 --- a/libraries/python/getpatter/providers/google_llm.py +++ b/libraries/python/getpatter/providers/google_llm.py @@ -167,9 +167,13 @@ async def stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict]: """Stream chunks from Gemini's ``generate_content_stream``. + ``call_id`` is accepted for protocol parity with session-aware + providers but ignored — Gemini has no per-call session model. + ``cancel_event`` (set on barge-in by the stream handler) is checked between chunks and short-circuits the stream so the underlying request is freed immediately instead of blocking the next user diff --git a/libraries/python/getpatter/services/fallback_provider.py b/libraries/python/getpatter/services/fallback_provider.py index 6c8c5e3b..b717b7af 100644 --- a/libraries/python/getpatter/services/fallback_provider.py +++ b/libraries/python/getpatter/services/fallback_provider.py @@ -112,6 +112,7 @@ async def complete_stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[str]: """Stream only the text deltas, flattening the chunk envelope. @@ -119,7 +120,9 @@ async def complete_stream( assistant's text output and don't need tool-call or done markers. Mirrors the TypeScript SDK's ``fallback.completeStream`` shape. 
""" - async for chunk in self.stream(messages, tools, cancel_event=cancel_event): + async for chunk in self.stream( + messages, tools, cancel_event=cancel_event, call_id=call_id + ): if chunk.get("type") == "text": yield chunk.get("content", "") @@ -129,6 +132,7 @@ async def stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict]: """Try providers in sequence, yielding chunks from the first that succeeds. @@ -138,6 +142,10 @@ async def stream( built-in LLM loop calls ``provider.stream(..., cancel_event=...)`` on every turn, and a fallback that dropped the kwarg would raise ``TypeError`` on the first turn. + + ``call_id`` (optional, per-call session id) is forwarded the same way so + session-aware delegate providers (Hermes / OpenClaw / OpenAI-compatible) + still receive it when wrapped by the fallback. """ errors: list[Exception] = [] @@ -148,6 +156,7 @@ async def stream( available_only=True, errors=errors, cancel_event=cancel_event, + call_id=call_id, ): if isinstance(chunk, _Done): return @@ -163,6 +172,7 @@ async def stream( available_only=False, errors=errors, cancel_event=cancel_event, + call_id=call_id, ): if isinstance(chunk, _Done): return @@ -185,6 +195,7 @@ async def _try_providers( available_only: bool, errors: list[Exception], cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict | _Done]: """Try each provider, yielding chunks or a _Done sentinel.""" for i, provider in enumerate(self._providers): @@ -202,7 +213,7 @@ async def _try_providers( yielded_tokens = False async for chunk in provider.stream( - messages, tools, cancel_event=cancel_event + messages, tools, cancel_event=cancel_event, call_id=call_id ): yield chunk yielded_tokens = True diff --git a/libraries/python/getpatter/services/llm_loop.py b/libraries/python/getpatter/services/llm_loop.py index 42144242..2dc52c3b 100644 --- 
a/libraries/python/getpatter/services/llm_loop.py +++ b/libraries/python/getpatter/services/llm_loop.py @@ -16,6 +16,7 @@ ] import asyncio +import inspect import json import logging from dataclasses import dataclass @@ -36,6 +37,40 @@ logger = logging.getLogger("getpatter") +# Per-provider-TYPE memo of whether ``stream`` accepts a ``call_id`` keyword. +# Built-in providers declare ``call_id`` (or ``**kwargs``) and hit the fast +# path after the first call; a user's minimal custom provider whose ``stream`` +# is ``(self, messages, tools=None, *, cancel_event=None)`` is detected once and +# called WITHOUT ``call_id`` thereafter — otherwise it would raise TypeError. +_provider_accepts_call_id: dict[type, bool] = {} + + +def _stream_accepts_call_id(provider: object) -> bool: + """Whether ``provider.stream`` tolerates a ``call_id`` keyword argument. + + True when the signature declares a parameter named ``call_id`` OR accepts + ``**kwargs`` (``VAR_KEYWORD``). Cached per provider type to keep the hot + path cheap. Some callables (C-level, ``functools.partial`` without + ``__wrapped__``) refuse introspection — those default to ``False`` so the + safe no-``call_id`` path is taken rather than risking a new crash site. 
+ """ + provider_type = type(provider) + cached = _provider_accepts_call_id.get(provider_type) + if cached is not None: + return cached + accepts = False + try: + sig = inspect.signature(provider.stream) + for param in sig.parameters.values(): + if param.name == "call_id" or param.kind is inspect.Parameter.VAR_KEYWORD: + accepts = True + break + except (ValueError, TypeError): # pragma: no cover - exotic callables + accepts = False + _provider_accepts_call_id[provider_type] = accepts + return accepts + + # --------------------------------------------------------------------------- # Streaming chunk type — public mirror of TypeScript ``LLMChunk`` # --------------------------------------------------------------------------- @@ -404,6 +439,7 @@ async def stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict]: """Yield streaming chunks for the given messages and tools. @@ -417,6 +453,12 @@ async def stream( interruption" symptom. Optional for backward compatibility; providers that don't honour it are still usable but the user-facing interrupt-then-respond loop will be slower. + + ``call_id`` is the stable per-call identifier (optional). Agent-runtime + providers (Hermes / OpenClaw / any OpenAI-compatible gateway) thread it + into the OpenAI ``user`` field so the runtime derives one session per + phone call. Existing providers ignore it harmlessly — it is purely + additive and OFF unless a provider opts in via ``session_user_prefix``. """ ... # pragma: no cover @@ -605,6 +647,7 @@ async def stream( tools: list[dict] | None = None, *, cancel_event: asyncio.Event | None = None, + call_id: str | None = None, ) -> AsyncIterator[dict]: """Yield normalised chunks from OpenAI Chat Completions. @@ -622,6 +665,12 @@ async def stream( is checked between upstream chunks and short-circuits the stream immediately so the next user transcript is not blocked behind a long-running fetch. 
+ + ``call_id`` (optional) is accepted for protocol parity with + session-aware providers but ignored here — the base OpenAI provider + emits no per-call ``user`` field. Subclasses (e.g. the + OpenAI-compatible agent-runtime provider) override ``stream`` to thread + it into ``_build_completion_kwargs``. """ kwargs = self._build_completion_kwargs(messages, tools) response = await self._client.chat.completions.create(**kwargs) @@ -912,9 +961,25 @@ async def run( _span_cm.__enter__() _span_exc_info: tuple = (None, None, None) try: - async for chunk in self._provider.stream( - messages, self._openai_tools, cancel_event=cancel_event - ): + # Only thread ``call_id`` into providers whose ``stream`` + # accepts it (or ``**kwargs``). A user's minimal custom provider + # with ``(messages, tools=None, *, cancel_event=None)`` would + # otherwise raise TypeError on the added keyword. ``cancel_event`` + # predates this and every Protocol implementer tolerates it. + if _stream_accepts_call_id(self._provider): + stream_iter = self._provider.stream( + messages, + self._openai_tools, + cancel_event=cancel_event, + call_id=call_context.get("call_id"), + ) + else: + stream_iter = self._provider.stream( + messages, + self._openai_tools, + cancel_event=cancel_event, + ) + async for chunk in stream_iter: chunk_type = chunk.get("type") if chunk_type == "text": diff --git a/libraries/python/getpatter/stream_handler.py b/libraries/python/getpatter/stream_handler.py index ea221438..a3b9e129 100644 --- a/libraries/python/getpatter/stream_handler.py +++ b/libraries/python/getpatter/stream_handler.py @@ -3071,9 +3071,14 @@ async def _synthesize_sentence( if not self._is_speaking: return False # barge-in fired during the hook await - if first_tts_chunk[0] and self.metrics is not None: - self.metrics.record_tts_first_byte() + if first_tts_chunk[0]: + # Flip the per-turn "first PCM chunk emitted" flag BEFORE + # the metrics branch so it is a reliable "audio reached the + # carrier" signal even 
when ``self.metrics is None`` — the + # llm_error_message fallback gate depends on it. first_tts_chunk[0] = False + if self.metrics is not None: + self.metrics.record_tts_first_byte() # Speech-event: per-turn first TTS audio chunk. Idempotent # in the dispatcher; fires for the first sentence's first # synthesized chunk per turn. @@ -3190,6 +3195,29 @@ async def _process_streaming_response(self, result, call_id: str) -> str: if self.metrics is not None and self.metrics.turn_active: self.metrics.record_turn_interrupted() + # Opt-in spoken fallback: when the LLM stream raised BEFORE any + # assistant audio was emitted this turn and the agent configured + # a non-empty ``llm_error_message``, speak that line through the + # normal TTS turn lifecycle (subject to barge-in). Gated on + # ``first_tts_chunk[0]`` — still ``True`` means no PCM chunk has + # been sent to the carrier yet, i.e. the caller heard SILENCE — + # rather than on token receipt, so a provider that streams + # partial tokens ('Let me check…') and then times out before a + # sentence boundary (the chunker never produced a complete + # sentence, so TTS never ran) still triggers the fallback. Also + # gated on ``_is_speaking`` so a concurrent barge-in that flipped + # the floor does not get talked over. Wrapped in its own guard so + # a TTS outage on top of an LLM outage degrades to today's + # silence rather than raising out of the handler. 
+ fallback = getattr(self.agent, "llm_error_message", None) + if fallback and first_tts_chunk[0] and self._is_speaking: + try: + await self._synthesize_sentence( + fallback, hook_executor, hook_ctx, first_tts_chunk + ) + except Exception: # pragma: no cover - defensive + logger.exception("llm_error_message fallback synthesis failed") + if self.metrics is not None: self.metrics.record_llm_complete() diff --git a/libraries/python/tests/test_llm_hermes_openclaw_presets.py b/libraries/python/tests/test_llm_hermes_openclaw_presets.py new file mode 100644 index 00000000..c0a2a6ac --- /dev/null +++ b/libraries/python/tests/test_llm_hermes_openclaw_presets.py @@ -0,0 +1,212 @@ +"""Tests for the Hermes and OpenClaw thin LLM presets. + +Real construction throughout — no mocks. The presets defer to +``OpenAICompatibleLLMProvider`` so these assertions read the live constructed +client (base URL / timeout) and the session-continuity config. +""" + +from __future__ import annotations + +import pytest + +from getpatter.llm import hermes, openclaw +from getpatter.models import ( + _OPENCLAW_API_KEY_ENV, + _OPENCLAW_DEFAULT_BASE_URL, + _OPENCLAW_SESSION_HEADER, +) + + +def _base_url_str(provider) -> str: + return str(provider._client.base_url) + + +# --------------------------------------------------------------------------- +# Hermes +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_hermes_defaults_base_url_model_timeout(monkeypatch) -> None: + monkeypatch.delenv("API_SERVER_MODEL_NAME", raising=False) + monkeypatch.delenv("API_SERVER_KEY", raising=False) + llm = hermes.LLM() + assert _base_url_str(llm).startswith("http://127.0.0.1:8642/v1") + assert llm._model == "hermes-agent" + assert llm._client.timeout == 120.0 + # Hermes is stateless and keys continuity off HEADERS: + # X-Hermes-Session-Id (per call) + optional X-Hermes-Session-Key (memory). 
+ assert llm._session_user_prefix == "patter-call-" + assert llm._session_id_header == "X-Hermes-Session-Id" + assert llm._session_id_prefix == "patter-call-" + assert llm._session_key_header == "X-Hermes-Session-Key" + assert llm.provider_key == "hermes" + + +@pytest.mark.unit +def test_hermes_session_key_off_by_default_and_configurable() -> None: + # Default: no session_key value held => X-Hermes-Session-Key not emitted. + assert hermes.LLM()._session_key is None + # Configurable long-term memory scope. + llm = hermes.LLM(session_key="mem-123") + assert llm._session_key == "mem-123" + kwargs = llm._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="c1" + ) + assert kwargs["extra_headers"]["X-Hermes-Session-Key"] == "mem-123" + assert kwargs["extra_headers"]["X-Hermes-Session-Id"] == "patter-call-c1" + + +@pytest.mark.unit +def test_hermes_model_env_override(monkeypatch) -> None: + monkeypatch.setenv("API_SERVER_MODEL_NAME", "hermes-7b") + assert hermes.LLM()._model == "hermes-7b" + # Explicit model arg still wins over the env default. + assert hermes.LLM(model="hermes-custom")._model == "hermes-custom" + + +@pytest.mark.unit +def test_hermes_api_key_from_env(monkeypatch) -> None: + monkeypatch.setenv("API_SERVER_KEY", "hermes-key") + assert hermes.LLM()._client.api_key == "hermes-key" + # Keyless local Hermes — absent env, no api_key — still constructs. + monkeypatch.delenv("API_SERVER_KEY", raising=False) + assert hermes.LLM()._client.api_key == "EMPTY" + + +# --------------------------------------------------------------------------- +# OpenClaw +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_openclaw_agent_maps_to_namespaced_model() -> None: + assert openclaw.LLM(agent="receptionist")._model == "openclaw/receptionist" + # Already-namespaced ids pass through unchanged. 
+ assert openclaw.LLM(agent="openclaw/custom")._model == "openclaw/custom" + assert openclaw.LLM(agent="openclaw:custom")._model == "openclaw:custom" + assert openclaw.LLM(agent="agent:x")._model == "agent:x" + + +@pytest.mark.unit +def test_openclaw_rejects_invalid_agent_id() -> None: + with pytest.raises(ValueError, match="letters, digits"): + openclaw.LLM(agent="a b") # space is outside the charset + with pytest.raises(ValueError): + openclaw.LLM(agent="") + + +@pytest.mark.unit +def test_openclaw_defaults_match_consult_preset(monkeypatch) -> None: + monkeypatch.delenv("OPENCLAW_API_KEY", raising=False) + llm = openclaw.LLM(agent="receptionist") + # Byte-identical to the shipped consult preset constants in models.py. + assert _base_url_str(llm).startswith(_OPENCLAW_DEFAULT_BASE_URL) + assert _OPENCLAW_DEFAULT_BASE_URL == "http://127.0.0.1:18789/v1" + assert _OPENCLAW_API_KEY_ENV == "OPENCLAW_API_KEY" + # Wire-identical to the old session_header behaviour: per-call header with + # an empty prefix => the raw call id. + assert ( + llm._session_id_header == _OPENCLAW_SESSION_HEADER == "x-openclaw-session-key" + ) + assert llm._session_id_prefix == "" + assert llm._session_user_prefix == "patter-call-" + # OpenClaw has no separate memory-scope header. 
+ assert llm._session_key_header is None + assert llm._client.timeout == 120.0 + assert llm.provider_key == "openclaw" + + +@pytest.mark.unit +def test_openclaw_wire_output_is_byte_identical(monkeypatch) -> None: + """The OpenClaw preset emits user='patter-call-' and the raw call id in + the x-openclaw-session-key header — unchanged by the param rename.""" + monkeypatch.delenv("OPENCLAW_API_KEY", raising=False) + llm = openclaw.LLM(agent="receptionist") + kwargs = llm._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="c2" + ) + assert kwargs["user"] == "patter-call-c2" + assert kwargs["extra_headers"] == {"x-openclaw-session-key": "c2"} + + +@pytest.mark.unit +def test_openclaw_api_key_from_env(monkeypatch) -> None: + monkeypatch.setenv("OPENCLAW_API_KEY", "operator-grade-token") + llm = openclaw.LLM(agent="receptionist") + assert llm._client.api_key == "operator-grade-token" + + +# --------------------------------------------------------------------------- +# Wire-level — mocks ONLY the paid boundary (chat.completions.create). 
+# --------------------------------------------------------------------------- + + +class _Choice: + def __init__(self, content) -> None: + self.delta = type("D", (), {"content": content, "tool_calls": None})() + + +class _Chunk: + def __init__(self, content) -> None: + self.choices = [_Choice(content)] + self.usage = None + + +class _FakeStream: + def __init__(self, chunks) -> None: + self._chunks = chunks + + def __aiter__(self): + return self._gen() + + async def _gen(self): + for chunk in self._chunks: + yield chunk + + async def close(self) -> None: # pragma: no cover - not exercised + pass + + +async def _capture_create_kwargs(llm, call_id="hid-1") -> dict: + captured: dict = {} + + async def fake_create(**kwargs): + captured.update(kwargs) + return _FakeStream([_Chunk("ok")]) + + llm._client.chat.completions.create = fake_create + async for _ in llm.stream( + [{"role": "user", "content": "hi"}], None, call_id=call_id + ): + pass + return captured + + +@pytest.mark.mocked +async def test_hermes_sends_session_id_header_by_default() -> None: + """Hermes emits X-Hermes-Session-Id=patter-call- on the wire; the + memory-scope header is absent unless session_key is configured.""" + captured = await _capture_create_kwargs(hermes.LLM(), call_id="hid-1") + headers = captured["extra_headers"] + assert headers["X-Hermes-Session-Id"] == "patter-call-hid-1" + assert "X-Hermes-Session-Key" not in headers + + +@pytest.mark.mocked +async def test_hermes_sends_session_key_header_when_configured() -> None: + captured = await _capture_create_kwargs( + hermes.LLM(session_key="mem-xyz"), call_id="hid-2" + ) + headers = captured["extra_headers"] + assert headers["X-Hermes-Session-Id"] == "patter-call-hid-2" + assert headers["X-Hermes-Session-Key"] == "mem-xyz" + + +@pytest.mark.mocked +async def test_openclaw_sends_raw_call_id_header_on_the_wire() -> None: + captured = await _capture_create_kwargs( + openclaw.LLM(agent="receptionist"), call_id="c3" + ) + assert captured["user"] == 
"patter-call-c3" + assert captured["extra_headers"] == {"x-openclaw-session-key": "c3"} diff --git a/libraries/python/tests/test_llm_loop_call_id_threading.py b/libraries/python/tests/test_llm_loop_call_id_threading.py new file mode 100644 index 00000000..7797d5bc --- /dev/null +++ b/libraries/python/tests/test_llm_loop_call_id_threading.py @@ -0,0 +1,167 @@ +"""The LLM loop threads call_id from call_context into provider.stream(). + +Real ``LLMLoop`` end to end. The "provider" is a tiny in-process recording +double that records the ``call_id`` kwarg it received — it is NOT a mock of the +unit under test (the loop is real); it only stands in for the external LLM +endpoint, exactly like the existing FakeLLMProvider in test_llm_loop.py. +""" + +from __future__ import annotations + +import pytest + +from getpatter.services.llm_loop import LLMLoop, _stream_accepts_call_id + + +def _make_loop(provider) -> LLMLoop: + """Construct a real LLMLoop around a recording provider (no network).""" + loop = LLMLoop.__new__(LLMLoop) + loop._provider = provider + loop._system_prompt = "You are a test assistant." 
+ loop._tools = None + loop._tool_executor = None + loop._metrics = None + loop._event_bus = None + loop._model = "fake-model" + loop._provider_name = "fake" + loop._openai_tools = None + loop._tool_map = {} + loop._on_tool_call = None + loop._usage_missing_count = 0 + loop._logged_usage_fallback = False + return loop + + +class _RecordingProvider: + """Records the call_id it was streamed with, then yields one text chunk.""" + + def __init__(self) -> None: + self.seen_call_id: object = "<>" + self.stream_called = False + + async def stream(self, messages, tools=None, *, cancel_event=None, call_id=None): + self.stream_called = True + self.seen_call_id = call_id + yield {"type": "text", "content": "ok"} + + +class _LegacyProvider: + """An older provider that only reads **kwargs (mirrors a stock + OpenAILLMProvider-shaped double) — no session config, no `user` emitted.""" + + def __init__(self) -> None: + self.seen_kwargs: dict = {} + + async def stream(self, messages, tools=None, **kwargs): + self.seen_kwargs = dict(kwargs) + yield {"type": "text", "content": "legacy"} + + +class _MinimalProvider: + """A minimal custom provider whose stream takes NO call_id and NO **kwargs. + + This is the case the inspect.signature guard protects: the loop must NOT + pass call_id, or this raises ``TypeError: stream() got an unexpected + keyword argument 'call_id'``. 
+ """ + + def __init__(self) -> None: + self.stream_called = False + + async def stream(self, messages, tools=None, *, cancel_event=None): + self.stream_called = True + yield {"type": "text", "content": "minimal"} + + +@pytest.mark.unit +async def test_run_forwards_call_id_from_context_into_provider_stream() -> None: + provider = _RecordingProvider() + loop = _make_loop(provider) + + tokens = [] + async for token in loop.run("Hi", [], {"call_id": "xyz"}): + tokens.append(token) + + assert provider.stream_called is True + assert provider.seen_call_id == "xyz" + assert tokens == ["ok"] + + +@pytest.mark.unit +async def test_run_passes_none_call_id_when_context_lacks_it() -> None: + provider = _RecordingProvider() + loop = _make_loop(provider) + + async for _ in loop.run("Hi", [], {}): # no call_id key + pass + + assert provider.seen_call_id is None + + +@pytest.mark.unit +async def test_legacy_provider_ignores_call_id_without_error() -> None: + """A provider that only takes **kwargs still works — the added call_id + kwarg is swallowed harmlessly (backward compatibility of the protocol).""" + provider = _LegacyProvider() + loop = _make_loop(provider) + + tokens = [] + async for token in loop.run("Hi", [], {"call_id": "abc"}): + tokens.append(token) + + assert tokens == ["legacy"] + # The loop did pass call_id; the legacy provider absorbed it via **kwargs. 
+ assert provider.seen_kwargs.get("call_id") == "abc" + + +@pytest.mark.unit +async def test_minimal_provider_without_call_id_param_runs_without_error() -> None: + """A custom provider whose stream lacks call_id AND **kwargs must still run + — the loop's signature guard skips call_id for it (no TypeError).""" + provider = _MinimalProvider() + loop = _make_loop(provider) + + tokens = [] + async for token in loop.run("Hi", [], {"call_id": "abc"}): + tokens.append(token) + + assert provider.stream_called is True + assert tokens == ["minimal"] + + +@pytest.mark.unit +def test_signature_guard_classifies_providers() -> None: + """The introspection guard accepts providers that declare call_id or + **kwargs and rejects the minimal one — the load-bearing back-compat check.""" + assert _stream_accepts_call_id(_RecordingProvider()) is True # declares call_id + assert _stream_accepts_call_id(_LegacyProvider()) is True # **kwargs + assert _stream_accepts_call_id(_MinimalProvider()) is False + + +@pytest.mark.unit +def test_signature_guard_defaults_to_no_call_id_when_uninspectable(monkeypatch) -> None: + """When ``inspect.signature`` cannot introspect a provider's ``stream`` + (some C-level callables raise ValueError/TypeError), the guard catches it + and defaults to the safe no-call_id path rather than propagating. + + Forces ``inspect.signature`` to raise so the ``except`` branch is actually + exercised — relying on a specific builtin being uninspectable is brittle + (e.g. ``range`` *is* inspectable on CPython 3.11+ via ``__text_signature__``). + """ + import getpatter.services.llm_loop as _loop + + def _raise(*_args, **_kwargs): + raise ValueError("no signature available") + + monkeypatch.setattr(_loop.inspect, "signature", _raise) + + class _Uninspectable: + # Declares call_id, but the forced signature() failure must still drive + # the guard into its except branch → conservative no-call_id path. 
+ async def stream( + self, messages, tools=None, *, cancel_event=None, call_id=None + ): + yield {"type": "text", "content": ""} + + # Must not raise, and must take the conservative (no call_id) path. + assert _stream_accepts_call_id(_Uninspectable()) is False diff --git a/libraries/python/tests/test_llm_openai_compatible.py b/libraries/python/tests/test_llm_openai_compatible.py new file mode 100644 index 00000000..ea00b111 --- /dev/null +++ b/libraries/python/tests/test_llm_openai_compatible.py @@ -0,0 +1,424 @@ +"""Tests for the generic OpenAI-compatible LLM provider. + +These exercise the REAL provider: real construction of the ``AsyncOpenAI`` +client (base URL / timeout / headers), real ``_build_completion_kwargs`` +assembly, and real SSE-chunk normalisation. The ONLY mocked surface is the +paid external boundary — ``AsyncOpenAI.chat.completions.create`` — and that +test is tagged ``@pytest.mark.mocked``. +""" + +from __future__ import annotations + +import pytest + +from getpatter.llm.openai_compatible import OpenAICompatibleLLMProvider +from getpatter.services.llm_loop import LLMProvider + + +def _base_url_str(provider: OpenAICompatibleLLMProvider) -> str: + """Read the constructed client's base URL (AsyncOpenAI appends a slash).""" + return str(provider._client.base_url) + + +@pytest.mark.unit +def test_openai_compatible_provider_points_client_at_base_url_with_timeout() -> None: + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", model="m", timeout=120.0 + ) + # Real client carries the base URL and the long (non-default) timeout. + assert _base_url_str(provider).startswith("http://127.0.0.1:9/v1") + assert provider._client.timeout == 120.0 + assert provider._model == "m" + # Satisfies the LLMProvider protocol. + assert isinstance(provider, LLMProvider) + + +@pytest.mark.unit +def test_keyless_gateway_construction_does_not_raise(monkeypatch) -> None: + # No api_key, no api_key_env — Ollama / vLLM / LM Studio keyless path. 
+ provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:11434/v1", model="llama3.1" + ) + # The EMPTY sentinel keeps AsyncOpenAI (which rejects None) happy. + assert provider._client.api_key == "EMPTY" + + +@pytest.mark.unit +def test_api_key_resolved_from_env_var(monkeypatch) -> None: + monkeypatch.setenv("MY_GATEWAY_KEY", "secret-token-value") + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + api_key_env="MY_GATEWAY_KEY", + ) + # The resolved key reaches the client (and is never logged). + assert provider._client.api_key == "secret-token-value" + + +@pytest.mark.unit +def test_explicit_api_key_wins_over_env(monkeypatch) -> None: + monkeypatch.setenv("MY_GATEWAY_KEY", "from-env") + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + api_key="explicit-key", + api_key_env="MY_GATEWAY_KEY", + ) + assert provider._client.api_key == "explicit-key" + + +@pytest.mark.unit +def test_session_user_prefix_off_by_default_omits_user_field() -> None: + provider = OpenAICompatibleLLMProvider(base_url="http://127.0.0.1:9/v1", model="m") + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + # Backward compatible: no session prefix => no `user` field. 
+ assert "user" not in kwargs + + +@pytest.mark.unit +def test_session_user_prefix_emits_stable_call_user() -> None: + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_user_prefix="patter-call-", + ) + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + assert kwargs["user"] == "patter-call-abc" + + +@pytest.mark.unit +def test_session_user_field_omitted_without_call_id() -> None: + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_user_prefix="patter-call-", + ) + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id=None + ) + # Prefix set but no call id => still no `user` field. + assert "user" not in kwargs + + +@pytest.mark.unit +def test_session_id_header_emits_prefixed_value_independent_of_user() -> None: + """session_id_header + session_id_prefix produce + extra_headers[name]=f'{prefix}{call_id}' WITHOUT needing the user field.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + # Note: no session_user_prefix — the header stands alone. + session_id_header="X-Hermes-Session-Id", + session_id_prefix="patter-call-", + ) + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + assert kwargs["extra_headers"] == {"X-Hermes-Session-Id": "patter-call-abc"} + # Decoupled: no user field because session_user_prefix is unset. 
+ assert "user" not in kwargs + + +@pytest.mark.unit +def test_session_key_header_emits_static_value_regardless_of_call_id() -> None: + """session_key_header + session_key emit a STATIC header (no call_id + interpolation), present even when no call_id is available.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_key_header="X-Hermes-Session-Key", + session_key="mem-scope-123", + ) + # No call_id at all — the memory-scope header is per-call-independent. + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id=None + ) + assert kwargs["extra_headers"] == {"X-Hermes-Session-Key": "mem-scope-123"} + + +@pytest.mark.unit +def test_session_key_header_without_value_is_omitted() -> None: + """session_key_header set but session_key None => header omitted (opt-in).""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_key_header="X-Hermes-Session-Key", + # session_key intentionally unset. + ) + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + assert "extra_headers" not in kwargs + + +@pytest.mark.unit +def test_all_three_signals_combine_without_clobbering_existing_headers() -> None: + """user + session_id_header + session_key_header merge into one + extra_headers dict that also preserves a pre-existing header.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_user_prefix="patter-call-", + session_id_header="X-Hermes-Session-Id", + session_id_prefix="patter-call-", + session_key_header="X-Hermes-Session-Key", + session_key="mem-9", + ) + # Simulate a pre-existing extra_headers on the kwargs (future-safe merge). 
+ import getpatter.services.llm_loop as base_mod + + orig = base_mod.OpenAILLMProvider._build_completion_kwargs + + def _with_existing(self, messages, tools): + kw = orig(self, messages, tools) + kw["extra_headers"] = {"X-Pre": "keep"} + return kw + + base_mod.OpenAILLMProvider._build_completion_kwargs = _with_existing + try: + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + finally: + base_mod.OpenAILLMProvider._build_completion_kwargs = orig + + assert kwargs["user"] == "patter-call-abc" + assert kwargs["extra_headers"] == { + "X-Pre": "keep", + "X-Hermes-Session-Id": "patter-call-abc", + "X-Hermes-Session-Key": "mem-9", + } + + +@pytest.mark.unit +def test_no_session_signals_is_byte_identical_to_parent() -> None: + """None of the three signals set => no `user`, no `extra_headers`.""" + provider = OpenAICompatibleLLMProvider(base_url="http://127.0.0.1:9/v1", model="m") + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + assert "user" not in kwargs + assert "extra_headers" not in kwargs + + +@pytest.mark.unit +def test_openclaw_shape_config_yields_raw_call_id_value() -> None: + """Regression: OpenClaw uses session_id_header with an empty prefix, so the + header value is the RAW call id — wire-identical to the old session_header + behaviour.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_user_prefix="patter-call-", + session_id_header="x-openclaw-session-key", + session_id_prefix="", + ) + kwargs = provider._build_completion_kwargs( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ) + assert kwargs["user"] == "patter-call-abc" + assert kwargs["extra_headers"] == {"x-openclaw-session-key": "abc"} + + +@pytest.mark.unit +def test_extra_headers_merge_preserves_user_agent() -> None: + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + 
extra_headers={"X-Foo": "1"}, + ) + headers = dict(provider._client.default_headers) + assert headers.get("X-Foo") == "1" + # User-Agent is the getpatter SDK attribution and must survive the merge. + assert headers.get("User-Agent", "").startswith("getpatter/") + + +# --------------------------------------------------------------------------- +# Streaming — mocks ONLY the paid external boundary (chat.completions.create). +# --------------------------------------------------------------------------- + + +class _Delta: + def __init__(self, content=None, tool_calls=None) -> None: + self.content = content + self.tool_calls = tool_calls + + +class _Choice: + def __init__(self, delta) -> None: + self.delta = delta + + +class _Chunk: + """Real OpenAI-SSE-shaped streaming chunk (the SDK yields these objects).""" + + def __init__(self, *, content=None, usage=None) -> None: + self.choices = [_Choice(_Delta(content=content))] if content is not None else [] + self.usage = usage + + +class _Usage: + def __init__(self, prompt_tokens, completion_tokens) -> None: + self.prompt_tokens = prompt_tokens + self.completion_tokens = completion_tokens + self.prompt_tokens_details = None + + +class _FakeStream: + """Async iterator over chunks, mimicking the AsyncOpenAI stream object.""" + + def __init__(self, chunks) -> None: + self._chunks = chunks + + def __aiter__(self): + return self._gen() + + async def _gen(self): + for chunk in self._chunks: + yield chunk + + async def close(self) -> None: # pragma: no cover - not exercised here + pass + + +@pytest.mark.mocked +async def test_stream_sends_user_field_and_speaks_content() -> None: + """The create() call receives user='patter-call-' and the inherited + real SSE loop normalises chunks to the Patter chunk protocol.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_user_prefix="patter-call-", + ) + + captured: dict = {} + + async def fake_create(**kwargs): + captured.update(kwargs) + 
return _FakeStream( + [ + _Chunk(content="Hello"), + _Chunk(content=" world"), + _Chunk(usage=_Usage(prompt_tokens=10, completion_tokens=3)), + ] + ) + + # Mock ONLY the paid external boundary. + provider._client.chat.completions.create = fake_create + + chunks = [] + async for chunk in provider.stream( + [{"role": "user", "content": "hi"}], None, call_id="xyz" + ): + chunks.append(chunk) + + # The per-call session user reached the wire. + assert captured["user"] == "patter-call-xyz" + + text = "".join(c["content"] for c in chunks if c["type"] == "text") + assert text == "Hello world" + + usage = [c for c in chunks if c["type"] == "usage"] + assert len(usage) == 1 + assert usage[0]["input_tokens"] == 10 + assert usage[0]["output_tokens"] == 3 + + +@pytest.mark.mocked +async def test_stream_without_session_prefix_omits_user_on_the_wire() -> None: + provider = OpenAICompatibleLLMProvider(base_url="http://127.0.0.1:9/v1", model="m") + + captured: dict = {} + + async def fake_create(**kwargs): + captured.update(kwargs) + return _FakeStream([_Chunk(content="ok")]) + + provider._client.chat.completions.create = fake_create + + async for _ in provider.stream( + [{"role": "user", "content": "hi"}], None, call_id="xyz" + ): + pass + + # Backward compatible: no `user` field unless the caller opts in. 
+ assert "user" not in captured + + +@pytest.mark.mocked +async def test_stream_sends_session_id_header_on_the_wire() -> None: + """A provider configured with session_id_header puts the per-call header + onto the create() call via extra_headers — independent of the user field.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:9/v1", + model="m", + session_id_header="X-Hermes-Session-Id", + session_id_prefix="patter-call-", + ) + + captured: dict = {} + + async def fake_create(**kwargs): + captured.update(kwargs) + return _FakeStream([_Chunk(content="ok")]) + + provider._client.chat.completions.create = fake_create + + async for _ in provider.stream( + [{"role": "user", "content": "hi"}], None, call_id="abc" + ): + pass + + assert captured["extra_headers"] == {"X-Hermes-Session-Id": "patter-call-abc"} + # No user field — session_user_prefix unset. + assert "user" not in captured + + +async def _capture_warmup_headers(provider, monkeypatch) -> dict: + """Run warmup() with only the httpx GET boundary mocked, returning the + real headers the provider assembled.""" + import httpx + + captured: dict = {} + + async def fake_get(self, url, headers=None, **kwargs): + captured["url"] = url + captured["headers"] = headers or {} + return None + + monkeypatch.setattr(httpx.AsyncClient, "get", fake_get) + await provider.warmup() + return captured + + +@pytest.mark.mocked +async def test_warmup_omits_authorization_for_keyless_gateway(monkeypatch) -> None: + """Keyless gateways (Ollama / vLLM / LM Studio) must not receive a + ``Bearer EMPTY`` header — some reject an unexpected Authorization header.""" + provider = OpenAICompatibleLLMProvider( + base_url="http://127.0.0.1:11434/v1", model="llama3.1" + ) + + captured = await _capture_warmup_headers(provider, monkeypatch) + + assert captured["url"] == "http://127.0.0.1:11434/v1/models" + assert "Authorization" not in captured["headers"] + + +@pytest.mark.mocked +async def 
test_warmup_sends_authorization_when_real_key_present(monkeypatch) -> None: + """When a real bearer is configured, warmup forwards it (matching the TS + provider) so authenticated gateways accept the prewarm request.""" + provider = OpenAICompatibleLLMProvider( + api_key="sk-real-key", + base_url="http://127.0.0.1:9/v1", + model="m", + ) + + captured = await _capture_warmup_headers(provider, monkeypatch) + + assert captured["headers"].get("Authorization") == "Bearer sk-real-key" diff --git a/libraries/python/tests/unit/test_llm_error_fallback.py b/libraries/python/tests/unit/test_llm_error_fallback.py new file mode 100644 index 00000000..0c49dd5f --- /dev/null +++ b/libraries/python/tests/unit/test_llm_error_fallback.py @@ -0,0 +1,289 @@ +"""Authentic tests for the opt-in spoken LLM-error fallback (pipeline mode). + +When the per-turn LLM stream raises (gateway-down / timeout) BEFORE any +assistant text was spoken, and the agent configured a non-empty +``llm_error_message``, the SDK speaks that line through the SAME TTS turn +lifecycle every normal sentence uses (``_synthesize_sentence`` → +``_tts.synthesize`` → ``audio_sender.send_audio``). + +Only the external boundary is mocked: the LLM provider's ``stream()`` raising +(the gateway hop) and the TTS byte boundary (``_tts.synthesize`` yielding PCM). +Everything from there inward — the real ``LLMLoop.run`` async generator, the +real ``PipelineStreamHandler._process_streaming_response`` error path, the real +``_synthesize_sentence`` speak primitive, and the real metrics accounting — runs +unmocked. These tests carry ``@pytest.mark.mocked`` because the provider stream +throw is an external-boundary mock. 
+""" + +from __future__ import annotations + +from collections import deque +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from getpatter.exceptions import PatterConnectionError +from getpatter.stream_handler import PipelineStreamHandler + +from tests.conftest import make_agent + +_FALLBACK = "Sorry, I am having trouble right now." + + +# --------------------------------------------------------------------------- +# Boundary doubles — the ONLY mocks: the LLM gateway stream and the TTS bytes +# --------------------------------------------------------------------------- + + +class _RaisingLLMProvider: + """LLM provider whose ``stream()`` raises before yielding any text. + + Mirrors a gateway-down / timeout: the real inherited provider path raises + ``PatterConnectionError`` on a non-OK response. This is the single external + boundary mocked in these tests. + """ + + async def stream(self, messages, tools=None, **_kwargs): + if False: # pragma: no cover - make this an async generator + yield {} + raise PatterConnectionError("gateway down") + + +class _PartialTokenThenRaiseLLMProvider: + """Yields one *partial* token (no sentence boundary), THEN raises. + + The chunker buffers ``"Let me check "`` without producing a complete + sentence, so ``_synthesize_sentence`` is never called and NO PCM reaches + the carrier — the caller heard SILENCE. This is the agent-runtime + (Hermes / OpenClaw) gateway-timeout case: tokens were received but the + fallback must still fire because nothing was actually spoken. + """ + + async def stream(self, messages, tools=None, **_kwargs): + yield {"type": "text", "content": "Let me check "} + raise PatterConnectionError("gateway down mid-stream") + + +class _SpokenSentenceThenRaiseLLMProvider: + """Yields a COMPLETE sentence (real TTS audio emitted), THEN raises. + + A full sentence flushes through the chunker, so ``_synthesize_sentence`` + runs and PCM reaches the carrier. 
The fallback must NOT fire on top — + the caller already heard speech and a tacked-on apology would double-speak. + """ + + async def stream(self, messages, tools=None, **_kwargs): + yield {"type": "text", "content": "Hello there. "} + raise PatterConnectionError("gateway down after a full sentence") + + +class _FakeTTS: + """TTS byte boundary — ``synthesize(text)`` yields a couple of PCM chunks. + + Records every text it was asked to synthesize so a test can assert the + fallback line (and nothing else) was spoken. + """ + + output_format = "pcm_16000" + + def __init__(self) -> None: + self.synthesized: list[str] = [] + + async def synthesize(self, text: str): + self.synthesized.append(text) + # Two 16-bit PCM frames of silence — enough to drive a send_audio. + yield b"\x00\x00" * 80 + yield b"\x00\x00" * 80 + + +def _make_loop(provider) -> object: + """Build a REAL ``LLMLoop`` wrapping the boundary provider double.""" + from getpatter.services.llm_loop import LLMLoop + + loop = LLMLoop.__new__(LLMLoop) + loop._provider = provider + loop._system_prompt = "You are a test assistant." + loop._tools = None + loop._tool_executor = None + loop._metrics = None + loop._event_bus = None + loop._model = "fake-model" + loop._provider_name = "fake" + loop._openai_tools = None + loop._tool_map = {} + loop._on_tool_call = None + return loop + + +def _make_handler(*, llm_error_message: str | None, tts) -> PipelineStreamHandler: + audio_sender = AsyncMock() + # reset_pcm_carry is called synchronously inside _synthesize_sentence. 
+ audio_sender.reset_pcm_carry = MagicMock() + handler = PipelineStreamHandler( + agent=make_agent(llm_error_message=llm_error_message), + audio_sender=audio_sender, + call_id="call-llm-err", + caller="+15551110000", + callee="+15552220000", + resolved_prompt="p", + metrics=None, + for_twilio=True, + on_transcript=None, + conversation_history=deque(maxlen=10), + transcript_entries=deque(maxlen=10), + ) + handler.on_message = None + handler._tts = tts # type: ignore[assignment] + handler._is_speaking = True + return handler + + +# --------------------------------------------------------------------------- +# Positive: fallback set + stream raises with zero text → line is spoken +# --------------------------------------------------------------------------- + + +@pytest.mark.mocked +class TestFallbackSpokenOnError: + async def test_fallback_line_is_synthesized_and_spoken(self) -> None: + tts = _FakeTTS() + handler = _make_handler(llm_error_message=_FALLBACK, tts=tts) + loop = _make_loop(_RaisingLLMProvider()) + + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + await handler._process_streaming_response(result, "call-llm-err") + + # Observable spoken-bytes outcome: the configured line went through the + # real TTS primitive AND real audio bytes were sent to the carrier. 
+ assert tts.synthesized == [_FALLBACK] + handler.audio_sender.send_audio.assert_awaited() + + +# --------------------------------------------------------------------------- +# Negative / regression: unset field → nothing spoken on error (today's behaviour) +# --------------------------------------------------------------------------- + + +@pytest.mark.mocked +class TestNoFallbackWhenUnset: + async def test_no_fallback_synthesized_when_field_is_none(self) -> None: + tts = _FakeTTS() + handler = _make_handler(llm_error_message=None, tts=tts) + loop = _make_loop(_RaisingLLMProvider()) + + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + await handler._process_streaming_response(result, "call-llm-err") + + # Today's behaviour preserved: silence on LLM error, no fallback speech. + assert tts.synthesized == [] + handler.audio_sender.send_audio.assert_not_awaited() + + async def test_empty_string_is_treated_as_unset(self) -> None: + tts = _FakeTTS() + handler = _make_handler(llm_error_message="", tts=tts) + loop = _make_loop(_RaisingLLMProvider()) + + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + await handler._process_streaming_response(result, "call-llm-err") + + assert tts.synthesized == [] + handler.audio_sender.send_audio.assert_not_awaited() + + +# --------------------------------------------------------------------------- +# Gate semantics: fallback fires on emitted-audio, not on received-tokens +# --------------------------------------------------------------------------- + + +@pytest.mark.mocked +class TestFallbackGatedOnEmittedAudio: + async def test_fallback_fires_when_partial_tokens_produced_no_audio(self) -> None: + """Partial tokens buffered by the chunker but never synthesized → the + caller heard SILENCE → the fallback line MUST still be spoken. 
+ + This is the agent-runtime gateway-timeout regression: gating on token + receipt (the old ``not full_response_parts`` check) wrongly suppressed + the fallback here even though no PCM ever reached the carrier. + """ + tts = _FakeTTS() + handler = _make_handler(llm_error_message=_FALLBACK, tts=tts) + loop = _make_loop(_PartialTokenThenRaiseLLMProvider()) + + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + await handler._process_streaming_response(result, "call-llm-err") + + # The partial token never produced a sentence, so the ONLY thing + # synthesized is the fallback line — and real audio was sent. + assert tts.synthesized == [_FALLBACK] + handler.audio_sender.send_audio.assert_awaited() + + async def test_fallback_suppressed_after_a_full_sentence_was_spoken(self) -> None: + """A complete sentence flushed real TTS audio before the raise → the + caller already heard speech → the fallback must NOT double-speak. + """ + tts = _FakeTTS() + handler = _make_handler(llm_error_message=_FALLBACK, tts=tts) + loop = _make_loop(_SpokenSentenceThenRaiseLLMProvider()) + + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + await handler._process_streaming_response(result, "call-llm-err") + + # The real sentence was spoken; the fallback line was NOT appended. + assert "Hello there." in tts.synthesized + assert _FALLBACK not in tts.synthesized + + +# --------------------------------------------------------------------------- +# Barge-in guard: speaking flipped off before the raise → no fallback +# --------------------------------------------------------------------------- + + +@pytest.mark.mocked +class TestBargeInSuppressesFallback: + async def test_fallback_not_spoken_when_not_speaking(self) -> None: + tts = _FakeTTS() + handler = _make_handler(llm_error_message=_FALLBACK, tts=tts) + + # Simulate a concurrent barge-in that flipped the floor off right as + # the stream raises. 
+ class _FlipThenRaise: + async def stream(self, messages, tools=None, **_kwargs): + if False: # pragma: no cover + yield {} + handler._is_speaking = False + raise PatterConnectionError("gateway down during barge-in") + + loop = _make_loop(_FlipThenRaise()) + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + await handler._process_streaming_response(result, "call-llm-err") + + assert tts.synthesized == [] + handler.audio_sender.send_audio.assert_not_awaited() + + +# --------------------------------------------------------------------------- +# Authenticity invariant: the positive test exercises the REAL speak primitive +# --------------------------------------------------------------------------- + + +@pytest.mark.mocked +class TestExercisesRealSpeakPrimitive: + async def test_fails_if_synthesize_sentence_is_not_real(self) -> None: + tts = _FakeTTS() + handler = _make_handler(llm_error_message=_FALLBACK, tts=tts) + loop = _make_loop(_RaisingLLMProvider()) + + async def _broken(*_a, **_k): + raise NotImplementedError + + # Replace the real speak primitive: the fallback's own try/except must + # swallow the failure (degrade to silence) — so the line is NOT spoken, + # proving the positive test above depends on the REAL primitive running. + handler._synthesize_sentence = _broken # type: ignore[assignment] + + result = loop.run("Hi", [], {"call_id": "call-llm-err"}) + # Must not raise — a TTS/primitive outage on top of an LLM outage + # degrades to today's silence, not a handler crash. 
+ await handler._process_streaming_response(result, "call-llm-err") + + assert tts.synthesized == [] diff --git a/libraries/typescript/package-lock.json b/libraries/typescript/package-lock.json index b0500fe2..48a562ae 100644 --- a/libraries/typescript/package-lock.json +++ b/libraries/typescript/package-lock.json @@ -1,12 +1,12 @@ { "name": "getpatter", - "version": "0.6.3", + "version": "0.6.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "getpatter", - "version": "0.6.3", + "version": "0.6.4", "license": "MIT", "dependencies": { "express": "^5.2.1", diff --git a/libraries/typescript/src/index.ts b/libraries/typescript/src/index.ts index 88a2f628..1b993338 100644 --- a/libraries/typescript/src/index.ts +++ b/libraries/typescript/src/index.ts @@ -188,6 +188,14 @@ export { LLM as CerebrasLLM } from "./llm/cerebras"; export type { CerebrasLLMOptions } from "./llm/cerebras"; export { LLM as GoogleLLM } from "./llm/google"; export type { GoogleLLMOptions } from "./llm/google"; +// Agent-runtime LLM providers (Patter as the voice shell in front of an +// OpenAI-compatible agent runtime / local inference gateway). +export { LLM as OpenAICompatibleLLM, OpenAICompatibleLLMProvider } from "./llm/openai-compatible"; +export type { OpenAICompatibleLLMOptions } from "./llm/openai-compatible"; +export { LLM as HermesLLM } from "./llm/hermes"; +export type { HermesLLMOptions } from "./llm/hermes"; +export { LLM as OpenClawLLM } from "./llm/openclaw"; +export type { OpenClawLLMOptions } from "./llm/openclaw"; // Voice Activity Detection (server-side) — Silero ONNX. 
export { SileroVAD } from "./providers/silero-vad"; diff --git a/libraries/typescript/src/llm-loop.ts b/libraries/typescript/src/llm-loop.ts index 4be50b9d..6005f2b4 100644 --- a/libraries/typescript/src/llm-loop.ts +++ b/libraries/typescript/src/llm-loop.ts @@ -420,6 +420,18 @@ export interface LLMChunk { */ export interface LLMStreamOptions { signal?: AbortSignal; + /** + * Stable per-call id (the same value the stream handler builds into + * ``callCtx.call_id``). Threaded through purely so session-aware providers + * — currently {@link OpenAICompatibleLLMProvider} and its Hermes / OpenClaw + * presets — can emit the OpenAI ``user`` field as ``patter-call-<callId>``, + * giving the upstream agent runtime one durable session per phone call. + * + * Additive and optional: every existing provider reads only ``signal`` and + * is unaffected. When unset (or when a provider has no session-continuity + * config) no ``user`` field is sent — fully backward compatible. + */ + callId?: string; } /** @@ -909,13 +921,25 @@ export class LLMLoop { const hasAfterLlmChunk = Boolean(hookExecutor?.hasAfterLlmChunk()); const allEmittedText: string[] = []; + // Thread the stable per-call id into the provider stream options so + // session-aware providers (OpenAI-compatible / Hermes / OpenClaw) can + // emit the ``user`` field for one runtime session per phone call. Purely + // additive: providers that read only ``signal`` ignore it. Only spread a + // string call id — leave ``opts`` untouched otherwise so existing + // behaviour is byte-identical when no call id is present. + const callId = callContext.call_id; + const streamOpts: LLMStreamOptions | undefined = + typeof callId === 'string' && callId.length > 0 + ?
{ ...opts, callId } + : opts; + for (let iter = 0; iter < maxIterations; iter++) { const toolCallsAccumulated = new Map(); const textParts: string[] = []; let hasToolCalls = false; let usageChunkReceived = false; - for await (const chunk of this.provider.stream(messages, this.openaiTools, opts)) { + for await (const chunk of this.provider.stream(messages, this.openaiTools, streamOpts)) { if (chunk.type === 'text' && chunk.content) { // Tier 1 — per-token sync transform. Cheap, no buffering. const content = hasAfterLlmChunk && hookExecutor diff --git a/libraries/typescript/src/llm/hermes.ts b/libraries/typescript/src/llm/hermes.ts new file mode 100644 index 00000000..ef055172 --- /dev/null +++ b/libraries/typescript/src/llm/hermes.ts @@ -0,0 +1,121 @@ +/** + * Hermes agent-runtime LLM preset for Patter's pipeline mode. + * + * Thin preset over {@link OpenAICompatibleLLMProvider}: defaults the base URL, + * model, env-key name, timeout, and session-continuity prefix for the Hermes + * agent runtime so a user just writes ``phone.agent({ llm: new hermes.LLM() })``. + * + * Hermes runs tools / memory / skills internally before replying, so a single + * conversation turn can take 30-90 s — hence the 120 s default timeout. Hermes + * is stateless and keys continuity off HEADERS, not the OpenAI ``user`` field: + * the preset sends ``X-Hermes-Session-Id: patter-call-<callId>`` on every turn + * for per-call session / transcript continuity (on by default), and optionally + * ``X-Hermes-Session-Key: <sessionKey>`` for long-term memory scoping when you + * pass ``sessionKey``. (It also still emits ``user=patter-call-<callId>`` for + * upstream-log correlation, but that is not what drives the session.) + */ +import { + OpenAICompatibleLLMProvider, + type OpenAICompatibleLLMOptions, +} from './openai-compatible'; + +/** Default Hermes agent-runtime base URL (loopback, operator-controlled).
*/ +const BASE_URL = 'http://127.0.0.1:8642/v1'; +/** Fallback model when neither ``model`` nor ``API_SERVER_MODEL_NAME`` is set. */ +const DEFAULT_MODEL = 'hermes-agent'; +/** Env var Hermes reads its bearer from. */ +const API_KEY_ENV = 'API_SERVER_KEY'; +/** Env var Hermes reads its model id from. */ +const MODEL_ENV = 'API_SERVER_MODEL_NAME'; +/** Per-call ``user`` prefix (upstream-log correlation; not the session driver). */ +const SESSION_USER_PREFIX = 'patter-call-'; +/** Header carrying the per-call session id (the primary continuity mechanism). */ +const SESSION_ID_HEADER = 'X-Hermes-Session-Id'; +/** Prefix for the session-id header value → ``X-Hermes-Session-Id: patter-call-``. */ +const SESSION_ID_PREFIX = 'patter-call-'; +/** Static header scoping long-term memory (sent only when ``sessionKey`` is set). */ +const SESSION_KEY_HEADER = 'X-Hermes-Session-Key'; +/** Default timeout (seconds): runtimes run tools before replying. */ +const DEFAULT_TIMEOUT_S = 120; + +/** Constructor options for the Hermes ``LLM`` preset. */ +export interface HermesLLMOptions { + /** Bearer token. Falls back to ``API_SERVER_KEY`` env var when omitted. */ + apiKey?: string; + /** Override the Hermes base URL (rarely needed). */ + baseUrl?: string; + /** Model id. Falls back to ``API_SERVER_MODEL_NAME`` env, then ``"hermes-agent"``. */ + model?: string; + /** Per-request timeout in seconds. Default ``120``. */ + timeout?: number; + /** + * Long-term memory scope. When set, emits ``X-Hermes-Session-Key`` so Hermes + * scopes durable memory to this value across calls. ``undefined`` (default) + * means the header is not sent. Credential-grade — never logged. + */ + sessionKey?: string; + /** Extra headers merged after the SDK ``User-Agent``. */ + extraHeaders?: Record; + /** Sampling temperature [0, 2]. */ + temperature?: number; + /** Max tokens in the assistant response (sent as ``max_completion_tokens``). 
*/ + maxTokens?: number; + /** OpenAI-style ``response_format`` for JSON mode / structured outputs. */ + responseFormat?: Record; + /** Whether to allow parallel tool calls. */ + parallelToolCalls?: boolean; + /** ``"auto" | "none" | "required"`` or a specific tool object. */ + toolChoice?: string | Record; + /** Sampling seed for reproducible outputs. */ + seed?: number; + /** Nucleus sampling cutoff in [0, 1]. */ + topP?: number; + /** Penalty in [-2, 2] applied to repeated tokens. */ + frequencyPenalty?: number; + /** Penalty in [-2, 2] applied to seen tokens. */ + presencePenalty?: number; + /** Stop sequence(s). */ + stop?: string | string[]; +} + +/** + * Hermes agent-runtime LLM provider (OpenAI-compatible, streaming). + * + * @example + * ```ts + * import * as hermes from "getpatter/llm/hermes"; + * const llm = new hermes.LLM(); // env-defaulted, keyless OK + * const llm = new hermes.LLM({ apiKey: "...", model: "hermes-7b" }); + * ``` + */ +export class LLM extends OpenAICompatibleLLMProvider { + static readonly providerKey = 'hermes'; + + constructor(opts: HermesLLMOptions = {}) { + const model = opts.model ?? process.env[MODEL_ENV] ?? DEFAULT_MODEL; + const options: OpenAICompatibleLLMOptions = { + apiKey: opts.apiKey, + apiKeyEnv: API_KEY_ENV, + baseUrl: opts.baseUrl ?? BASE_URL, + model, + timeout: opts.timeout ?? 
DEFAULT_TIMEOUT_S, + sessionUserPrefix: SESSION_USER_PREFIX, + sessionIdHeader: SESSION_ID_HEADER, + sessionIdPrefix: SESSION_ID_PREFIX, + sessionKeyHeader: SESSION_KEY_HEADER, + sessionKey: opts.sessionKey, + extraHeaders: opts.extraHeaders, + temperature: opts.temperature, + maxTokens: opts.maxTokens, + responseFormat: opts.responseFormat, + parallelToolCalls: opts.parallelToolCalls, + toolChoice: opts.toolChoice, + seed: opts.seed, + topP: opts.topP, + frequencyPenalty: opts.frequencyPenalty, + presencePenalty: opts.presencePenalty, + stop: opts.stop, + }; + super(options); + } +} diff --git a/libraries/typescript/src/llm/openai-compatible.ts b/libraries/typescript/src/llm/openai-compatible.ts new file mode 100644 index 00000000..2dd7f788 --- /dev/null +++ b/libraries/typescript/src/llm/openai-compatible.ts @@ -0,0 +1,353 @@ +/** + * Generic OpenAI-compatible LLM provider for Patter's pipeline mode. + * + * Drives *any* OpenAI-compatible ``/chat/completions`` endpoint — an agent + * runtime (Hermes, OpenClaw) or a local inference gateway (Ollama, vLLM, + * LM Studio). Patter owns the carrier + STT + turn-taking + TTS; this + * provider turns each conversation turn into a single + * ``POST {baseUrl}/chat/completions`` request and speaks the response. + * + * PARITY NOTE (internal divergence, allowed by ``sdk-parity.md``): on the + * Python side this provider subclasses ``OpenAILLMProvider`` and merely swaps + * the ``AsyncOpenAI`` client (passing ``timeout=`` / ``base_url=``). The TS + * base ``OpenAILLMProvider`` is a raw-``fetch`` class with a HARDCODED 30 s + * timeout and ``baseUrl`` exposed as a ``protected get`` rather than a + * constructor field, so the "swap the client" trick is impossible here. + * Instead this is a STANDALONE ``implements LLMProvider`` class (same shape as + * {@link GroqLLMProvider} / {@link CerebrasLLMProvider}) that owns its own + * configurable timeout and reuses {@link parseOpenAISseStream}. 
Observably + * identical to Python (same 60 s / 120 s ceilings, same ``user`` field, same + * headers); only the timeout *mechanism* differs. + * + * Two additions over the base OpenAI provider: + * + * - **Long timeout.** Agent runtimes execute tools / memory / skills before + * replying, so a turn can take 30-90 s. The default is 60 s here (the + * presets raise it to 120 s), REPLACING the base provider's hardcoded 30 s. + * - **Session continuity.** Three independent, opt-in signals — each gated on + * its own config, none coupled to another: + * - ``sessionUserPrefix`` → emits the OpenAI ``user`` field as + * ``` `${sessionUserPrefix}${callId}` ```. Used by runtimes that derive + * a session from ``user`` (e.g. OpenClaw's gateway). + * - ``sessionIdHeader`` (+ optional ``sessionIdPrefix``) → emits a per-call + * header carrying ``` `${sessionIdPrefix}${callId}` ``` for per-call + * session / transcript continuity on stateless runtimes that key off + * headers (e.g. Hermes' ``X-Hermes-Session-Id``). + * - ``sessionKeyHeader`` (+ ``sessionKey``) → emits a STATIC header for + * long-term memory scoping (e.g. Hermes' ``X-Hermes-Session-Key``); the + * value is the raw ``sessionKey``, never interpolated with the call id. + * All three are OFF by default — fully backward compatible. ``sessionKey`` is + * a credential-grade memory scope and is NEVER logged. + * + * Keyless gateways (Ollama / vLLM / LM Studio accept no key) are supported: + * the ``Authorization`` header is simply omitted from the request (sending a + * ``Bearer EMPTY`` placeholder breaks some gateways). + */ + +import type { LLMChunk, LLMProvider, LLMStreamOptions } from '../llm-loop'; +import { mergeAbortSignals } from '../llm-loop'; +import { parseOpenAISseStream } from '../providers/groq-llm'; +import { PatterConnectionError } from '../errors'; +import { getLogger } from '../logger'; +import { VERSION } from '../version'; + +/** Default per-request timeout in seconds for the generic provider. 
*/ +const DEFAULT_TIMEOUT_S = 60; + +/** Constructor options for {@link OpenAICompatibleLLMProvider}. */ +export interface OpenAICompatibleLLMOptions { + /** + * Bearer token. If omitted and ``apiKeyEnv`` is given, read from that + * environment variable. May resolve to undefined for keyless local + * gateways — the ``Authorization`` header is then omitted entirely. + */ + apiKey?: string; + /** + * Environment variable to read the bearer from when ``apiKey`` is not given + * (e.g. ``"OPENCLAW_API_KEY"``). + */ + apiKeyEnv?: string; + /** + * OpenAI-compatible base URL ending in ``/v1`` — the whole point of this + * provider, so it is **required**. Operator-controlled config, never derived + * from caller / transcript input. + */ + baseUrl: string; + /** Model / agent target — **required**. */ + model: string; + /** + * Per-request timeout in **seconds**. Default ``60`` (the base OpenAI + * provider hardcodes 30 s — raised here because agent runtimes run tools + * before replying). Converted to ``AbortSignal.timeout(timeout * 1000)``. + */ + timeout?: number; + /** + * Extra headers merged into the request *after* the ``User-Agent`` so the + * SDK attribution is not silently clobbered (a caller can still override + * ``User-Agent`` explicitly). + */ + extraHeaders?: Record; + /** + * When set, emits the OpenAI ``user`` field as + * ``` `${sessionUserPrefix}${callId}` ``` for per-call session continuity. + * ``undefined`` (default) means no ``user`` field is sent. Independent of the + * session headers below. + */ + sessionUserPrefix?: string; + /** + * Optional header NAME carrying a per-call session id, e.g. + * ``"X-Hermes-Session-Id"`` or ``"x-openclaw-session-key"``. When set AND a + * ``callId`` is available, the header VALUE is + * ``` `${sessionIdPrefix}${callId}` ```. ``undefined`` (default) means off. + */ + sessionIdHeader?: string; + /** + * Prefix for the session-id header VALUE. Defaults to ``""`` (raw call id). 
+ * Only meaningful when ``sessionIdHeader`` is set. + */ + sessionIdPrefix?: string; + /** + * Optional STATIC header NAME for long-term memory scoping, e.g. + * ``"X-Hermes-Session-Key"``. Emitted with the raw ``sessionKey`` value (no + * call-id interpolation) only when BOTH ``sessionKeyHeader`` and + * ``sessionKey`` are set. ``undefined`` (default) means off. + */ + sessionKeyHeader?: string; + /** + * Static value emitted in ``sessionKeyHeader``. Credential-grade memory + * scope — NEVER logged. ``undefined`` (default) means the header is omitted. + */ + sessionKey?: string; + /** Sampling temperature [0, 2]. */ + temperature?: number; + /** Max tokens in the assistant response (sent as ``max_completion_tokens``). */ + maxTokens?: number; + /** OpenAI-style ``response_format`` for JSON mode / structured outputs. */ + responseFormat?: Record; + /** Whether to allow parallel tool calls. */ + parallelToolCalls?: boolean; + /** ``"auto" | "none" | "required"`` or a specific tool object. */ + toolChoice?: string | Record; + /** Sampling seed for reproducible outputs. */ + seed?: number; + /** Nucleus sampling cutoff in [0, 1]. */ + topP?: number; + /** Penalty in [-2, 2] applied to repeated tokens. */ + frequencyPenalty?: number; + /** Penalty in [-2, 2] applied to seen tokens. */ + presencePenalty?: number; + /** Stop sequence(s). */ + stop?: string | string[]; +} + +/** + * LLM provider for any OpenAI-compatible ``/chat/completions`` endpoint. + * + * Streams in the same ``{ type: "text" | "tool_call" | "usage" }`` chunk + * format as the base OpenAI provider via the shared {@link parseOpenAISseStream}. + */ +export class OpenAICompatibleLLMProvider implements LLMProvider { + /** + * Stable pricing/dashboard key — read by stream-handler/metrics. Typed as + * ``string`` (not the narrowed literal) so the Hermes / OpenClaw presets can + * override it with their own key while still extending this class. 
+ */ + static readonly providerKey: string = 'openai_compatible'; + + /** Resolved bearer; undefined for keyless gateways. */ + private readonly apiKey?: string; + readonly model: string; + private readonly baseUrl: string; + private readonly timeoutMs: number; + private readonly extraHeaders?: Record; + private readonly sessionUserPrefix?: string; + private readonly sessionIdHeader?: string; + private readonly sessionIdPrefix?: string; + private readonly sessionKeyHeader?: string; + private readonly sessionKey?: string; + private readonly temperature?: number; + private readonly maxTokens?: number; + private readonly responseFormat?: Record; + private readonly parallelToolCalls?: boolean; + private readonly toolChoice?: string | Record; + private readonly seed?: number; + private readonly topP?: number; + private readonly frequencyPenalty?: number; + private readonly presencePenalty?: number; + private readonly stop?: string | string[]; + + constructor(options: OpenAICompatibleLLMOptions) { + if (!options.baseUrl) { + throw new Error( + 'OpenAICompatibleLLMProvider requires a baseUrl (e.g. "http://127.0.0.1:11434/v1").', + ); + } + if (!options.model) { + throw new Error('OpenAICompatibleLLMProvider requires a model.'); + } + // Resolve the bearer: explicit apiKey wins, then apiKeyEnv, else undefined + // (keyless local gateway). Never logged. + this.apiKey = + options.apiKey ?? + (options.apiKeyEnv ? process.env[options.apiKeyEnv] : undefined); + this.model = options.model; + this.baseUrl = options.baseUrl; + this.timeoutMs = (options.timeout ?? 
DEFAULT_TIMEOUT_S) * 1000; + this.extraHeaders = options.extraHeaders; + this.sessionUserPrefix = options.sessionUserPrefix; + this.sessionIdHeader = options.sessionIdHeader; + this.sessionIdPrefix = options.sessionIdPrefix; + this.sessionKeyHeader = options.sessionKeyHeader; + this.sessionKey = options.sessionKey; + this.temperature = options.temperature; + this.maxTokens = options.maxTokens; + this.responseFormat = options.responseFormat; + this.parallelToolCalls = options.parallelToolCalls; + this.toolChoice = options.toolChoice; + this.seed = options.seed; + this.topP = options.topP; + this.frequencyPenalty = options.frequencyPenalty; + this.presencePenalty = options.presencePenalty; + this.stop = options.stop; + } + + /** + * Assemble the request headers. ``User-Agent`` is set first so any + * ``extraHeaders`` (and the per-call session headers) layer on top without + * silently dropping the SDK attribution, and the ``Authorization`` header is + * only added when a key is present (keyless gateways omit it). + * + * The two session headers are emitted INDEPENDENTLY, each gated on its own + * config (decoupled from ``sessionUserPrefix`` and from each other): + * - ``sessionIdHeader`` (+ ``callId``) → ``` `${sessionIdPrefix}${callId}` ``` + * - ``sessionKeyHeader`` (+ ``sessionKey``) → the static ``sessionKey`` value. + * ``sessionKey`` is a credential-grade memory scope and is never logged. + */ + private buildHeaders(callId?: string): Record { + const headers: Record = { + 'Content-Type': 'application/json', + 'User-Agent': `getpatter/${VERSION}`, + ...(this.extraHeaders ?? {}), + }; + if (this.apiKey) { + headers.Authorization = `Bearer ${this.apiKey}`; + } + if (this.sessionIdHeader && callId) { + // Per-call session id for session / transcript continuity. + headers[this.sessionIdHeader] = `${this.sessionIdPrefix ?? 
''}${callId}`; + } + if (this.sessionKeyHeader && this.sessionKey) { + // Truthy check (not `!== undefined`): an empty-string session key is not + // a meaningful memory scope — treat it as unset rather than emitting a + // confusing empty header. Value is the raw key (never logged). + headers[this.sessionKeyHeader] = this.sessionKey; + } + return headers; + } + + /** + * Pre-call DNS / TLS warmup for the configured endpoint. Best-effort: + * 5 s timeout, all exceptions swallowed at debug level. The ``Authorization`` + * header is only sent when a key is present so the operator-grade bearer is + * never echoed for keyless gateways (and the key is never logged). + */ + async warmup(): Promise { + try { + const headers: Record = {}; + if (this.apiKey) headers.Authorization = `Bearer ${this.apiKey}`; + await fetch(`${this.baseUrl}/models`, { + method: 'GET', + headers, + signal: AbortSignal.timeout(5_000), + }); + } catch (err) { + getLogger().debug( + `OpenAI-compatible LLM warmup failed (best-effort): ${String(err)}`, + ); + } + } + + /** + * Build the request body. Mirrors the base OpenAI provider's sampling-kwarg + * assembly and additionally sets ``user`` for session continuity when + * ``sessionUserPrefix`` is set AND a ``callId`` is available — so the default + * (prefix unset) behaviour is byte-identical to the base provider. 
+ */ + private buildBody( + messages: Array>, + tools?: Array> | null, + callId?: string, + ): Record { + const body: Record = { + model: this.model, + messages, + stream: true, + stream_options: { include_usage: true }, + }; + if (this.temperature !== undefined) body.temperature = this.temperature; + if (this.maxTokens !== undefined) body.max_completion_tokens = this.maxTokens; + if (this.responseFormat !== undefined) body.response_format = this.responseFormat; + if (this.parallelToolCalls !== undefined) body.parallel_tool_calls = this.parallelToolCalls; + if (this.toolChoice !== undefined) body.tool_choice = this.toolChoice; + if (this.seed !== undefined) body.seed = this.seed; + if (this.topP !== undefined) body.top_p = this.topP; + if (this.frequencyPenalty !== undefined) body.frequency_penalty = this.frequencyPenalty; + if (this.presencePenalty !== undefined) body.presence_penalty = this.presencePenalty; + if (this.stop !== undefined) body.stop = this.stop; + if (tools) body.tools = tools; + if (this.sessionUserPrefix !== undefined && callId) { + body.user = `${this.sessionUserPrefix}${callId}`; + } + return body; + } + + /** Stream Patter-format LLM chunks from the configured chat completions API. 
*/ + async *stream( + messages: Array>, + tools?: Array> | null, + opts?: LLMStreamOptions, + ): AsyncGenerator { + const callId = opts?.callId; + const body = this.buildBody(messages, tools, callId); + + const response = await fetch(`${this.baseUrl}/chat/completions`, { + method: 'POST', + headers: this.buildHeaders(callId), + body: JSON.stringify(body), + signal: mergeAbortSignals(opts?.signal, AbortSignal.timeout(this.timeoutMs)), + }); + + if (!response.ok) { + const errText = await response.text(); + getLogger().error( + `OpenAI-compatible API error: ${response.status} ${errText}`, + ); + // Mirror the base OpenAILLMProvider.stream() — throw so LLMLoop can + // surface the failure instead of silently producing an empty turn (the + // agent would otherwise go silent with no error reaching the dashboard). + throw new PatterConnectionError( + `LLM API returned ${response.status}: ${errText.slice(0, 200)}`, + ); + } + + yield* parseOpenAISseStream(response); + } +} + +/** + * Public alias of {@link OpenAICompatibleLLMProvider} for the + * ``getpatter/llm/openai-compatible`` namespace. + * + * @example + * ```ts + * import * as openaiCompatible from "getpatter/llm/openai-compatible"; + * // Ollama / vLLM / LM Studio (keyless local gateway): + * const llm = new openaiCompatible.LLM({ + * baseUrl: "http://127.0.0.1:11434/v1", + * model: "llama3.1", + * }); + * ``` + */ +export class LLM extends OpenAICompatibleLLMProvider { + static readonly providerKey = 'openai_compatible'; +} diff --git a/libraries/typescript/src/llm/openclaw.ts b/libraries/typescript/src/llm/openclaw.ts new file mode 100644 index 00000000..005ed40f --- /dev/null +++ b/libraries/typescript/src/llm/openclaw.ts @@ -0,0 +1,128 @@ +/** + * OpenClaw agent-runtime LLM preset for Patter's pipeline mode. 
+ * + * Thin preset over {@link OpenAICompatibleLLMProvider}, aligned with the + * shipped ``openclawConsult`` builder in ``src/consult.ts``: same loopback + * base URL (``:18789/v1``), same ``OPENCLAW_API_KEY`` env var, same + * ``model="openclaw/<agent>"`` pass-through convention, same agent-id charset + * rule, and the same ``x-openclaw-session-key`` session header. Takes an + * ``agent`` id (not a raw model string), exactly like ``openclawConsult``. + * + * OpenClaw runs tools / memory / skills internally before replying, so a turn + * can take 30-90 s — hence the 120 s default timeout (unlike the consult + * preset's phone-safe 30 s filler default; here the runtime IS the per-turn + * brain, not an on-demand escalation). It keys sessions off BOTH the OpenAI + * ``user`` field and the ``x-openclaw-session-key`` header, so the preset + * enables both for one runtime session per phone call. + */ +import { + OpenAICompatibleLLMProvider, + type OpenAICompatibleLLMOptions, +} from './openai-compatible'; + +/** Default OpenClaw base URL (loopback). Byte-identical to the consult preset. */ +const BASE_URL = 'http://127.0.0.1:18789/v1'; +/** Env var OpenClaw reads its operator-grade bearer from. */ +const API_KEY_ENV = 'OPENCLAW_API_KEY'; +/** Header OpenClaw keys sessions off (secondary to the ``user`` field). */ +const SESSION_HEADER = 'x-openclaw-session-key'; +/** Per-call session prefix → one OpenClaw session per phone call. */ +const SESSION_USER_PREFIX = 'patter-call-'; +/** Default timeout (seconds): runtimes run tools before replying. */ +const DEFAULT_TIMEOUT_S = 120; +/** + * Agent ids cross into the gateway via the model string — restrict to a safe + * set. Byte-identical to ``OPENCLAW_AGENT_RE`` in ``src/consult.ts`` so an + * agent valid for consult is valid here and vice-versa. + */ +const OPENCLAW_AGENT_RE = /^[A-Za-z0-9._:/-]+$/; + +/** Constructor options for the OpenClaw ``LLM`` preset.
*/ +export interface OpenClawLLMOptions { + /** + * OpenClaw agent id (e.g. ``"receptionist"``). Mapped to + * ``model="openclaw/<agent>"``; an already-namespaced id (``"openclaw/x"``, + * ``"agent:x"``) is passed through unchanged. **Required.** + */ + agent: string; + /** Override the OpenClaw base URL (rarely needed). */ + baseUrl?: string; + /** Bearer token. Falls back to ``OPENCLAW_API_KEY`` env var when omitted. */ + apiKey?: string; + /** Per-request timeout in seconds. Default ``120``. */ + timeout?: number; + /** Extra headers merged after the SDK ``User-Agent``. */ + extraHeaders?: Record; + /** Sampling temperature [0, 2]. */ + temperature?: number; + /** Max tokens in the assistant response (sent as ``max_completion_tokens``). */ + maxTokens?: number; + /** OpenAI-style ``response_format`` for JSON mode / structured outputs. */ + responseFormat?: Record; + /** Whether to allow parallel tool calls. */ + parallelToolCalls?: boolean; + /** ``"auto" | "none" | "required"`` or a specific tool object. */ + toolChoice?: string | Record; + /** Sampling seed for reproducible outputs. */ + seed?: number; + /** Nucleus sampling cutoff in [0, 1]. */ + topP?: number; + /** Penalty in [-2, 2] applied to repeated tokens. */ + frequencyPenalty?: number; + /** Penalty in [-2, 2] applied to seen tokens. */ + presencePenalty?: number; + /** Stop sequence(s). */ + stop?: string | string[]; +} + +/** + * OpenClaw agent-runtime LLM provider (OpenAI-compatible, streaming). + * + * @example + * ```ts + * import * as openclaw from "getpatter/llm/openclaw"; + * const llm = new openclaw.LLM({ agent: "receptionist" }); // reads OPENCLAW_API_KEY + * ``` + */ +export class LLM extends OpenAICompatibleLLMProvider { + static readonly providerKey = 'openclaw'; + + constructor(opts: OpenClawLLMOptions) { + const agent = opts?.agent; + if (!agent || !OPENCLAW_AGENT_RE.test(agent)) { + throw new Error( + `Invalid OpenClaw agent id: ${JSON.stringify(agent)}.
` + + 'Allowed characters: letters, digits, dot, underscore, colon, slash, dash.', + ); + } + // Already-namespaced ids (``openclaw/x``, ``agent:x``) pass through; a + // bare id is namespaced to ``openclaw/<agent>``. Identical rule to + // ``openclawConsult`` in src/consult.ts. + const model = agent.includes('/') || agent.includes(':') ? agent : `openclaw/${agent}`; + const options: OpenAICompatibleLLMOptions = { + apiKey: opts.apiKey, + apiKeyEnv: API_KEY_ENV, + baseUrl: opts.baseUrl ?? BASE_URL, + model, + timeout: opts.timeout ?? DEFAULT_TIMEOUT_S, + sessionUserPrefix: SESSION_USER_PREFIX, + // Wire-identical to the prior behaviour: header value is the raw call id + // (empty prefix), and OpenClaw's gateway also derives the session from + // the ``user`` field above. No separate memory-scope header. + sessionIdHeader: SESSION_HEADER, + sessionIdPrefix: '', + extraHeaders: opts.extraHeaders, + temperature: opts.temperature, + maxTokens: opts.maxTokens, + responseFormat: opts.responseFormat, + parallelToolCalls: opts.parallelToolCalls, + toolChoice: opts.toolChoice, + seed: opts.seed, + topP: opts.topP, + frequencyPenalty: opts.frequencyPenalty, + presencePenalty: opts.presencePenalty, + stop: opts.stop, + }; + super(options); + } +} diff --git a/libraries/typescript/src/stream-handler.ts b/libraries/typescript/src/stream-handler.ts index b78117b4..38a11983 100644 --- a/libraries/typescript/src/stream-handler.ts +++ b/libraries/typescript/src/stream-handler.ts @@ -2776,6 +2776,27 @@ export class StreamHandler { // Fix 8: record turn as interrupted so it does not leak in metrics when // the LLM throws without emitting any text. this.metricsAcc.recordTurnInterrupted(); + // Opt-in spoken fallback: speak the configured line iff no audio was + // emitted this turn (``!ttsFirstByteSent.value`` — no PCM chunk has + // reached the carrier, i.e. the caller heard SILENCE) and the agent + // still owns the floor (``this.isSpeaking``).
Gated on emitted audio + // rather than received tokens, so a provider that streams partial + // tokens ('Let me check…') and then times out before a sentence + // boundary (the chunker buffered them, TTS never ran) still triggers + // the fallback. Reuses the normal per-sentence TTS primitive so the + // fallback is a regular turn utterance (barge-in honoured per chunk; + // closed by the ``finally`` ``endSpeakingWithGrace``). A non-empty + // string is required — unset / empty preserves today's + // silence-on-error behaviour. Self-synthesis failure must degrade to + // that silence, never crash the turn. + const fallback = this.deps.agent.llmErrorMessage; + if (fallback && !ttsFirstByteSent.value && this.isSpeaking) { + try { + await this.synthesizeSentence(fallback, hookExecutor, hookCtx, ttsFirstByteSent); + } catch (err) { + getLogger().error(`llmErrorMessage fallback synthesis failed (${label}):`, err); + } + } } } diff --git a/libraries/typescript/src/types.ts b/libraries/typescript/src/types.ts index ac3ec0f1..5a9bb261 100644 --- a/libraries/typescript/src/types.ts +++ b/libraries/typescript/src/types.ts @@ -512,6 +512,18 @@ export interface AgentOptions { */ readonly language?: string; readonly firstMessage?: string; + /** + * Opt-in spoken fallback for pipeline mode when the per-turn LLM stream + * throws (gateway-down / 120 s timeout) BEFORE any assistant text was + * spoken. Agent-runtime providers (Hermes / OpenClaw) run tools+memory + * internally so a turn can take 30-90 s; on failure the caller currently + * hears SILENCE then a silent turn-end. When set to a non-empty string, + * the SDK synthesizes and speaks this line through the normal TTS turn + * lifecycle (subject to barge-in). ``undefined`` (default) preserves + * today's behaviour: nothing is spoken on LLM error. Pipeline mode only. + * Mirrors Python ``llm_error_message`` on ``Patter.agent()`` / ``Agent``. 
+ */ + readonly llmErrorMessage?: string; /** Tool definitions — ``Tool`` class instances from ``getpatter``. */ readonly tools?: ReadonlyArray; /** diff --git a/libraries/typescript/tests/llm-error-fallback.mocked.test.ts b/libraries/typescript/tests/llm-error-fallback.mocked.test.ts new file mode 100644 index 00000000..80c7c45d --- /dev/null +++ b/libraries/typescript/tests/llm-error-fallback.mocked.test.ts @@ -0,0 +1,488 @@ +/** + * [mocked] Pipeline-mode opt-in spoken fallback on an LLM stream error. + * + * Exercises the REAL pipeline turn path: + * STT final → processTranscript → runPipelineLlm → real LLMLoop.run → + * provider.stream() THROWS (gateway-down) → the EXISTING catch(e) + * non-abort branch → opt-in ``agent.llmErrorMessage`` fallback spoken via + * the same per-sentence TTS primitive (synthesizeSentence) every normal + * sentence uses. + * + * AUTHENTIC: the StreamHandler, the CallMetricsAccumulator, the real + * ``LLMLoop`` (constructed inside ``initPipeline`` from ``agent.llm``), the + * sentence chunker, and the TTS-send path are REAL. The ONLY mocked surfaces + * are the two external boundaries: + * 1. The LLM provider's ``stream()`` — the paid HTTP gateway — stubbed to + * throw ``PatterConnectionError`` (or yield text then throw / abort). + * 2. The TTS byte stream (ElevenLabsTTS ``synthesizeStream``) — replaced + * with a couple of PCM Buffers so audio-out is observable. + * Everything inward (LLMLoop.run, the catch branch, the fallback gate, the + * synthesizeSentence primitive, sendAudio) runs unmodified. + * + * The authenticity invariant: if ``synthesizeSentence`` is stubbed to throw, + * the positive test's audio-out assertion fails — proving the test drives the + * real speak primitive, not a mock (see the dedicated test at the bottom). 
+ */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { StreamHandler } from '../src/stream-handler'; +import type { TelephonyBridge, StreamHandlerDeps } from '../src/stream-handler'; +import { MetricsStore } from '../src/dashboard/store'; +import { RemoteMessageHandler } from '../src/remote-message'; +import type { AgentOptions } from '../src/types'; +import type { LLMProvider, LLMChunk, LLMStreamOptions } from '../src/llm-loop'; +import { PatterConnectionError } from '../src/errors'; +import type { WebSocket as WSWebSocket } from 'ws'; + +const FALLBACK = 'Sorry, I am having trouble right now.'; + +// --------------------------------------------------------------------------- +// Module-level TTS mock — the external byte boundary. Each `new ElevenLabsTTS` +// returns a controllable instance; `synthesizeStream` yields a couple of PCM +// Buffers and records the exact text it was asked to speak. +// --------------------------------------------------------------------------- +vi.mock('../src/providers/elevenlabs-tts', async (importOriginal) => { + const original = + await importOriginal(); + return { + ...original, + ElevenLabsTTS: vi.fn().mockImplementation(() => ({ + synthesizeStream: vi.fn(async function* () { + yield Buffer.from('tts-audio'); + }), + })), + }; +}); + +// Silence dashboard persistence side-effects. 
+vi.mock('../src/dashboard/persistence', () => ({ + notifyDashboard: vi.fn(), +})); + +import { ElevenLabsTTS } from '../src/providers/elevenlabs-tts'; + +// --------------------------------------------------------------------------- +// Harness helpers (mirrors tests/integration/pipeline-e2e.test.ts) +// --------------------------------------------------------------------------- + +function makeMockWs(): WSWebSocket { + return { + send: vi.fn(), + close: vi.fn(), + on: vi.fn(), + once: vi.fn(), + readyState: 1, + removeListener: vi.fn(), + addEventListener: vi.fn(), + removeEventListener: vi.fn(), + } as unknown as WSWebSocket; +} + +/** STT mock that lets tests push a final transcript manually. */ +function makeMockStt() { + let transcriptCb: + | ((t: { isFinal?: boolean; text?: string }) => Promise) + | undefined; + return { + connect: vi.fn().mockResolvedValue(undefined), + close: vi.fn(), + sendAudio: vi.fn(), + onTranscript: vi.fn( + (cb: (t: { isFinal?: boolean; text?: string }) => Promise) => { + transcriptCb = cb; + }, + ), + get requestId() { + return 'stt-fallback-req'; + }, + emitTranscript(text: string): Promise | undefined { + return transcriptCb?.({ isFinal: true, text }); + }, + }; +} + +function makeTwilioBridge( + mockStt: ReturnType, +): TelephonyBridge { + return { + label: 'Twilio', + telephonyProvider: 'twilio', + sendAudio: vi.fn(), + sendMark: vi.fn(), + sendClear: vi.fn(), + transferCall: vi.fn().mockResolvedValue(undefined), + endCall: vi.fn().mockResolvedValue(undefined), + createStt: vi.fn().mockReturnValue(mockStt), + queryTelephonyCost: vi.fn().mockResolvedValue(undefined), + } as unknown as TelephonyBridge; +} + +/** + * A real ``LLMProvider`` whose ``stream()`` is the only mocked surface — the + * external gateway boundary. ``mode`` selects the failure shape: + * - 'throw' → throws PatterConnectionError before any text (the + * gateway-down / timeout case the fallback targets). 
+ * - 'partial-throw' → yields one PARTIAL token (no sentence boundary), THEN + * throws. The chunker buffers it, TTS never runs, so the + * caller heard SILENCE — the fallback MUST still fire. + * - 'sentence-throw' → yields a COMPLETE sentence (real audio emitted), THEN + * throws. The fallback MUST be suppressed (no double-speak). + * - 'abort' → aborts the per-turn signal then throws AbortError + * (clean barge-in cancellation — must NOT speak). + */ +function makeThrowingProvider( + mode: 'throw' | 'partial-throw' | 'sentence-throw' | 'abort', +): LLMProvider { + return { + model: 'agent-runtime-1', + async *stream( + _messages: Array>, + _tools?: Array> | null, + opts?: LLMStreamOptions, + ): AsyncGenerator { + if (mode === 'partial-throw') { + yield { type: 'text', content: 'Let me check that ' }; + throw new PatterConnectionError('LLM API returned 503: gateway down'); + } + if (mode === 'sentence-throw') { + yield { type: 'text', content: 'Hello there. ' }; + throw new PatterConnectionError('LLM API returned 503: gateway down'); + } + if (mode === 'abort') { + // Simulate a barge-in: the per-turn signal trips, then the upstream + // fetch rejects with an AbortError — the catch branch must treat this + // as a clean cancellation and stay silent. + const ac = opts?.signal as AbortSignal | undefined; + const err = new Error('The operation was aborted'); + err.name = 'AbortError'; + // Make the signal observably aborted for the isAbort check. + if (ac && !ac.aborted) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + Object.defineProperty(ac, 'aborted', { value: true, configurable: true }); + } + throw err; + } + // 'throw' — no text emitted before the gateway failure. 
+ throw new PatterConnectionError('LLM API returned 503: gateway down'); + // eslint-disable-next-line no-unreachable + yield { type: 'text', content: '' }; + }, + } as unknown as LLMProvider; +} + +/** + * Install a custom ElevenLabsTTS synthesizeStream and return the list of texts + * it was asked to speak. Must run BEFORE handleCallStart so the factory is set + * when the StreamHandler constructs the TTS instance. + */ +function setupTtsMock( + impl: (text: string) => AsyncGenerator = async function* () { + yield Buffer.from('pcm-chunk-1'); + yield Buffer.from('pcm-chunk-2'); + }, +): { calls: string[] } { + const calls: string[] = []; + const MockTTS = ElevenLabsTTS as unknown as ReturnType; + MockTTS.mockImplementation(() => ({ + synthesizeStream: vi.fn(async function* (text: string) { + calls.push(text); + yield* impl(text); + }), + })); + return { calls }; +} + +function makeDeps( + bridge: TelephonyBridge, + agentOverrides: Partial, +): StreamHandlerDeps { + // A TTS adapter instance via the mocked ElevenLabsTTS factory so the + // pipeline's synthesizeStream path is exercised. + const mockTts = new (ElevenLabsTTS as unknown as new ( + key: string, + voice?: string, + ) => { synthesizeStream: (t: string) => AsyncIterable })( + 'el-key', + 'rachel', + ); + const agent: AgentOptions = { + systemPrompt: 'You are a test pipeline agent.', + provider: 'pipeline', + // tts/llm cast through unknown — the StreamHandler reads these adapter + // instances structurally (synthesizeStream / stream). 
+ tts: mockTts as unknown as AgentOptions['tts'], + ...agentOverrides, + } as AgentOptions; + return { + config: {}, + agent, + bridge, + metricsStore: new MetricsStore(), + pricing: null, + remoteHandler: new RemoteMessageHandler(), + recording: false, + buildAIAdapter: vi.fn(), + sanitizeVariables: vi.fn((raw: Record) => { + const safe: Record = {}; + for (const [k, v] of Object.entries(raw)) safe[k] = String(v); + return safe; + }), + resolveVariables: vi.fn((tpl: string) => tpl), + } as unknown as StreamHandlerDeps; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('[mocked] pipeline LLM-error spoken fallback (llmErrorMessage)', () => { + beforeEach(() => { + // Generic fetch stub for any incidental network the handler might touch + // (built-in tool dispatch etc.). The LLM provider is injected directly so + // this does NOT serve the LLM turn. + vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + status: 200, + json: async () => ({}), + text: async () => '', + } as Response); + const MockTTS = ElevenLabsTTS as unknown as ReturnType; + MockTTS.mockClear(); + MockTTS.mockImplementation(() => ({ + synthesizeStream: vi.fn(async function* () { + yield Buffer.from('tts-audio'); + }), + })); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('speaks the configured line when the LLM stream throws with zero text emitted', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + const { calls: ttsCalls } = setupTtsMock(); + + const deps = makeDeps(bridge, { + llm: makeThrowingProvider('throw') as unknown as AgentOptions['llm'], + llmErrorMessage: FALLBACK, + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + await handler.handleCallStart('CA-fallback-spoken'); + await stt.emitTranscript('Can you book me an appointment?'); + 
+ // The fallback line is synthesized through the real TTS primitive and the + // resulting PCM is pushed to the carrier (observable audio-out). + await vi.waitFor( + () => expect(ttsCalls).toContain(FALLBACK), + { timeout: 5000 }, + ); + expect(ttsCalls).toEqual([FALLBACK]); + expect( + (bridge.sendAudio as ReturnType).mock.calls.length, + ).toBeGreaterThanOrEqual(1); + }, 10000); + + it('speaks NOTHING when llmErrorMessage is unset (default = today silence-on-error)', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + const { calls: ttsCalls } = setupTtsMock(); + + // No llmErrorMessage → undefined → no fallback. + const deps = makeDeps(bridge, { + llm: makeThrowingProvider('throw') as unknown as AgentOptions['llm'], + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + await handler.handleCallStart('CA-fallback-unset'); + await stt.emitTranscript('Can you book me an appointment?'); + + // Let the error path settle (catch branch logs + records interrupted turn). + await new Promise((r) => setTimeout(r, 300)); + + expect(ttsCalls).toHaveLength(0); + expect(bridge.sendAudio as ReturnType).not.toHaveBeenCalled(); + // The real interrupted-turn accounting still ran (turn does not leak as + // a completed turn). endCall() exposes the recorded turns. 
+ const acc = ( + handler as unknown as { + metricsAcc: import('../src/metrics').CallMetricsAccumulator; + } + ).metricsAcc; + const metrics = acc.endCall(); + expect(metrics.turns.every((t) => t.text !== FALLBACK)).toBe(true); + }, 10000); + + it('speaks the empty-string fallback as NOTHING (falsy guard preserves silence)', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + const { calls: ttsCalls } = setupTtsMock(); + + const deps = makeDeps(bridge, { + llm: makeThrowingProvider('throw') as unknown as AgentOptions['llm'], + llmErrorMessage: '', // empty string is treated as unset + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + await handler.handleCallStart('CA-fallback-empty'); + await stt.emitTranscript('Hello there?'); + + await new Promise((r) => setTimeout(r, 300)); + + expect(ttsCalls).toHaveLength(0); + expect(bridge.sendAudio as ReturnType).not.toHaveBeenCalled(); + }, 10000); + + it('speaks the fallback when partial tokens were buffered but NO audio was emitted', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + const { calls: ttsCalls } = setupTtsMock(); + + // Provider yields a PARTIAL token (no sentence boundary) THEN throws. The + // chunker buffers it without flushing a sentence, so TTS never ran and the + // caller heard SILENCE. Gating on emitted audio (not received tokens), the + // fallback MUST fire — this is the agent-runtime gateway-timeout regression. 
+ const deps = makeDeps(bridge, { + llm: makeThrowingProvider( + 'partial-throw', + ) as unknown as AgentOptions['llm'], + llmErrorMessage: FALLBACK, + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + await handler.handleCallStart('CA-fallback-partial'); + await stt.emitTranscript('What is the weather?'); + + await vi.waitFor(() => expect(ttsCalls).toContain(FALLBACK), { + timeout: 5000, + }); + // The buffered partial token never reached TTS, so the ONLY thing spoken is + // the fallback line, and real PCM was pushed to the carrier. + expect(ttsCalls).toEqual([FALLBACK]); + expect( + (bridge.sendAudio as ReturnType).mock.calls.length, + ).toBeGreaterThanOrEqual(1); + }, 10000); + + it('does NOT speak the fallback after a full sentence was already spoken (no double-speak)', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + const { calls: ttsCalls } = setupTtsMock(); + + // Provider yields a COMPLETE sentence (real TTS audio emitted) THEN throws. + // The caller already heard speech, so the fallback must be suppressed. + const deps = makeDeps(bridge, { + llm: makeThrowingProvider( + 'sentence-throw', + ) as unknown as AgentOptions['llm'], + llmErrorMessage: FALLBACK, + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + await handler.handleCallStart('CA-fallback-sentence'); + await stt.emitTranscript('What is the weather?'); + + await vi.waitFor(() => expect(ttsCalls).toContain('Hello there.'), { + timeout: 5000, + }); + await new Promise((r) => setTimeout(r, 200)); + + // The real sentence was spoken; the fallback line must NOT be appended. 
+ expect(ttsCalls).not.toContain(FALLBACK); + }, 10000); + + it('does NOT speak the fallback on a clean barge-in abort (AbortError branch stays silent)', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + const { calls: ttsCalls } = setupTtsMock(); + + const deps = makeDeps(bridge, { + llm: makeThrowingProvider('abort') as unknown as AgentOptions['llm'], + llmErrorMessage: FALLBACK, + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + await handler.handleCallStart('CA-fallback-abort'); + await stt.emitTranscript('Cancel that please.'); + + await new Promise((r) => setTimeout(r, 300)); + + // The abort is a clean cancellation — the fallback MUST NOT be spoken. + expect(ttsCalls).not.toContain(FALLBACK); + }, 10000); + + it('authenticity: stubbing synthesizeSentence to throw makes the positive path emit no audio', async () => { + const stt = makeMockStt(); + const bridge = makeTwilioBridge(stt); + setupTtsMock(); + + const deps = makeDeps(bridge, { + llm: makeThrowingProvider('throw') as unknown as AgentOptions['llm'], + llmErrorMessage: FALLBACK, + }); + const handler = new StreamHandler( + deps, + makeMockWs(), + '+15551111111', + '+15552222222', + ); + + // Replace the REAL speak primitive with a thrower. The fallback wiring + // wraps the call in try/catch, so the turn must not crash, but NO audio + // can reach the carrier — proving the positive test exercised the real + // synthesizeSentence rather than a mock. 
+ const synthSpy = vi + .spyOn( + handler as unknown as { + synthesizeSentence: (...args: unknown[]) => Promise; + }, + 'synthesizeSentence', + ) + .mockRejectedValue(new Error('synthesizeSentence disabled')); + + await handler.handleCallStart('CA-fallback-authentic'); + await stt.emitTranscript('Book me in.'); + + await new Promise((r) => setTimeout(r, 300)); + + expect(synthSpy).toHaveBeenCalledWith( + FALLBACK, + expect.anything(), + expect.anything(), + expect.anything(), + ); + expect(bridge.sendAudio as ReturnType).not.toHaveBeenCalled(); + }, 10000); +}); diff --git a/libraries/typescript/tests/llm-hermes-openclaw.mocked.test.ts b/libraries/typescript/tests/llm-hermes-openclaw.mocked.test.ts new file mode 100644 index 00000000..35325e6a --- /dev/null +++ b/libraries/typescript/tests/llm-hermes-openclaw.mocked.test.ts @@ -0,0 +1,152 @@ +/** + * Tests for the Hermes and OpenClaw LLM presets. + * + * Construction, default resolution, env-key fallback, agent-id validation, and + * agent→model namespacing are ALL real code. The only mocked surface is + * ``global.fetch`` — used to inspect the request the preset would POST (base + * URL, body.user, session header, Authorization) without touching the network. + */ + +import { describe, expect, it, vi, afterEach } from 'vitest'; +import { LLM as HermesLLM } from '../src/llm/hermes'; +import { LLM as OpenClawLLM } from '../src/llm/openclaw'; +import { openclawConsult } from '../src/consult'; + +const originalFetch = globalThis.fetch; + +afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + delete process.env.API_SERVER_MODEL_NAME; + delete process.env.API_SERVER_KEY; + delete process.env.OPENCLAW_API_KEY; +}); + +/** Capture the single fetch a preset issues, returning a 200 + empty body. 
*/ +function captureFetch(): { calls: Array<{ url: string; init: RequestInit }> } { + const calls: Array<{ url: string; init: RequestInit }> = []; + globalThis.fetch = vi.fn( + async (url: string | URL | Request, init?: RequestInit) => { + calls.push({ url: String(url), init: init ?? {} }); + return new Response('', { status: 200 }); + }, + ) as unknown as typeof fetch; + return { calls }; +} + +async function inspectRequest( + provider: { stream: (m: Array>, t?: unknown, o?: { callId?: string }) => AsyncGenerator }, + callId?: string, +): Promise<{ url: string; body: Record; headers: Record }> { + const { calls } = captureFetch(); + for await (const _ of provider.stream( + [{ role: 'user', content: 'hi' }], + null, + callId ? { callId } : undefined, + )) { + // drain + } + const init = calls[0].init; + return { + url: calls[0].url, + body: JSON.parse(init.body as string) as Record, + headers: init.headers as Record, + }; +} + +describe('[unit] HermesLLM preset', () => { + it('defaults baseUrl, model, timeout (120 s), the user prefix and X-Hermes-Session-Id header', async () => { + const timeoutSpy = vi.spyOn(AbortSignal, 'timeout'); + const llm = new HermesLLM(); + expect(llm.model).toBe('hermes-agent'); + const { url, body, headers } = await inspectRequest(llm, 'c1'); + expect(url).toBe('http://127.0.0.1:8642/v1/chat/completions'); + expect(body.model).toBe('hermes-agent'); + expect(body.user).toBe('patter-call-c1'); // upstream-log correlation, kept + // PRIMARY mechanism: per-call session id header, on by default. + expect(headers['X-Hermes-Session-Id']).toBe('patter-call-c1'); + expect(timeoutSpy).toHaveBeenCalledWith(120_000); + }); + + it('omits X-Hermes-Session-Key by default and emits it only when sessionKey is set', async () => { + // Default: no memory-scope header (opt-in). 
+ const { headers: defaultHeaders } = await inspectRequest(new HermesLLM(), 'c1'); + expect(defaultHeaders['X-Hermes-Session-Key']).toBeUndefined(); + + // Configured: the static memory-scope header is sent on the wire. + const scoped = new HermesLLM({ sessionKey: 'mem-123' }); + const { headers: scopedHeaders } = await inspectRequest(scoped, 'c1'); + expect(scopedHeaders['X-Hermes-Session-Key']).toBe('mem-123'); + // Per-call session id still flows alongside the memory scope. + expect(scopedHeaders['X-Hermes-Session-Id']).toBe('patter-call-c1'); + }); + + it('reads the model from API_SERVER_MODEL_NAME, with an explicit model still winning', () => { + process.env.API_SERVER_MODEL_NAME = 'hermes-7b'; + expect(new HermesLLM().model).toBe('hermes-7b'); + expect(new HermesLLM({ model: 'explicit-model' }).model).toBe('explicit-model'); + }); + + it('resolves the bearer from API_SERVER_KEY, and stays keyless when absent', async () => { + process.env.API_SERVER_KEY = 'hermes-secret'; + const { headers } = await inspectRequest(new HermesLLM()); + expect(headers.Authorization).toBe('Bearer hermes-secret'); + + delete process.env.API_SERVER_KEY; + const { headers: keyless } = await inspectRequest(new HermesLLM()); + expect(keyless.Authorization).toBeUndefined(); // keyless local Hermes + }); +}); + +describe('[unit] OpenClawLLM preset', () => { + it('maps a bare agent id to openclaw/', () => { + expect(new OpenClawLLM({ agent: 'receptionist' }).model).toBe('openclaw/receptionist'); + }); + + it('passes through an already-namespaced agent id unchanged', () => { + expect(new OpenClawLLM({ agent: 'openclaw/receptionist' }).model).toBe('openclaw/receptionist'); + expect(new OpenClawLLM({ agent: 'agent:receptionist' }).model).toBe('agent:receptionist'); + }); + + it('rejects an invalid agent id (charset) with a thrown error', () => { + expect(() => new OpenClawLLM({ agent: 'a b' })).toThrow(/agent id/i); + expect(() => new OpenClawLLM({ agent: '' })).toThrow(/agent id/i); + }); + 
+ it('defaults baseUrl :18789, OPENCLAW_API_KEY, the session header and 120 s timeout', async () => { + const timeoutSpy = vi.spyOn(AbortSignal, 'timeout'); + process.env.OPENCLAW_API_KEY = 'oc-operator-secret'; + const llm = new OpenClawLLM({ agent: 'receptionist' }); + const { url, body, headers } = await inspectRequest(llm, 'c2'); + expect(url).toBe('http://127.0.0.1:18789/v1/chat/completions'); + expect(headers.Authorization).toBe('Bearer oc-operator-secret'); + expect(body.user).toBe('patter-call-c2'); + expect(headers['x-openclaw-session-key']).toBe('c2'); + expect(timeoutSpy).toHaveBeenCalledWith(120_000); + }); +}); + +describe('[unit] OpenClaw LLM ↔ consult preset parity', () => { + it('shares base URL, api-key env, session header and agent→model mapping with openclawConsult', () => { + const cfg = openclawConsult('receptionist'); + const oc = cfg.openaiCompatible!; + + // Same loopback base URL. + const llm = new OpenClawLLM({ agent: 'receptionist' }); + expect(new URL(oc.baseUrl).href).toBe('http://127.0.0.1:18789/v1'); + + // Same env var name and session header (the consult preset is the + // shipped source of truth — both must stay byte-identical). + expect(oc.apiKeyEnv).toBe('OPENCLAW_API_KEY'); + expect(oc.sessionHeader).toBe('x-openclaw-session-key'); + + // Same agent→model namespacing rule. + expect(oc.model).toBe('openclaw/receptionist'); + expect(llm.model).toBe(oc.model); + + // Already-namespaced ids map identically in both code paths. 
+ expect(openclawConsult('openclaw/x').openaiCompatible!.model).toBe( + new OpenClawLLM({ agent: 'openclaw/x' }).model, + ); + }); +}); diff --git a/libraries/typescript/tests/llm-loop-call-id-threading.test.ts b/libraries/typescript/tests/llm-loop-call-id-threading.test.ts new file mode 100644 index 00000000..6c8e843b --- /dev/null +++ b/libraries/typescript/tests/llm-loop-call-id-threading.test.ts @@ -0,0 +1,125 @@ +/** + * Tests for the per-call id threading from ``LLMLoop.run`` into + * ``provider.stream``'s ``LLMStreamOptions.callId``. + * + * Uses the REAL ``LLMLoop`` with a tiny in-process recording provider (not a + * mock of the unit under test — a real ``LLMProvider`` that records the + * ``opts`` object it was handed). Proves (a) the loop spreads + * ``callContext.call_id`` into ``opts.callId``, and (b) a provider that reads + * only ``opts.signal`` is unaffected by the added field. + */ + +import { describe, expect, it } from 'vitest'; +import { LLMLoop } from '../src/llm-loop'; +import type { LLMChunk, LLMProvider, LLMStreamOptions } from '../src/llm-loop'; + +/** Records the stream options it received and yields a single text chunk. */ +class RecordingProvider implements LLMProvider { + static readonly providerKey = 'recording'; + public lastOpts: LLMStreamOptions | undefined; + public callCount = 0; + + async *stream( + _messages: Array>, + _tools?: Array> | null, + opts?: LLMStreamOptions, + ): AsyncGenerator { + this.callCount += 1; + this.lastOpts = opts; + yield { type: 'text', content: 'ok' }; + } +} + +/** A stock provider that only reads ``opts.signal`` — never ``opts.callId``. */ +class SignalOnlyProvider implements LLMProvider { + static readonly providerKey = 'signal_only'; + public sawAbort = false; + + async *stream( + _messages: Array>, + _tools?: Array> | null, + opts?: LLMStreamOptions, + ): AsyncGenerator { + this.sawAbort = opts?.signal?.aborted ?? 
false; + yield { type: 'text', content: 'hello' }; + } +} + +/** + * The most minimal custom provider: a ``stream(messages, tools)`` that declares + * NO third ``opts`` parameter at all. Proves the loop's extra positional + * options arg silently no-ops in TS (unlike Python keyword args, which require + * an inspect.signature guard) — no guard code needed on the TS side. + */ +class NoOptsProvider implements LLMProvider { + static readonly providerKey = 'no_opts'; + public callCount = 0; + + // Intentionally omits the opts parameter — the loop still passes streamOpts + // as a third positional argument, which a generator ignoring it discards. + async *stream( + _messages: Array>, + _tools?: Array> | null, + ): AsyncGenerator { + this.callCount += 1; + yield { type: 'text', content: 'done' }; + } +} + +async function drain(gen: AsyncGenerator): Promise { + let out = ''; + for await (const tok of gen) out += tok; + return out; +} + +describe('[unit] LLMLoop call_id threading', () => { + it('forwards call_id from call_context into provider.stream opts.callId', async () => { + const provider = new RecordingProvider(); + const loop = new LLMLoop('', 'm', 'be helpful', null, provider); + + const out = await drain( + loop.run('hi', [], { call_id: 'xyz', caller: '+15555550100', callee: '+15555550101' }), + ); + + expect(out).toBe('ok'); + expect(provider.callCount).toBe(1); + expect(provider.lastOpts?.callId).toBe('xyz'); + }); + + it('does not set callId when call_context has no call_id (no synthetic value)', async () => { + const provider = new RecordingProvider(); + const loop = new LLMLoop('', 'm', 'be helpful', null, provider); + + await drain(loop.run('hi', [], {})); + + // The loop leaves opts untouched (undefined) — no '' or 'undefined' leaks. 
+ expect(provider.lastOpts?.callId).toBeUndefined(); + }); + + it('leaves signal-only providers unaffected by the added callId field', async () => { + const provider = new SignalOnlyProvider(); + const loop = new LLMLoop('', 'm', 'be helpful', null, provider); + + const out = await drain( + loop.run('hi', [], { call_id: 'abc' }, undefined, undefined, undefined, { + signal: AbortSignal.timeout(60_000), + }), + ); + + // Backward compatible: the provider runs to completion, reads only signal. + expect(out).toBe('hello'); + expect(provider.sawAbort).toBe(false); + }); + + it('runs a minimal provider whose stream omits the opts param entirely (no guard needed)', async () => { + const provider = new NoOptsProvider(); + const loop = new LLMLoop('', 'm', 'be helpful', null, provider); + + // call_id is present, so the loop builds and passes streamOpts as the third + // positional arg. The provider's two-arg generator simply ignores it. + const out = await drain(loop.run('hi', [], { call_id: 'abc' })); + + expect(out).toBe('done'); + expect(provider.callCount).toBe(1); + }); +}); diff --git a/libraries/typescript/tests/llm-openai-compatible.mocked.test.ts b/libraries/typescript/tests/llm-openai-compatible.mocked.test.ts new file mode 100644 index 00000000..f10bcf54 --- /dev/null +++ b/libraries/typescript/tests/llm-openai-compatible.mocked.test.ts @@ -0,0 +1,364 @@ +/** + * Tests for the generic OpenAI-compatible LLM provider. + * + * The provider construction, request-body assembly, header assembly, timeout + * selection, env-key resolution, and SSE normalisation are ALL real code. The + * only mocked surface is ``global.fetch`` — the paid/external HTTP boundary — + * stubbed to return either a captured request or a real SSE ``ReadableStream`` + * fixture. Everything inward (``buildBody`` / ``buildHeaders`` / + * ``parseOpenAISseStream``) runs unmodified. 
+ */ + +import { describe, expect, it, vi, afterEach, beforeEach } from 'vitest'; +import { + OpenAICompatibleLLMProvider, + LLM, +} from '../src/llm/openai-compatible'; +import type { LLMChunk } from '../src/llm-loop'; +import { PatterConnectionError } from '../src/errors'; + +const originalFetch = globalThis.fetch; + +afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + delete process.env.OAICOMPAT_TEST_KEY; +}); + +/** Capture the single fetch the provider issues, returning a 200 + empty body. */ +function captureFetch(): { calls: Array<{ url: string; init: RequestInit }> } { + const calls: Array<{ url: string; init: RequestInit }> = []; + globalThis.fetch = vi.fn( + async (url: string | URL | Request, init?: RequestInit) => { + calls.push({ url: String(url), init: init ?? {} }); + // Empty SSE body — the stream parser drains it cleanly. + return new Response('', { status: 200 }); + }, + ) as unknown as typeof fetch; + return { calls }; +} + +/** A real streaming OpenAI-format SSE body (text + tool_call + usage). */ +function sseFixtureResponse(): Response { + const lines = [ + 'data: {"choices":[{"delta":{"content":"Hi"}}]}\n\n', + 'data: {"choices":[{"delta":{"content":" there"}}]}\n\n', + 'data: {"choices":[{"delta":{"tool_calls":[{"index":0,"id":"call_1","function":{"name":"book","arguments":"{}"}}]}}]}\n\n', + 'data: {"choices":[],"usage":{"prompt_tokens":11,"completion_tokens":4}}\n\n', + 'data: [DONE]\n\n', + ]; + const stream = new ReadableStream({ + start(controller) { + const enc = new TextEncoder(); + for (const l of lines) controller.enqueue(enc.encode(l)); + controller.close(); + }, + }); + return new Response(stream, { status: 200 }); +} + +async function drainBody( + provider: OpenAICompatibleLLMProvider, + callId?: string, +): Promise<{ body: Record; headers: Record }> { + const { calls } = captureFetch(); + for await (const _ of provider.stream( + [{ role: 'user', content: 'hi' }], + null, + callId ? 
{ callId } : undefined, + )) { + // drain + } + const init = calls[0].init; + return { + body: JSON.parse(init.body as string) as Record, + headers: init.headers as Record, + }; +} + +describe('[unit] OpenAICompatibleLLMProvider construction', () => { + it('points the request at the configured base URL and model', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + }); + expect(provider.model).toBe('m'); + const { calls } = captureFetch(); + for await (const _ of provider.stream([{ role: 'user', content: 'hi' }])) { + // drain + } + expect(calls[0].url).toBe('http://127.0.0.1:9/v1/chat/completions'); + }); + + it('constructs a keyless gateway without error and omits the Authorization header', async () => { + // base_url set, no apiKey, no apiKeyEnv → Ollama/vLLM/LM-Studio path. + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:11434/v1', + model: 'llama3.1', + }); + const { headers } = await drainBody(provider); + expect(headers.Authorization).toBeUndefined(); + expect(headers['User-Agent']).toMatch(/^getpatter\//); + }); + + it('resolves the api key from the named environment variable', async () => { + process.env.OAICOMPAT_TEST_KEY = 'secret-token-xyz'; + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + apiKeyEnv: 'OAICOMPAT_TEST_KEY', + }); + const { headers } = await drainBody(provider); + expect(headers.Authorization).toBe('Bearer secret-token-xyz'); + }); + + it('the LLM alias subclass is constructable and points at its base URL', () => { + const llm = new LLM({ baseUrl: 'http://127.0.0.1:9/v1', model: 'm' }); + expect(llm).toBeInstanceOf(OpenAICompatibleLLMProvider); + expect(llm.model).toBe('m'); + }); +}); + +describe('[unit] OpenAICompatibleLLMProvider session continuity', () => { + it('omits the user field by default (sessionUserPrefix unset)', async () => { + const provider = new 
OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + }); + const { body } = await drainBody(provider, 'abc'); + expect(body.user).toBeUndefined(); + }); + + it('emits a stable patter-call user id when sessionUserPrefix + callId set', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionUserPrefix: 'patter-call-', + }); + const { body } = await drainBody(provider, 'abc'); + expect(body.user).toBe('patter-call-abc'); + }); + + it('omits the user field when sessionUserPrefix set but no callId is available', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionUserPrefix: 'patter-call-', + }); + const { body } = await drainBody(provider); // no callId + expect(body.user).toBeUndefined(); + }); + + it('emits the session-id header as `${prefix}${callId}` INDEPENDENT of sessionUserPrefix', async () => { + // No sessionUserPrefix → no user field, but the session-id header still fires. + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionIdHeader: 'X-Hermes-Session-Id', + sessionIdPrefix: 'patter-call-', + }); + const { body, headers } = await drainBody(provider, 'abc'); + expect(body.user).toBeUndefined(); + expect(headers['X-Hermes-Session-Id']).toBe('patter-call-abc'); + }); + + it('defaults the session-id prefix to "" (raw call id) when sessionIdPrefix unset', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionIdHeader: 'x-openclaw-session-key', + }); + const { headers } = await drainBody(provider, 'c2'); + // Regression: wire-identical to the old session_header behaviour. 
+ expect(headers['x-openclaw-session-key']).toBe('c2'); + }); + + it('omits the session-id header when no callId is available', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionIdHeader: 'X-Hermes-Session-Id', + sessionIdPrefix: 'patter-call-', + }); + const { headers } = await drainBody(provider); // no callId + expect(headers['X-Hermes-Session-Id']).toBeUndefined(); + }); + + it('emits a STATIC session-key header (value == sessionKey, no call-id interpolation)', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionKeyHeader: 'X-Hermes-Session-Key', + sessionKey: 'mem-123', + }); + // Independent of call id: present with AND without a callId, value unchanged. + const withCall = await drainBody(provider, 'abc'); + expect(withCall.headers['X-Hermes-Session-Key']).toBe('mem-123'); + const noCall = await drainBody(provider); + expect(noCall.headers['X-Hermes-Session-Key']).toBe('mem-123'); + }); + + it('omits the session-key header when sessionKeyHeader set but sessionKey undefined (opt-in)', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionKeyHeader: 'X-Hermes-Session-Key', + }); + const { headers } = await drainBody(provider, 'abc'); + expect(headers['X-Hermes-Session-Key']).toBeUndefined(); + }); + + it('combines all three signals into one request, preserving extraHeaders (no clobber)', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionUserPrefix: 'patter-call-', + sessionIdHeader: 'X-Hermes-Session-Id', + sessionIdPrefix: 'patter-call-', + sessionKeyHeader: 'X-Hermes-Session-Key', + sessionKey: 'mem-123', + extraHeaders: { 'X-Foo': '1' }, + }); + const { body, headers } = await drainBody(provider, 'abc'); + expect(body.user).toBe('patter-call-abc'); + 
expect(headers['X-Hermes-Session-Id']).toBe('patter-call-abc'); + expect(headers['X-Hermes-Session-Key']).toBe('mem-123'); + // Pre-existing headers survive alongside the session headers. + expect(headers['X-Foo']).toBe('1'); + expect(headers['User-Agent']).toMatch(/^getpatter\//); + }); + + it('sends no user field and no session headers when none configured (byte-identical baseline)', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + }); + const { body, headers } = await drainBody(provider, 'abc'); + expect(body.user).toBeUndefined(); + // Only the baseline headers — no session signals. + expect(Object.keys(headers).sort()).toEqual(['Content-Type', 'User-Agent']); + }); +}); + +describe('[unit] OpenAICompatibleLLMProvider headers and timeout', () => { + it('merges extraHeaders alongside the getpatter User-Agent', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + extraHeaders: { 'X-Foo': '1' }, + }); + const { headers } = await drainBody(provider); + expect(headers['X-Foo']).toBe('1'); + expect(headers['User-Agent']).toMatch(/^getpatter\//); + }); + + it('honours the configurable timeout instead of the base 30 s ceiling', async () => { + const timeoutSpy = vi.spyOn(AbortSignal, 'timeout'); + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + timeout: 120, + }); + const { calls } = captureFetch(); + for await (const _ of provider.stream([{ role: 'user', content: 'hi' }])) { + // drain + } + expect(calls.length).toBe(1); + // The request timeout (not the 5 s warmup) must be 120_000 ms, proving + // the base provider's hardcoded 30_000 ms ceiling was replaced. 
+ expect(timeoutSpy).toHaveBeenCalledWith(120_000); + expect(timeoutSpy).not.toHaveBeenCalledWith(30_000); + }); + + it('defaults the generic timeout to 60 s', async () => { + const timeoutSpy = vi.spyOn(AbortSignal, 'timeout'); + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + }); + const { calls } = captureFetch(); + for await (const _ of provider.stream([{ role: 'user', content: 'hi' }])) { + // drain + } + expect(calls.length).toBe(1); + expect(timeoutSpy).toHaveBeenCalledWith(60_000); + }); +}); + +describe('[mocked] OpenAICompatibleLLMProvider streaming over the HTTP boundary', () => { + let captured: { url: string; init: RequestInit } | undefined; + + beforeEach(() => { + captured = undefined; + globalThis.fetch = vi.fn( + async (url: string | URL | Request, init?: RequestInit) => { + captured = { url: String(url), init: init ?? {} }; + return sseFixtureResponse(); + }, + ) as unknown as typeof fetch; + }); + + it('sends the user field + session-id header on the wire and normalises real SSE chunks', async () => { + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + sessionUserPrefix: 'patter-call-', + sessionIdHeader: 'X-Hermes-Session-Id', + sessionIdPrefix: 'patter-call-', + }); + + const chunks: LLMChunk[] = []; + for await (const c of provider.stream( + [{ role: 'user', content: 'book me in' }], + null, + { callId: 'call-99' }, + )) { + chunks.push(c); + } + + // The POST carried the session continuity signals on the wire. + const body = JSON.parse(captured!.init.body as string) as Record; + expect(body.user).toBe('patter-call-call-99'); + const headers = captured!.init.headers as Record; + expect(headers['X-Hermes-Session-Id']).toBe('patter-call-call-99'); + + // Real SSE chunks normalised by the shared parser. 
+ const texts = chunks.filter((c) => c.type === 'text').map((c) => c.content); + expect(texts.join('')).toBe('Hi there'); + + const toolCall = chunks.find((c) => c.type === 'tool_call'); + expect(toolCall).toMatchObject({ type: 'tool_call', name: 'book', id: 'call_1' }); + + const usage = chunks.find((c) => c.type === 'usage'); + expect(usage).toMatchObject({ type: 'usage', inputTokens: 11, outputTokens: 4 }); + }); + + it('throws PatterConnectionError on a non-OK gateway response instead of yielding an empty turn', async () => { + globalThis.fetch = vi.fn( + async () => + new Response('upstream gateway unavailable', { + status: 503, + statusText: 'Service Unavailable', + }), + ) as unknown as typeof fetch; + + const provider = new OpenAICompatibleLLMProvider({ + baseUrl: 'http://127.0.0.1:9/v1', + model: 'm', + }); + + // The stream must surface the failure (matching the base OpenAILLMProvider) + // so LLMLoop marks the turn errored rather than silently completing empty. + const drain = async (): Promise => { + for await (const _ of provider.stream( + [{ role: 'user', content: 'hi' }], + null, + )) { + // should never reach here + } + }; + + await expect(drain()).rejects.toBeInstanceOf(PatterConnectionError); + await expect(drain()).rejects.toThrow('503'); + }); +});