lemonade-sdk · abn · Jun 22, 2026 · Jun 22, 2026 · jeremyfowers · Jun 25, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -408,8 +408,8 @@ add_compile_definitions(CPPHTTPLIB_THREAD_POOL_COUNT=8)
 
 # Platform-specific compiler definitions
 if(WIN32)
-    # Set Windows target version to Windows 10 for httplib v0.26.0
-    add_compile_definitions(_WIN32_WINNT=0x0A00)
+    # Set Windows target version to Windows 10 for httplib v0.26.0, and prevent Windows.h min/max macro definitions
+    add_compile_definitions(_WIN32_WINNT=0x0A00 NOMINMAX)
     if(MSVC)
         # Add security-hardening compiler flags for MSVC
         # Control Flow Guard - prevents control flow hijacking
@@ -596,6 +596,7 @@ set(SOURCES_CORE
     src/cpp/server/system_info.cpp
     src/cpp/server/recipe_options.cpp
     src/cpp/server/runtime_config.cpp
+    src/cpp/server/telemetry.cpp
     src/cpp/server/logging_config.cpp
     src/cpp/server/log_stream.cpp
     src/cpp/server/prometheus_metrics.cpp
@@ -1834,3 +1835,24 @@ if(EXISTS "${_AUTO_TUNE_TEST_SRC}")
     include(CTest)
     add_test(NAME AutoTuneTest COMMAND test_auto_tune)
 endif()
+
+set(_TELEMETRY_HELPERS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_telemetry_helpers.cpp")
+if(EXISTS "${_TELEMETRY_HELPERS_TEST_SRC}")  # Build and register this test only when its source file is present
+    add_executable(test_telemetry_helpers "${_TELEMETRY_HELPERS_TEST_SRC}")
+    target_link_libraries(test_telemetry_helpers PRIVATE lemonade-server-core)
+    add_test(NAME TelemetryHelpersTest COMMAND test_telemetry_helpers)
+endif()
+
+set(_CONFIG_MIGRATION_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_config_migration.cpp")
+if(EXISTS "${_CONFIG_MIGRATION_TEST_SRC}")  # Build and register this test only when its source file is present
+    add_executable(test_config_migration "${_CONFIG_MIGRATION_TEST_SRC}")
+    target_link_libraries(test_config_migration PRIVATE lemonade-server-core)
+    add_test(NAME ConfigMigrationTest COMMAND test_config_migration)
+endif()
+
+set(_CONFIG_TELEMETRY_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_config_telemetry.cpp")
+if(EXISTS "${_CONFIG_TELEMETRY_TEST_SRC}")  # Build and register this test only when its source file is present
+    add_executable(test_config_telemetry "${_CONFIG_TELEMETRY_TEST_SRC}")
+    target_link_libraries(test_config_telemetry PRIVATE lemonade-server-core)
+    add_test(NAME ConfigTelemetryTest COMMAND test_config_telemetry)
+endif()
diff --git a/docs/api/lemonade.md b/docs/api/lemonade.md
@@ -27,6 +27,7 @@ We have designed a set of Lemonade-specific endpoints to enable client applicati
 | `WS` | [`/logs/stream`](#log-streaming-api-websocket) | Log Streaming |
 | `GET` | [`/live`](#get-live) | Check server liveness for load balancers and orchestrators |
 | `GET` | [`/metrics`](#get-metrics) | Prometheus metrics scrape endpoint |
+| `POST` | [`/internal/telemetry/flush`](#post-internaltelemetryflush) | Force-flush all queued telemetry trace spans |
 
 ## `POST /v1/pull`
 <sub>![Status](https://img.shields.io/badge/status-fully_available-green)</sub>
@@ -683,6 +684,9 @@ curl http://localhost:13305/v1/health
     "llm":1,
     "reranking":1,
     "tts":1
+  },
+  "telemetry": {
+    "enabled": false
   }
 }
 ```
@@ -712,6 +716,9 @@ curl http://localhost:13305/v1/health
   - `image` - Maximum image models
   - `tts` - Maximum text-to-speech models
 - `websocket_port` - *(optional)* Port of the WebSocket server for the [Realtime Audio Transcription API](./openai.md#ws-realtime) and [Log Streaming API](#log-streaming-api-websocket). Only present when the WebSocket server is running. The port is OS-assigned or set via `--websocket-port`.
+- `telemetry` - Structured telemetry state object:
+  - `enabled` - Boolean indicating if telemetry collection is active
+  - `captures` - *(optional)* Array of captured telemetry components (e.g., `["inputs", "outputs", "thinking"]`), only present when `enabled` is `true`.
 
 ## `GET /v1/stats`
 <sub>![Status](https://img.shields.io/badge/status-fully_available-green)</sub>
@@ -1383,3 +1390,32 @@ curl http://localhost:13305/live
 ```json
 {"status":"ok"}
 ```
+
+## Internal Endpoints
+
+Internal endpoints are used for server control and configuration. By default, they are secured by `LEMONADE_ADMIN_API_KEY` (if set) to separate control privileges from standard inference operations.
+
+## `POST /internal/telemetry/flush`
+<sub>![Status](https://img.shields.io/badge/status-fully_available-green)</sub>
+
+Forces the in-memory telemetry queue to flush all buffered trace spans immediately to the configured OTLP collector. This call blocks until all currently queued spans are serialized and sent.
+
+#### Parameters
+
+None.
+
+Example request:
+
+```bash
+curl -X POST http://localhost:13305/internal/telemetry/flush
+```
+
+#### Response Format
+
+Returns a JSON object indicating successful completion of the flush operation:
+
+```json
+{
+  "status": "flushed"
+}
+```
diff --git a/docs/guide/cli.md b/docs/guide/cli.md
@@ -16,6 +16,7 @@ The `lemonade` CLI is the primary tool for interacting with Lemonade Server from
 - [Options for launch](#options-for-launch)
 - [Options for bench](#options-for-bench)
 - [Options for scan](#options-for-scan)
+- [Options for telemetry](#options-for-telemetry)
 
 ## Commands
 
@@ -38,6 +39,7 @@ The `lemonade` CLI is the primary tool for interacting with Lemonade Server from
 | `backends`          | List supported recipes and backends or list all available recipes and backends with `--all`. Use `install` or `uninstall` to manage backends. |
 | `cloud`             | Manage cloud OpenAI-compatible providers. See command options [below](#options-for-cloud). |
 | `scan`              | Scan for network beacons on the local network. See command options [below](#options-for-scan). |
+| `telemetry`         | Dynamically enable or disable telemetry tracing. See command options [below](#options-for-telemetry). |
 
 ### Model Management
 
@@ -659,6 +661,29 @@ lemonade scan
 lemonade scan --duration 5
 ```
 
+## Options for telemetry
+
+Dynamically toggle telemetry tracing on the server. The change takes effect immediately in memory, without a server restart, but it is not persisted to the server's `config.json`, so it reverts the next time the server restarts.
+
+```bash
+lemonade telemetry <on|off>
+```
+
+| Argument | Description |
+|----------|-------------|
+| `on`     | Enable telemetry tracing. |
+| `off`    | Disable telemetry tracing. |
+
+**Examples:**
+
+```bash
+# Enable telemetry tracing dynamically
+lemonade telemetry on
+
+# Disable telemetry tracing dynamically
+lemonade telemetry off
+```
+
 ## Options for bench
 
 The `bench` command measures chat completion performance (TTFT and tokens-per-second) for one or more models across one or more installed backends, context sizes, and scenario workloads. It sends `POST /api/v1/chat/completions` requests and extracts timing data from the server response.

diff --git a/docs/guide/configuration/README.md b/docs/guide/configuration/README.md
@@ -33,7 +33,7 @@ Values set in the user's `config.json` always take precedence over these seeded
 
 ```json
 {
-  "config_version": 1,
+  "config_version": 2,
   "port": 13305,
   "host": "localhost",
   "log_level": "info",
@@ -83,13 +83,30 @@ Values set in the user's `config.json` always take precedence over these seeded
     "vulkan_bin": "builtin"
   },
   "flm": {
-    "args": "",
+    "args": ""
   },
   "ryzenai": {
     "server_bin": "builtin"
   },
   "kokoro": {
     "cpu_bin": "builtin"
+  },
+  "telemetry": {
+    "enabled": false,
+    "hide_inputs": false,
+    "hide_outputs": false,
+    "hide_thinking": false,
+    "max_queue_capacity": 1000,
+    "otlp": {
+      "endpoint": "http://localhost:4318/v1/traces",
+      "protocol": "http/protobuf",
+      "semantics": ["openinference", "otel_genai"],
+      "headers": {},
+      "max_retries": 0,
+      "retry_backoff_base_s": 5.0,
+      "send_batch_size": 100,
+      "batch_timeout_s": 1.0
+    }
   }
 }
 ```
@@ -171,6 +188,53 @@ Backend-specific settings are nested under their backend name:
 
 API keys for these providers are **not** stored in `config.json` — they live in `LEMONADE_<PROVIDER>_API_KEY` env vars (persistent) or `lemond` process memory via `POST /v1/cloud/auth` (ephemeral). Manage providers with `lemonade cloud install/uninstall/auth/list` rather than editing this section by hand.
 
+**telemetry** — Unified telemetry and tracing configuration:
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `enabled` | bool | false | Enable or disable telemetry tracing. |
+| `hide_inputs` | bool | false | Redact prompt message content from spans. |
+| `hide_outputs` | bool | false | Redact generated assistant message content from spans. |
+| `hide_thinking` | bool | false | Redact reasoning/thought content from spans. |
+| `max_queue_capacity` | int | 1000 | The maximum capacity of the in-memory telemetry queue buffer. Oldest spans are dropped when full. Must be `> 0`. |
+| `otlp` | object | (nested object) | Sub-block grouping OTLP transport details (see below). |
+
+**telemetry.otlp** — Nested OTLP settings:
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `endpoint` | string | "http://localhost:4318/v1/traces" | The OTLP endpoint to send traces to. |
+| `protocol` | string | "http/protobuf" | Supported OTLP trace protocol: `"http/protobuf"` or `"http/json"`. |
+| `semantics` | array of strings | ["openinference", "otel_genai"] | Active trace semantics. Supported values: `"openinference"` and `"otel_genai"`. |
+| `headers` | object | {} | Map of custom HTTP headers to pass to the OTLP receiver. |
+| `max_retries` | int | 0 | Maximum number of retry attempts for failed exports. Set to `0` to disable retries and discard failed spans immediately. Must be `>= 0`. |
+| `retry_backoff_base_s` | double | 5.0 | Base delay in seconds for exponential backoff retries. Must be `>= 0`. |
+| `send_batch_size` | int | 100 | Target maximum number of spans to group in a single batched OTLP request. Must be `>= 1`. |
+| `batch_timeout_s` | double | 1.0 | Maximum time to wait in seconds before exporting a partially filled batch of spans. Must be `> 0`. |
+
+#### Telemetry and Tracing Details
+
+Lemonade uses a unified telemetry subsystem to trace requests and capture critical execution spans. The following technical behaviors apply:
+
+- **Multi-Standard Semantic Conventions**: Supports exporting traces using two co-existing semantics:
+  - **OpenInference**: Uses Arize Phoenix-compatible attributes (e.g., `openinference.span.kind`, `llm.model_name`, `llm.token_count.*`).
+  - **OpenTelemetry GenAI**: Uses standard OpenTelemetry GenAI properties (`gen_ai.system`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`, `gen_ai.input.messages`, `gen_ai.output.messages`).
+  When both semantics are specified in `telemetry.otlp.semantics`, trace spans carry attributes for both conventions in a single network payload. This allows the collector to parse either convention without duplicate network requests.
+- **Dynamic Attribute Prefixing**: Span attributes are dynamically prefixed based on the query type to simplify filtering:
+  - `llm.*` for standard chat and completion spans.
+  - `embedding.*` for text embedding generation spans.
+  - `reranker.*` for document reranking spans.
+- **Token Tracking**: Captures and reports token usage metrics using semantic attributes depending on the enabled semantics:
+  - For **OpenInference**: Token counts use the `llm.token_count.*` prefix across all span kinds (`llm.token_count.prompt`, `llm.token_count.completion`, `llm.token_count.total`), alongside legacy keys such as `llm.usage.prompt_tokens`.
+  - For **OpenTelemetry GenAI**: Token count uses standard fields like `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens`.
+- **Calculated Performance Metrics**: In streaming mode, the server automatically computes and records throughput (`llm.performance.tokens_per_second` / `gen_ai.usage.tokens_per_second` depending on semantic conventions) and prefill latency (`llm.performance.time_to_first_token` / `gen_ai.performance.time_to_first_token`) if not natively returned by the backend (e.g., for vLLM and Cloud models).
+- **vLLM Engine Telemetry**: For the vLLM backend, the server queries the local `/metrics` endpoint on completion to attach scheduler queue metrics (`llm.vllm.num_requests_waiting`, `llm.vllm.num_requests_running`, `llm.vllm.num_requests_swapped`) and KV cache utilization (`llm.vllm.gpu_cache_usage_factor`, `llm.vllm.cpu_cache_usage_factor`) directly to the trace spans.
+- **Reasoning Model Support**: For reasoning models (e.g., DeepSeek models), the server extracts and records `reasoning_content` from the assistant's generation. Any variant thought-termination tags (e.g., `</think|>`) are automatically standardized to the canonical `</think>` tag.
+- **Exporter Retry Backoff**: When retries are enabled (i.e., `max_retries > 0`), the exporter uses an exponential backoff strategy combined with randomized jitter for failed export requests. The retry interval starts at `retry_backoff_base_s` seconds (default: `5.0`) and doubles on each subsequent failure (e.g., 5s, 10s, 20s, 40s), capped at 60 seconds. A randomized jitter factor between `0.5` and `1.5` is applied to each calculated delay to prevent a "thundering herd" when the collector recovers. Permanent client errors (`4xx` HTTP status codes, excluding `429 Too Many Requests`) are classified as non-retryable and cause the batch to be dropped immediately to save resources.
+- **OTLP Trace Batching**: Spans are aggregated in an in-memory queue buffer and exported in batches to minimize network overhead and maximize compression efficiency. Batching operates on a dual-trigger system: a batch is immediately serialized and dispatched if it reaches `send_batch_size` (default: `100`), or if `batch_timeout_s` (default: `1.0` second) has elapsed since the oldest span in the batch arrived. All remaining traces are flushed cleanly to the OTel collector upon server shutdown. Users can also trigger a manual flush at any time via the `POST /internal/telemetry/flush` endpoint.
+- **Request Failure Tracing**: Captures request failures directly on the telemetry spans. If a model fails to load, a request is rejected by the router, or a streaming connection encounters an exception or a non-200 HTTP status code from the backend, the span is ended with `Error` status and the specific error message is attached.
+- **Queue Blocking & Thundering Herd Prevention**: To prevent client requests from hanging and to avoid exhausting resources when the telemetry receiver endpoint is down, Lemonade employs a fail-fast mechanism. The exporter memory buffer is strictly bounded to a capacity of `max_queue_capacity` spans (default: `1000`). When full, a head-drop (FIFO) eviction policy is applied to drop the oldest telemetry spans to make room for newer ones, prioritizing current application state. If a telemetry transmission task fails all of its retries and is dropped, the endpoint is marked as **unreachable**. While in this unreachable state, subsequent spans in the transmission queue are attempted only once and immediately dropped without backoff delay if they fail, preventing the telemetry queue from blocking server operations. A single successful span delivery to the endpoint automatically resets the unreachable state and restores normal retry behavior.
+
 ### Backend binary selection
 
 Every `*_bin` key (e.g. `llamacpp.vulkan_bin`, `whispercpp.cpu_bin`, `sdcpp.rocm_bin`) accepts the same set of values:
@@ -185,7 +249,7 @@ Every `*_bin` key (e.g. `llamacpp.vulkan_bin`, `whispercpp.cpu_bin`, `sdcpp.rocm
 
 > Note: the `latest` setting is experimental.
 
-> **Important — `llamacpp.rocm_bin` version tags are channel-specific.** Each ROCm channel downloads from a different GitHub repository, so you must set the correct `rocm_channel` before pinning `rocm_bin` to a specific tag. See [Pinning to a Specific Version Tag](./llamacpp.md#pinning-to-a-specific-version-tag) for details.
+> Note: `llamacpp.rocm_bin` version tags are channel-specific. Each ROCm channel downloads from a different GitHub repository, so you must set the correct `rocm_channel` before pinning `rocm_bin` to a specific tag. See [Pinning to a Specific Version Tag](./llamacpp.md#pinning-to-a-specific-version-tag) for details.
 
 Examples: