open-telemetry · trask · Jun 26, 2026 · Jun 25, 2026 · Jun 26, 2026
@@ -80,9 +80,55 @@
 ---END THREAD---
 """
 
+# Copilot CLI JSONL is an external boundary, not a typed API: usage shapes vary
+# by CLI event, provider, model, and version. Normalize known key spellings at
+# ingest (aliases tried in order) to keep the dashboard schema stable.
+_COPILOT_USAGE_FIELDS = {
+    "input_tokens": ("input_tokens", "prompt_tokens", "promptTokens", "inputTokens"),
+    "output_tokens": ("output_tokens", "completion_tokens", "completionTokens", "outputTokens"),
+    "total_tokens": ("total_tokens", "totalTokens", "tokens"),
+}
+
+
+def _numeric_usage_value(value: Any) -> int | None:
+    """Return ``value`` as a non-negative int, or None when it is not a usable count."""
+    # bool subclasses int, so True/False must be rejected before the numeric check.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return None
+    # Fractional, nan, inf and negative values are malformed token counts.
+    whole = isinstance(value, int) or value.is_integer()
+    return int(value) if whole and value >= 0 else None
+
+
+def _first_usage_value(usage: dict[str, Any], field_names: tuple[str, ...]) -> int:
+    """Return the first numeric value found under ``field_names``, else 0."""
+    # Aliases are tried in the order given; missing keys and values that
+    # _numeric_usage_value rejects are skipped rather than treated as zero.
+    candidates = (
+        _numeric_usage_value(usage[alias]) for alias in field_names if alias in usage
+    )
+    return next((count for count in candidates if count is not None), 0)
+
+
+def normalize_copilot_usage(usage: Any) -> dict[str, int]:
+    """Map a raw usage payload to canonical token counts, omitting zero fields.
+
+    Non-dict input yields ``{}``; a missing or zero total is input + output.
+    """
+    if not isinstance(usage, dict):
+        return {}
+    counts = {
+        name: _first_usage_value(usage, aliases)
+        for name, aliases in _COPILOT_USAGE_FIELDS.items()
+    }
+    if not counts["total_tokens"]:
+        counts["total_tokens"] = counts["input_tokens"] + counts["output_tokens"]
+    return {name: count for name, count in counts.items() if count}
+
 
-def parse_copilot_jsonl(s: str) -> str:
+def parse_copilot_jsonl(s: str) -> tuple[str, dict[str, int]]:
     parts: list[str] = []
+    usage: dict[str, int] = {}
     for line in s.splitlines():
         line = line.strip()
         if not line.startswith("{"):
@@ -95,7 +141,12 @@ def parse_copilot_jsonl(s: str) -> str:
             content = (evt.get("data") or {}).get("content")
             if isinstance(content, str):
                 parts.append(content)
-    return "\n".join(parts)
+        event_usage = normalize_copilot_usage(evt.get("usage"))
+        if not event_usage:
+            event_usage = normalize_copilot_usage((evt.get("data") or {}).get("usage"))
+        for key, value in event_usage.items():
+            usage[key] = usage.get(key, 0) + value
+    return "\n".join(parts), usage
 
 
 def extract_json_object(s: str) -> dict[str, Any] | None:
@@ -206,14 +257,18 @@ def run_llm_for_thread(thread: dict[str, Any], model: str) -> dict[str, Any]:
         errors="replace",
         timeout=LLM_THREAD_TIMEOUT_SECONDS,
     )
-    response_text = parse_copilot_jsonl(proc.stdout)
+    response_text, usage = parse_copilot_jsonl(proc.stdout)
     decision, valid_response = parse_thread_decision(response_text)
-    return {
+    record = {
         "thread_id": thread["thread_id"],
         "thread_kind": thread["thread_kind"],
+        "_copilot_cli_call": True,
         "failed": proc.returncode != 0 or not valid_response,
         "decision": decision,
     }
+    if usage:
+        record["usage"] = usage
+    return record
 
 
 def thread_cache_key(thread: dict[str, Any], model: str) -> str:
@@ -247,6 +302,14 @@ def save_classification_cache(pr_number: int, cache: dict[str, dict[str, Any]])
     path.write_text(json.dumps(cache, sort_keys=True, indent=2), encoding="utf-8")
 
 
+def cached_classification_record(record: dict[str, Any]) -> dict[str, Any]:
+    """Return a shallow copy of ``record`` minus the per-run, non-cacheable keys."""
+    # Dropping the call marker keeps cache hits out of add_copilot_usage's counts.
+    transient = ("_copilot_cli_call", "error", "response_text", "usage")
+    kept_keys = [key for key in record if key not in transient]
+    return {key: record[key] for key in kept_keys}
+
+
 def prune_classification_cache(open_pr_numbers: set[int]) -> None:
     if not CLASSIFICATION_CACHE_DIR.exists():
         return
@@ -265,7 +328,7 @@ def classify_threads(number: int, threads: list[dict[str, Any]], model: str) ->
         key = thread_cache_key(thread, model)
         cached = cache_in.get(key)
         if isinstance(cached, dict):
-            record = {k: v for k, v in cached.items() if k not in ("error", "response_text", "usage")}
+            record = cached_classification_record(cached)
             record["thread_id"] = thread["thread_id"]
             record["thread_kind"] = thread["thread_kind"]
             classifications.append(record)
@@ -277,6 +340,7 @@ def classify_threads(number: int, threads: list[dict[str, Any]], model: str) ->
             record = {
                 "thread_id": thread["thread_id"],
                 "thread_kind": thread["thread_kind"],
+                "_copilot_cli_call": True,
                 "failed": True,
                 "decision": {"thread_action": "unclear", "reason": "LLM timeout"},
             }
@@ -294,6 +358,6 @@ def classify_threads(number: int, threads: list[dict[str, Any]], model: str) ->
             }
         classifications.append(record)
         if not record.get("failed"):
-            cache_out[key] = record
+            cache_out[key] = cached_classification_record(record)
     save_classification_cache(number, cache_out)
     return classifications
@@ -895,12 +895,62 @@ def build_pr_result(
 class DashboardCalculation:
     results: dict[int, dict[str, Any]]
     dashboard_state: dict[str, Any]
+    copilot_usage: dict[str, int]
     trigger_pr_result: dict[str, Any] | None = None
     current_pr_result: dict[str, Any] | None = None
     starting_pr_result: dict[str, Any] | None = None
     used_cached_dashboard_state: bool = False
 
 
+def empty_copilot_usage() -> dict[str, int]:
+    """Return a fresh, zeroed Copilot usage aggregate.
+
+    A new dict is built on every call, so callers may mutate the result
+    without affecting other aggregates.
+    """
+    call_counters = ("calls", "reported_calls", "missing_usage_calls")
+    token_counters = ("input_tokens", "output_tokens", "total_tokens")
+    return dict.fromkeys(call_counters + token_counters, 0)
+
+
+def add_copilot_usage(aggregate: dict[str, int], result: dict[str, Any] | None) -> None:
+    """Fold one PR result's Copilot CLI calls and token counts into ``aggregate``.
+
+    Only classifications flagged ``_copilot_cli_call`` count as calls; a call
+    with no usage payload is tallied under ``missing_usage_calls``.
+    """
+    for record in (result or {}).get("classifications") or []:
+        if not record.get("_copilot_cli_call"):
+            continue
+        aggregate["calls"] += 1
+        reported = record.get("usage")
+        aggregate["reported_calls" if reported else "missing_usage_calls"] += 1
+        if reported:
+            for token_key in ("input_tokens", "output_tokens", "total_tokens"):
+                aggregate[token_key] += int(reported.get(token_key) or 0)
+
+
+def copilot_usage_from_results(results: dict[int, dict[str, Any]]) -> dict[str, int]:
+    totals = empty_copilot_usage()  # fresh zeroed counters for this aggregation
+    for _pr_number, pr_result in results.items():
+        add_copilot_usage(totals, pr_result)  # accumulates into totals in place
+    return totals
+
+
+def print_copilot_usage_summary(repo: str, model: str, usage: dict[str, int]) -> None:
+    """Write a one-line usage summary to stderr; absent counters print as 0."""
+    fields = (
+        ("calls", "calls"),
+        ("reported", "reported_calls"),
+        ("missing_usage", "missing_usage_calls"),
+        ("input_tokens", "input_tokens"),
+        ("output_tokens", "output_tokens"),
+        ("total_tokens", "total_tokens"),
+    )
+    counters = ", ".join(f"{label}={usage.get(key, 0)}" for label, key in fields)
+    print(f"copilot token usage for {repo} (model={model}): {counters}", file=sys.stderr)
+
+
 def compute_pr_results(
     repo: str,
     owner: str,
@@ -930,6 +980,9 @@ def compute_pr_results(
         return DashboardCalculation(
             results=results,
             dashboard_state=dashboard_state,
+            copilot_usage=copilot_usage_from_results(
+                {pr_number: trigger_pr_result} if trigger_pr_result else {}
+            ),
             trigger_pr_result=trigger_pr_result,
             current_pr_result=current_pr_result,
             starting_pr_result=starting_pr_result,
@@ -973,6 +1026,7 @@ def compute_pr_results(
     return DashboardCalculation(
         results=results,
         dashboard_state=dashboard_state,
+        copilot_usage=copilot_usage_from_results(results),
         trigger_pr_result=trigger_pr_result,
         current_pr_result=current_pr_result,
     )
@@ -1050,6 +1104,7 @@ def update_dashboard(args: argparse.Namespace) -> int:
         args.model,
         args.required_approvals,
     )
+    print_copilot_usage_summary(repo, args.model, calculation.copilot_usage)
 
     calculation, dashboard_state_unchanged = reconcile_with_latest_dashboard(
         calculation,