Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 70 additions & 6 deletions .github/scripts/pull-request-dashboard/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,55 @@
---END THREAD---
"""

# Copilot CLI JSONL is an external boundary rather than a typed API. Usage can
# appear in CLI event-, provider-, model-, or version-specific shapes, so
# normalize known spellings at ingest and keep the dashboard schema stable.
#
# Each key is a normalized counter name; its tuple lists the source spellings
# accepted for that counter. Tuple order is the lookup priority:
# _first_usage_value() walks the aliases left to right and returns the first
# one that holds an integral number.
_COPILOT_USAGE_FIELDS = {
    "input_tokens": ("input_tokens", "prompt_tokens", "promptTokens", "inputTokens"),
    "output_tokens": ("output_tokens", "completion_tokens", "completionTokens", "outputTokens"),
    "total_tokens": ("total_tokens", "totalTokens", "tokens"),
}


def _numeric_usage_value(value: Any) -> int | None:
if isinstance(value, bool):
return None
if isinstance(value, int):
return value
if isinstance(value, float) and value.is_integer():
return int(value)
return None


def _first_usage_value(usage: dict[str, Any], field_names: tuple[str, ...]) -> int:
    """Return the first integral value stored under any of ``field_names``.

    Aliases are tried in the order given. An alias that is absent from
    ``usage``, or whose value ``_numeric_usage_value`` rejects, is skipped.
    Returns ``0`` when no alias yields a number.
    """
    # Lazy pipeline: stops converting as soon as one alias produces a number.
    converted = (
        _numeric_usage_value(usage[alias])
        for alias in field_names
        if alias in usage
    )
    return next((count for count in converted if count is not None), 0)


def normalize_copilot_usage(usage: Any) -> dict[str, int]:
    """Map a raw Copilot usage payload onto the dashboard's token counters.

    Args:
        usage: Whatever the CLI event carried under its usage key.

    Returns:
        A dict holding any of ``input_tokens``, ``output_tokens`` and
        ``total_tokens`` whose value is non-zero. Non-dict payloads yield an
        empty dict. When no total is reported, it is derived as
        input + output.
    """
    if not isinstance(usage, dict):
        return {}

    prompt, completion, total = (
        _first_usage_value(usage, _COPILOT_USAGE_FIELDS[counter])
        for counter in ("input_tokens", "output_tokens", "total_tokens")
    )
    # Fall back to the sum only when the payload reported no (or a zero) total.
    total = total or prompt + completion

    counters = (
        ("input_tokens", prompt),
        ("output_tokens", completion),
        ("total_tokens", total),
    )
    # Zero-valued counters are omitted, so an all-zero payload becomes {}.
    return {counter: count for counter, count in counters if count}


def parse_copilot_jsonl(s: str) -> str:
def parse_copilot_jsonl(s: str) -> tuple[str, dict[str, int]]:
parts: list[str] = []
usage: dict[str, int] = {}
for line in s.splitlines():
line = line.strip()
if not line.startswith("{"):
Expand All @@ -95,7 +141,12 @@ def parse_copilot_jsonl(s: str) -> str:
content = (evt.get("data") or {}).get("content")
if isinstance(content, str):
parts.append(content)
return "\n".join(parts)
event_usage = normalize_copilot_usage(evt.get("usage"))
if not event_usage:
event_usage = normalize_copilot_usage((evt.get("data") or {}).get("usage"))
for key, value in event_usage.items():
usage[key] = usage.get(key, 0) + value
return "\n".join(parts), usage


def extract_json_object(s: str) -> dict[str, Any] | None:
Expand Down Expand Up @@ -206,14 +257,18 @@ def run_llm_for_thread(thread: dict[str, Any], model: str) -> dict[str, Any]:
errors="replace",
timeout=LLM_THREAD_TIMEOUT_SECONDS,
)
response_text = parse_copilot_jsonl(proc.stdout)
response_text, usage = parse_copilot_jsonl(proc.stdout)
decision, valid_response = parse_thread_decision(response_text)
return {
record = {
"thread_id": thread["thread_id"],
"thread_kind": thread["thread_kind"],
"_copilot_cli_call": True,
"failed": proc.returncode != 0 or not valid_response,
"decision": decision,
}
if usage:
record["usage"] = usage
return record


def thread_cache_key(thread: dict[str, Any], model: str) -> str:
Expand Down Expand Up @@ -247,6 +302,14 @@ def save_classification_cache(pr_number: int, cache: dict[str, dict[str, Any]])
path.write_text(json.dumps(cache, sort_keys=True, indent=2), encoding="utf-8")


def cached_classification_record(record: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``record`` without its per-run fields.

    The ``_copilot_cli_call`` marker, ``error``, ``response_text`` and
    ``usage`` entries are dropped; every other key is kept in its original
    order. ``record`` itself is left untouched.
    """
    transient_fields = ("_copilot_cli_call", "error", "response_text", "usage")
    persisted = dict(record)
    for field in transient_fields:
        persisted.pop(field, None)
    return persisted


def prune_classification_cache(open_pr_numbers: set[int]) -> None:
if not CLASSIFICATION_CACHE_DIR.exists():
return
Expand All @@ -265,7 +328,7 @@ def classify_threads(number: int, threads: list[dict[str, Any]], model: str) ->
key = thread_cache_key(thread, model)
cached = cache_in.get(key)
if isinstance(cached, dict):
record = {k: v for k, v in cached.items() if k not in ("error", "response_text", "usage")}
record = cached_classification_record(cached)
record["thread_id"] = thread["thread_id"]
record["thread_kind"] = thread["thread_kind"]
classifications.append(record)
Expand All @@ -277,6 +340,7 @@ def classify_threads(number: int, threads: list[dict[str, Any]], model: str) ->
record = {
"thread_id": thread["thread_id"],
"thread_kind": thread["thread_kind"],
"_copilot_cli_call": True,
"failed": True,
"decision": {"thread_action": "unclear", "reason": "LLM timeout"},
}
Expand All @@ -294,6 +358,6 @@ def classify_threads(number: int, threads: list[dict[str, Any]], model: str) ->
}
classifications.append(record)
if not record.get("failed"):
cache_out[key] = record
cache_out[key] = cached_classification_record(record)
save_classification_cache(number, cache_out)
return classifications
55 changes: 55 additions & 0 deletions .github/scripts/pull-request-dashboard/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,12 +895,62 @@ def build_pr_result(
class DashboardCalculation:
    """Bundle of values returned by ``compute_pr_results``.

    NOTE(review): the annotated fields with trailing defaults suggest a
    dataclass whose decorator sits just above this class — confirm.
    """

    results: dict[int, dict[str, Any]]
    dashboard_state: dict[str, Any]
    # Aggregated Copilot CLI counters, built with copilot_usage_from_results()
    # and printed by update_dashboard via print_copilot_usage_summary().
    copilot_usage: dict[str, int]
    trigger_pr_result: dict[str, Any] | None = None
    current_pr_result: dict[str, Any] | None = None
    starting_pr_result: dict[str, Any] | None = None
    used_cached_dashboard_state: bool = False


def empty_copilot_usage() -> dict[str, int]:
    """Return a fresh aggregate with every Copilot usage counter set to zero."""
    counter_names = (
        "calls",
        "reported_calls",
        "missing_usage_calls",
        "input_tokens",
        "output_tokens",
        "total_tokens",
    )
    return dict.fromkeys(counter_names, 0)


def add_copilot_usage(aggregate: dict[str, int], result: dict[str, Any] | None) -> None:
if not result:
return
for classification in result.get("classifications") or []:
if not classification.get("_copilot_cli_call"):
continue
aggregate["calls"] += 1
usage = classification.get("usage") or {}
if not usage:
aggregate["missing_usage_calls"] += 1
continue
aggregate["reported_calls"] += 1
aggregate["input_tokens"] += int(usage.get("input_tokens") or 0)
aggregate["output_tokens"] += int(usage.get("output_tokens") or 0)
aggregate["total_tokens"] += int(usage.get("total_tokens") or 0)


def copilot_usage_from_results(results: dict[int, dict[str, Any]]) -> dict[str, int]:
    """Build a Copilot usage aggregate covering every PR result in ``results``.

    Starts from ``empty_copilot_usage()`` and applies ``add_copilot_usage``
    to each value of the mapping; the keys are not consulted.
    """
    totals = empty_copilot_usage()
    for pr_result in results.values():
        add_copilot_usage(totals, pr_result)
    return totals


def print_copilot_usage_summary(repo: str, model: str, usage: dict[str, int]) -> None:
    """Write a one-line Copilot token usage summary to stderr.

    Args:
        repo: Repository identifier echoed in the message.
        model: Model name echoed in the message.
        usage: Aggregate counters; any counter that is absent prints as 0.
    """
    # (label printed, key looked up in ``usage``), in output order.
    labelled_counters = (
        ("calls", "calls"),
        ("reported", "reported_calls"),
        ("missing_usage", "missing_usage_calls"),
        ("input_tokens", "input_tokens"),
        ("output_tokens", "output_tokens"),
        ("total_tokens", "total_tokens"),
    )
    details = ", ".join(
        f"{label}={usage.get(key, 0)}" for label, key in labelled_counters
    )
    print(f"copilot token usage for {repo} (model={model}): {details}", file=sys.stderr)


def compute_pr_results(
repo: str,
owner: str,
Expand Down Expand Up @@ -930,6 +980,9 @@ def compute_pr_results(
return DashboardCalculation(
results=results,
dashboard_state=dashboard_state,
copilot_usage=copilot_usage_from_results(
{pr_number: trigger_pr_result} if trigger_pr_result else {}
),
trigger_pr_result=trigger_pr_result,
current_pr_result=current_pr_result,
starting_pr_result=starting_pr_result,
Expand Down Expand Up @@ -973,6 +1026,7 @@ def compute_pr_results(
return DashboardCalculation(
results=results,
dashboard_state=dashboard_state,
copilot_usage=copilot_usage_from_results(results),
trigger_pr_result=trigger_pr_result,
current_pr_result=current_pr_result,
)
Expand Down Expand Up @@ -1050,6 +1104,7 @@ def update_dashboard(args: argparse.Namespace) -> int:
args.model,
args.required_approvals,
)
print_copilot_usage_summary(repo, args.model, calculation.copilot_usage)

calculation, dashboard_state_unchanged = reconcile_with_latest_dashboard(
calculation,
Expand Down