# Source: llm-benchmark-script/reporting.py (Shuichi346/llm-benchmark-script, branch main)
"""
ベンチマーク結果の保存・表示

JSON ファイルへの保存と、ターミナルへのテーブル表示を担当する。
"""

import datetime
import json
import os
from typing import Any

from tabulate import tabulate

from benchmarks import normalize_score


def format_score(value: Any) -> str:
    """Convert a score value into its display string.

    The value is first passed through ``normalize_score``. When that yields
    ``None`` the placeholder ``"N/A"`` is returned; otherwise the normalized
    score is rendered with four digits after the decimal point.
    """
    normalized = normalize_score(value)
    return "N/A" if normalized is None else f"{normalized:.4f}"


def _get_short_name(model_name: str) -> str:
    """長いモデル名を表示用に短縮する"""
    if "/" in model_name:
        return model_name.split("/")[-1]
    return model_name


def save_results(all_results: dict[str, Any], output_dir: str = "results") -> str:
    """Save every result to a timestamped JSON file.

    Args:
        all_results: The data to serialize. Values the JSON encoder cannot
            handle natively are converted with ``str``.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        Path of the written file, ``<output_dir>/benchmark_<timestamp>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Timestamp comes from the local, timezone-aware current time and has
    # one-second resolution (YYYYMMDD_HHMMSS).
    stamp = datetime.datetime.now().astimezone().strftime("%Y%m%d_%H%M%S")
    target = os.path.join(output_dir, f"benchmark_{stamp}.json")

    with open(target, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(
            all_results,
            handle,
            ensure_ascii=False,
            indent=2,
            default=str,
        )

    return target


def _extract_task_and_score(
    record: dict[str, Any],
) -> tuple[str | None, float | None]:
    """Extract the task name and score from one per-task score row.

    Keys are compared in a canonical form: converted to ``str``, stripped,
    lower-cased, with spaces replaced by underscores. The task is taken from
    the first present key among ``task``, ``task_name``, ``subject`` and
    ``category``; the score from the first present key among ``score``,
    ``overall_score`` and ``accuracy``.

    Returns:
        ``(task_name, score)``. ``task_name`` is ``None`` when no task key is
        present or its value is ``None``; otherwise it is the value converted
        with ``str``. ``score`` is whatever ``normalize_score`` returns for
        the selected value (``None`` is passed when no score key is present).
    """
    canonical: dict[str, Any] = {}
    for raw_key, raw_value in record.items():
        canonical[str(raw_key).strip().lower().replace(" ", "_")] = raw_value

    task_keys = ("task", "task_name", "subject", "category")
    score_keys = ("score", "overall_score", "accuracy")

    # The first matching key wins, even if its value happens to be None.
    raw_task = next((canonical[k] for k in task_keys if k in canonical), None)
    raw_score = next((canonical[k] for k in score_keys if k in canonical), None)

    task_name = str(raw_task) if raw_task is not None else None
    return task_name, normalize_score(raw_score)


def print_summary_table(all_results: dict[str, Any]) -> None:
    """Print a table comparing the overall_score of every model.

    Reads per-model data from ``all_results["results"]`` and the column list
    from ``all_results["metadata"]["benchmarks"]``. Prints a short notice and
    returns when either is missing or empty. Each row is one model (short
    name); a cell shows ``ERROR`` when the benchmark entry has an ``error``
    key, otherwise the formatted ``overall_score``.
    """
    per_model = all_results.get("results", {})
    benchmarks = all_results.get("metadata", {}).get("benchmarks", [])

    if not (per_model and benchmarks):
        print("結果がありません。")
        return

    headers = ["モデル"] + benchmarks

    def _cell(entry: Any) -> str:
        # An entry carrying an "error" key is reported as ERROR, not a score.
        if "error" in entry:
            return "ERROR"
        return format_score(entry.get("overall_score"))

    table_rows: list[list[str]] = [
        [_get_short_name(model)]
        + [_cell(data.get(bench, {})) for bench in benchmarks]
        for model, data in per_model.items()
    ]

    rule = "=" * 70
    print("\n" + rule)
    print(" ベンチマーク結果比較（overall_score）")
    print(rule)
    print(tabulate(table_rows, headers=headers, tablefmt="grid"))
    print()


def print_task_detail_tables(all_results: dict[str, Any]) -> None:
    """Print one detail table of per-task scores for each benchmark.

    For every benchmark listed in ``all_results["metadata"]["benchmarks"]``,
    the ``task_scores`` list of each model is scanned. Benchmarks for which no
    task name could be collected are skipped. In the printed table, rows are
    task names in sorted order and columns are all models in ``results``; a
    missing or ``None`` score is shown as ``-``.
    """
    per_model = all_results.get("results", {})
    benchmarks = all_results.get("metadata", {}).get("benchmarks", [])

    for bench in benchmarks:
        scores_by_model: dict[str, dict[str, float | None]] = {}
        seen_tasks: set[str] = set()

        for model, data in per_model.items():
            entries = data.get(bench, {}).get("task_scores")
            if not isinstance(entries, list):
                continue

            collected = scores_by_model[model] = {}

            # Non-dict rows are ignored, as are rows without a task name.
            for entry in (item for item in entries if isinstance(item, dict)):
                task, value = _extract_task_and_score(entry)
                if task is not None:
                    collected[task] = value
                    seen_tasks.add(task)

        if not seen_tasks:
            continue

        print(f"--- {bench} タスク別スコア ---")

        # Every model gets a column, including those without task scores.
        columns = list(per_model)
        headers = ["タスク"] + [_get_short_name(model) for model in columns]
        table_rows: list[list[str]] = []

        for task in sorted(seen_tasks):
            cells = [scores_by_model.get(model, {}).get(task) for model in columns]
            table_rows.append(
                [task] + ["-" if cell is None else format_score(cell) for cell in cells]
            )

        print(tabulate(table_rows, headers=headers, tablefmt="grid"))
        print()