diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 5489990..293c47c 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -15,7 +15,7 @@ from sumlens.eval.ablation import ablation_table from sumlens.types import AnalysisConfig -_COLUMNS = ["condition", "precision", "recall", "f1", "ece"] +_COLUMNS = ["condition", "roc_auc", "pr_auc", "precision", "recall", "f1", "ece"] def _read(path: Path) -> list[dict[str, str]]: diff --git a/scripts/extract_features.py b/scripts/extract_features.py index e9a9c02..0e1503b 100644 --- a/scripts/extract_features.py +++ b/scripts/extract_features.py @@ -1,11 +1,12 @@ -"""Run signals A/B(/C) over a RAGTruth split and write a fusion features CSV. +"""Run signals A/B/C over a RAGTruth split and write a fusion features CSV. -For each summary sentence: classifier (A), NLI (B), and optionally attribution (C) -scores + the grounded gold label. Output feeds scripts/train_fusion.py. +For each summary sentence: classifier (A), NLI (B), and support attribution (C = +attr_conc + attr_loo) scores + the grounded gold label. Output feeds the ablation. -Attribution is off by default: RAGTruth summaries were not generated by our local -model, so Inseq attribution is not well-defined for them (see research-plan.md §8). -Enable with --with-attribution only when summaries come from our own summariser. +Signal C here is the generator-agnostic support attribution (signals/support.py), +derived from an NLI matrix, so it is well-defined for RAGTruth even though those +summaries were not generated by our local model (unlike Inseq attribution; see +research-plan.md §8). It therefore always runs. This runs the REAL models — launch on HPC (si-gpu / sbatch), not in CI. 
""" @@ -16,9 +17,9 @@ from sumlens.eval.features import FIELDNAMES, feature_rows from sumlens.eval.ragtruth import load_split -from sumlens.signals.attribution import attribute from sumlens.signals.classifier import classify from sumlens.signals.nli import entail, extract_claims +from sumlens.signals.support import support_attribution from sumlens.types import AnalysisConfig @@ -27,7 +28,6 @@ def main() -> None: parser.add_argument("--data-dir", type=Path, default=Path("data/ragtruth")) parser.add_argument("--split", default="train") parser.add_argument("--out", type=Path, default=Path("features.csv")) - parser.add_argument("--with-attribution", action="store_true") parser.add_argument("--limit", type=int, default=0, help="cap summaries (0 = all)") args = parser.parse_args() @@ -40,9 +40,9 @@ def main() -> None: for document, summary, hallucinated in examples: classifier_out = classify(document, summary, cfg) nli_out = entail(extract_claims(summary), document, cfg) - attribution_out = attribute(document, summary, cfg) if args.with_attribution else {} + support_out = support_attribution(document, summary, cfg) rows.extend( - feature_rows(summary, hallucinated, classifier_out, nli_out, attribution_out) + feature_rows(summary, hallucinated, classifier_out, nli_out, support_out) ) with args.out.open("w", encoding="utf-8", newline="") as fh: diff --git a/scripts/jobs/run_eval.sbatch b/scripts/jobs/run_eval.sbatch index efdbfc8..f68a47c 100755 --- a/scripts/jobs/run_eval.sbatch +++ b/scripts/jobs/run_eval.sbatch @@ -45,7 +45,9 @@ python -c "import torch; print('CUDA available:', torch.cuda.is_available())" # after a crash/timeout resumes instead of redoing finished work) --- echo ">>> extract features (train)"; [ -f features_train.csv ] || python scripts/extract_features.py --split train --data-dir data/ragtruth --out features_train.csv echo ">>> extract features (test)"; [ -f features_test.csv ] || python scripts/extract_features.py --split test --data-dir data/ragtruth 
--out features_test.csv -echo ">>> train fusion"; [ -f models/fusion.pkl ] || python scripts/train_fusion.py --features features_train.csv --out-dir models +# train_fusion (live model) is intentionally skipped: this experiment compares +# signals via the ablation, which fits its own per-subset models. Promote a live +# fusion model only after the ablation shows the new attribution signals help. echo ">>> ablation table"; [ -f ablation.csv ] || python scripts/evaluate.py --train features_train.csv --test features_test.csv --out ablation.csv echo "=== DONE $(date) ===" diff --git a/scripts/train_fusion.py b/scripts/train_fusion.py index 90eb67d..e3d693b 100644 --- a/scripts/train_fusion.py +++ b/scripts/train_fusion.py @@ -14,13 +14,19 @@ from sumlens.fuse import fit_fusion, fit_platt +def _num(value: str, impute: float = 0.5) -> float: + # A signal column is empty when that signal was off for the run (e.g. + # attribution is off for RAGTruth). Impute neutral, matching the ablation. + return impute if value == "" else float(value) + + def _read(path: Path) -> tuple[list[list[float]], list[int]]: features: list[list[float]] = [] grounded: list[int] = [] with path.open(encoding="utf-8") as fh: for row in csv.DictReader(fh): features.append( - [float(row["classifier"]), float(row["nli"]), float(row["attribution"])] + [_num(row["classifier"]), _num(row["nli"]), _num(row["attribution"])] ) grounded.append(int(row["grounded"])) return features, grounded diff --git a/sumlens/eval/ablation.py b/sumlens/eval/ablation.py index 25773a9..7daeb2e 100644 --- a/sumlens/eval/ablation.py +++ b/sumlens/eval/ablation.py @@ -1,22 +1,24 @@ """Ablation over signal subsets — the report's centrepiece table. 
-For each non-empty subset of {classifier (A), NLI (B), attribution (C)} we fit a -fusion LogisticRegression on the train rows (using only that subset's columns), -predict on the test rows, and report detection precision/recall/F1 (positive class -= hallucinated) plus the calibration error of the grounding probability. - -Rows are mappings with keys: classifier, nli, attribution (float or None/""), -and grounded (1 grounded / 0 hallucinated). Missing signal values are imputed. +For each non-empty subset of {classifier (A), NLI (B), attr_conc (C), attr_loo (D)} +we fit a fusion LogisticRegression on the train rows (using only that subset's +columns), predict on the test rows, and report threshold-free ROC-AUC/PR-AUC, +detection precision/recall/F1 at the fixed 0.5 threshold (positive class = +hallucinated), and the calibration error of the grounding probability. C and D are +the two scalars of the generator-agnostic support attribution (signals/support.py). + +Rows are mappings with keys: classifier, nli, attr_conc, attr_loo (float or +None/""), and grounded (1 grounded / 0 hallucinated). Missing values are imputed. """ from collections.abc import Mapping, Sequence from itertools import combinations -from sumlens.eval.metrics import expected_calibration_error +from sumlens.eval.metrics import expected_calibration_error, pr_auc, roc_auc from sumlens.fuse import fit_fusion -_SIGNALS = ("classifier", "nli", "attribution") -_LETTER = {"classifier": "A", "nli": "B", "attribution": "C"} +_SIGNALS = ("classifier", "nli", "attr_conc", "attr_loo") +_LETTER = {"classifier": "A", "nli": "B", "attr_conc": "C", "attr_loo": "D"} Row = Mapping[str, object] @@ -46,8 +48,15 @@ def _evaluate_combo( true_hallucinated = [1 - g for g in y_test] precision, recall, f1 = _prf(true_hallucinated, pred_hallucinated) + # Threshold-free detection metrics (positive class = hallucinated). f1 above + # is a single fixed-0.5 operating point and is misleading under the heavy + # hallucination class imbalance; roc_auc/pr_auc are the headline numbers. 
+ proba_hallucinated = [1.0 - p for p in grounded_proba] + return { "condition": "+".join(_LETTER[s] for s in combo), + "roc_auc": roc_auc(proba_hallucinated, true_hallucinated), + "pr_auc": pr_auc(proba_hallucinated, true_hallucinated), "precision": precision, "recall": recall, "f1": f1, diff --git a/sumlens/eval/features.py b/sumlens/eval/features.py index a9ca58d..3d56b75 100644 --- a/sumlens/eval/features.py +++ b/sumlens/eval/features.py @@ -1,16 +1,26 @@ """Assemble fusion training rows from per-sentence signal outputs + gold labels. -One row per summary sentence: the three signal scores (None if a signal did not -run for that sentence) and the grounded label (1 if grounded, 0 if the RAGTruth -gold marks the sentence hallucinated). This pure function is the testable core of -`scripts/extract_features.py`, which supplies the real signal outputs. +One row per summary sentence: the signal scores (None if a signal did not run for +that sentence) and the grounded label (1 if grounded, 0 if the RAGTruth gold marks +the sentence hallucinated). Signal C is the generator-agnostic support attribution +(`signals/support.py`), which yields two scalars per sentence: attr_conc (support +concentration) and attr_loo (best-supporter necessity margin). This pure function +is the testable core of `scripts/extract_features.py`. 
""" from collections.abc import Mapping from sumlens.types import Claim, Summary -FIELDNAMES = ["summary_id", "sentence_id", "classifier", "nli", "attribution", "grounded"] +FIELDNAMES = [ + "summary_id", + "sentence_id", + "classifier", + "nli", + "attr_conc", + "attr_loo", + "grounded", +] def feature_rows( @@ -18,7 +28,7 @@ def feature_rows( hallucinated_ids: list[str], classifier_out: dict[str, tuple[float, list[tuple[int, int]]]], nli_out: dict[str, tuple[float, list[Claim]]], - attribution_out: dict[str, tuple[float, list[str]]], + support_out: dict[str, tuple[float, float, list[str]]], ) -> list[dict[str, object]]: hallucinated = set(hallucinated_ids) rows: list[dict[str, object]] = [] @@ -27,15 +37,18 @@ def feature_rows( { "summary_id": summary.id, "sentence_id": sentence.id, - "classifier": _score(classifier_out, sentence.id), - "nli": _score(nli_out, sentence.id), - "attribution": _score(attribution_out, sentence.id), + "classifier": _at(classifier_out, sentence.id, 0), + "nli": _at(nli_out, sentence.id, 0), + "attr_conc": _at(support_out, sentence.id, 0), + "attr_loo": _at(support_out, sentence.id, 1), "grounded": 0 if sentence.id in hallucinated else 1, } ) return rows -def _score(signal_out: Mapping[str, tuple[float, object]], sentence_id: str) -> float | None: +def _at( + signal_out: Mapping[str, tuple[object, ...]], sentence_id: str, index: int +) -> float | None: entry = signal_out.get(sentence_id) - return entry[0] if entry is not None else None + return float(entry[index]) if entry is not None else None # type: ignore[arg-type] diff --git a/sumlens/eval/metrics.py b/sumlens/eval/metrics.py index 6a35054..a121fcb 100644 --- a/sumlens/eval/metrics.py +++ b/sumlens/eval/metrics.py @@ -22,6 +22,51 @@ def sentence_f1(preds: dict[str, set[str]], golds: dict[str, set[str]]) -> dict[ return {"precision": precision, "recall": recall, "f1": f1} +def roc_auc(scores: list[float], labels: list[int]) -> float: + """Threshold-free ROC-AUC (rank-based, ties 
averaged). `scores` rank the + positive class (label 1). Returns 0.0 if either class is absent.""" + n_pos = sum(labels) + n_neg = len(labels) - n_pos + if not n_pos or not n_neg: + return 0.0 + order = sorted(zip(scores, labels, strict=True), key=lambda p: p[0]) + ranks = [0.0] * len(order) + i = 0 + while i < len(order): + j = i + while j < len(order) and order[j][0] == order[i][0]: + j += 1 + rank = (i + j - 1) / 2 + 1 # 1-based average rank for the tie group + for k in range(i, j): + ranks[k] = rank + i = j + rank_sum_pos = sum(r for r, (_, label) in zip(ranks, order, strict=True) if label == 1) + return (rank_sum_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg) + + +def pr_auc(scores: list[float], labels: list[int]) -> float: + """Average precision (area under precision-recall curve). `scores` rank the + positive class (label 1). Returns 0.0 if no positives. Better than ROC-AUC + under heavy class imbalance; floor is the positive base rate.""" + n_pos = sum(labels) + if not n_pos: + return 0.0 + order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True) + tp = fp = 0 + ap = 0.0 + prev_recall = 0.0 + for i in order: + if labels[i] == 1: + tp += 1 + else: + fp += 1 + recall = tp / n_pos + precision = tp / (tp + fp) + ap += (recall - prev_recall) * precision + prev_recall = recall + return ap + + def expected_calibration_error( scores: list[float], labels: list[int], n_bins: int = 10 ) -> float: diff --git a/sumlens/signals/support.py b/sumlens/signals/support.py new file mode 100644 index 0000000..1401dc4 --- /dev/null +++ b/sumlens/signals/support.py @@ -0,0 +1,51 @@ +"""Signal C (redesign) — generator-agnostic source attribution from an NLI matrix. + +Inseq attribution (`attribution.py`) is gradient-based and needs the *generating* +model, so it is undefined for RAGTruth (external-model summaries). This signal +derives attribution from entailment alone, so it is defined for any (source, +summary) pair. 
def support_attribution(
    document: Document, summary: Summary, cfg: AnalysisConfig
) -> dict[str, tuple[float, float, list[str]]]:
    """Score how sharply each summary sentence is supported by the source.

    Every (source sentence, summary sentence) pair is sent to the NLI model in
    one batched call, with the source sentence as ``text`` and the summary
    sentence as ``text_pair``. Each summary sentence's row of scores is then
    collapsed to:

    - ``attr_conc``: highest score minus the row mean;
    - ``attr_loo``: highest score minus the second highest (0.0 stands in for
      the runner-up when the document has a single sentence);
    - the ids of up to ``_TOP_K`` source sentences, strongest first.

    Returns a mapping from summary sentence id to that triple. When either the
    summary or the document has no sentences, every summary sentence maps to
    ``(0.0, 0.0, [])`` and the NLI model is never loaded.
    """
    source_sents = document.sentences
    summary_sents = summary.sentences
    if not (summary_sents and source_sents):
        return {sent.id: (0.0, 0.0, []) for sent in summary_sents}

    model = _get_nli(cfg.nli_model)
    width = len(source_sents)

    # Row-major layout: all sources for sentence 0, then all for sentence 1, ...
    pairs = []
    for sent in summary_sents:
        for src in source_sents:
            pairs.append({"text": src.text, "text_pair": sent.text})
    outputs = model(pairs, top_k=None, batch_size=_BATCH_SIZE)

    attribution: dict[str, tuple[float, float, list[str]]] = {}
    for offset, sent in zip(range(0, len(pairs), width), summary_sents):
        # NOTE(review): _entail_prob presumably picks the entailment score
        # out of one pair's label scores; confirm against signals/nli.py.
        probs = [_entail_prob(item) for item in outputs[offset : offset + width]]
        # Stable descending sort: tied sources keep document order.
        ranked = sorted(enumerate(probs), key=lambda pair: pair[1], reverse=True)
        best = ranked[0][1]
        runner_up = ranked[1][1] if width > 1 else 0.0
        concentration = best - sum(probs) / width
        margin = best - runner_up
        strongest = [source_sents[j].id for j, _ in ranked[:_TOP_K]]
        attribution[sent.id] = (concentration, margin, strongest)
    return attribution
from sumlens.eval.ablation import ablation_table -_GROUNDED = {"classifier": 0.9, "nli": 0.8, "attribution": 0.7, "grounded": 1} -_HALLUCINATED = {"classifier": 0.1, "nli": 0.2, "attribution": 0.3, "grounded": 0} +_GROUNDED = {"classifier": 0.9, "nli": 0.8, "attr_conc": 0.7, "attr_loo": 0.6, "grounded": 1} +_HALLUCINATED = {"classifier": 0.1, "nli": 0.2, "attr_conc": 0.3, "attr_loo": 0.2, "grounded": 0} _ROWS = [_GROUNDED, _HALLUCINATED] * 10 @@ -11,19 +11,26 @@ def test_ablation_table_conditions_and_scores() -> None: table = ablation_table(_ROWS, _ROWS) conditions = {row["condition"] for row in table} - assert conditions == {"A", "B", "C", "A+B", "A+C", "B+C", "A+B+C"} + assert conditions == { + "A", "B", "C", "D", + "A+B", "A+C", "A+D", "B+C", "B+D", "C+D", + "A+B+C", "A+B+D", "A+C+D", "B+C+D", + "A+B+C+D", + } for row in table: - for key in ("precision", "recall", "f1", "ece"): + for key in ("roc_auc", "pr_auc", "precision", "recall", "f1", "ece"): assert isinstance(row[key], float) - fused = next(row for row in table if row["condition"] == "A+B+C") + fused = next(row for row in table if row["condition"] == "A+B+C+D") assert fused["f1"] == 1.0 # perfectly separable -> perfect detection + assert fused["roc_auc"] == 1.0 + assert fused["pr_auc"] == 1.0 def test_ablation_imputes_missing_attribution() -> None: - # attribution missing ("") on every row -> still runs via imputation - rows = [{**r, "attribution": ""} for r in _ROWS] + # attr_conc missing ("") on every row -> still runs via imputation + rows = [{**r, "attr_conc": ""} for r in _ROWS] table = ablation_table(rows, rows) c_only = next(row for row in table if row["condition"] == "C") assert isinstance(c_only["f1"], float) diff --git a/tests/test_features.py b/tests/test_features.py index 41c5950..62c6d38 100644 --- a/tests/test_features.py +++ b/tests/test_features.py @@ -21,9 +21,10 @@ def test_feature_rows_labels_and_missing_signals() -> None: classifier_out = {"sum-0000": (0.1, []), "sum-0001": (0.9, [(0, 
4)])} failed = Claim(id="c", sentence_id="sum-0001", text="x") nli_out = {"sum-0000": (0.8, []), "sum-0001": (0.2, [failed])} - attribution_out = {"sum-0001": (0.3, ["src-0000"])} # only the gated sentence has C + # support attribution: (attr_conc, attr_loo, top_source_ids); sum-0000 absent + support_out = {"sum-0001": (0.3, 0.15, ["src-0000"])} - rows = feature_rows(_summary(), ["sum-0001"], classifier_out, nli_out, attribution_out) + rows = feature_rows(_summary(), ["sum-0001"], classifier_out, nli_out, support_out) assert rows == [ { @@ -31,7 +32,8 @@ def test_feature_rows_labels_and_missing_signals() -> None: "sentence_id": "sum-0000", "classifier": 0.1, "nli": 0.8, - "attribution": None, # C did not run for this sentence + "attr_conc": None, # C did not run for this sentence + "attr_loo": None, "grounded": 1, }, { @@ -39,7 +41,8 @@ def test_feature_rows_labels_and_missing_signals() -> None: "sentence_id": "sum-0001", "classifier": 0.9, "nli": 0.2, - "attribution": 0.3, + "attr_conc": 0.3, + "attr_loo": 0.15, "grounded": 0, # marked hallucinated in gold }, ] diff --git a/tests/test_metrics.py b/tests/test_metrics.py index c030132..9a9d129 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -6,7 +6,9 @@ from sumlens.eval.metrics import ( expected_calibration_error, + pr_auc, reliability_diagram, + roc_auc, sentence_f1, ) @@ -41,6 +43,38 @@ def test_ece_empty() -> None: assert expected_calibration_error([], []) == 0.0 +def test_roc_auc_perfect_separation() -> None: + assert roc_auc([0.1, 0.2, 0.8, 0.9], [0, 0, 1, 1]) == 1.0 + + +def test_roc_auc_inverted_is_zero() -> None: + assert roc_auc([0.9, 0.8, 0.2, 0.1], [0, 0, 1, 1]) == 0.0 + + +def test_roc_auc_ties_give_half() -> None: + # all scores equal -> every pair tied -> AUC 0.5 + assert roc_auc([0.5, 0.5, 0.5, 0.5], [0, 1, 0, 1]) == 0.5 + + +def test_roc_auc_single_class_returns_zero() -> None: + assert roc_auc([0.1, 0.9], [1, 1]) == 0.0 + + +def test_pr_auc_perfect_separation() -> None: + assert 
pr_auc([0.1, 0.2, 0.8, 0.9], [0, 0, 1, 1]) == 1.0 + + +def test_pr_auc_floor_is_base_rate() -> None: + # best ranking: the only positive is first -> precision 1 at recall 1 -> AP 1.0 + assert pr_auc([0.4, 0.3, 0.2, 0.1], [1, 0, 0, 0]) == pytest.approx(1.0) + # worst ranking: the only positive is last -> precision 1/4 at recall 1 + assert pr_auc([0.4, 0.3, 0.2, 0.1], [0, 0, 0, 1]) == pytest.approx(0.25) + + +def test_pr_auc_no_positives_returns_zero() -> None: + assert pr_auc([0.1, 0.9], [0, 0]) == 0.0 + + def test_reliability_diagram_writes_file(tmp_path: Path) -> None: out = tmp_path / "reliability.png" reliability_diagram([0.1, 0.4, 0.9, 0.95], [0, 0, 1, 1], out) diff --git a/tests/test_support.py b/tests/test_support.py new file mode 100644 index 0000000..d8ed9c0 --- /dev/null +++ b/tests/test_support.py @@ -0,0 +1,69 @@ +"""Support attribution (signal C) tests — NLI mocked at the `_get_nli` boundary.""" + +import pytest + +from sumlens.signals import support as support_mod +from sumlens.signals.support import support_attribution +from sumlens.types import AnalysisConfig, Document, Sentence, Summary + +# Entailment lookup: (premise source sentence, hypothesis summary sentence) -> prob. +_TABLE = { + ("Src A.", "Claim one."): 0.9, + ("Src B.", "Claim one."): 0.2, + ("Src C.", "Claim one."): 0.1, +} + + +class _FakeNLI: + def __call__( + self, pairs: list[dict[str, str]], top_k: object = None, batch_size: object = None + ) -> list[list[dict[str, object]]]: + return [ + [ + {"label": "entailment", "score": _TABLE[(p["text"], p["text_pair"])]}, + {"label": "contradiction", "score": 0.0}, + ] + for p in pairs + ] + + +def _document() -> Document: + return Document( + id="doc-1", + raw_text="Src A. Src B. 
Src C.", + sentences=[ + Sentence(id="src-0000", text="Src A.", char_start=0, char_end=6), + Sentence(id="src-0001", text="Src B.", char_start=7, char_end=13), + Sentence(id="src-0002", text="Src C.", char_start=14, char_end=20), + ], + source="text", + ) + + +def _summary() -> Summary: + return Summary( + id="doc-1-summary", + document_id="doc-1", + text="Claim one.", + sentences=[Sentence(id="sum-0000", text="Claim one.", char_start=0, char_end=10)], + model_name="m", + ) + + +def test_support_concentration_and_loo(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(support_mod, "_get_nli", lambda model_name: _FakeNLI()) + + result = support_attribution(_document(), _summary(), AnalysisConfig()) + + conc, loo, top_ids = result["sum-0000"] + # row = [0.9, 0.2, 0.1]: top1=0.9, top2=0.2, mean=0.4 + assert conc == pytest.approx(0.9 - 0.4) # peak minus mean + assert loo == pytest.approx(0.9 - 0.2) # best-supporter margin + assert top_ids[0] == "src-0000" # strongest supporting source first + + +def test_support_empty_source(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(support_mod, "_get_nli", lambda model_name: _FakeNLI()) + empty_doc = Document(id="d", raw_text="", sentences=[], source="text") + result = support_attribution(empty_doc, _summary(), AnalysisConfig()) + assert result == {"sum-0000": (0.0, 0.0, [])}