Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from sumlens.eval.ablation import ablation_table
from sumlens.types import AnalysisConfig

_COLUMNS = ["condition", "precision", "recall", "f1", "ece"]
_COLUMNS = ["condition", "roc_auc", "pr_auc", "precision", "recall", "f1", "ece"]


def _read(path: Path) -> list[dict[str, str]]:
Expand Down
20 changes: 10 additions & 10 deletions scripts/extract_features.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""Run signals A/B(/C) over a RAGTruth split and write a fusion features CSV.
"""Run signals A/B/C over a RAGTruth split and write a fusion features CSV.

For each summary sentence: classifier (A), NLI (B), and optionally attribution (C)
scores + the grounded gold label. Output feeds scripts/train_fusion.py.
For each summary sentence: classifier (A), NLI (B), and support attribution (C =
attr_conc + attr_loo) scores + the grounded gold label. Output feeds the ablation.

Attribution is off by default: RAGTruth summaries were not generated by our local
model, so Inseq attribution is not well-defined for them (see research-plan.md §8).
Enable with --with-attribution only when summaries come from our own summariser.
Signal C here is the generator-agnostic support attribution (signals/support.py),
derived from an NLI matrix, so it is well-defined for RAGTruth even though those
summaries were not generated by our local model (unlike Inseq attribution; see
research-plan.md §8). It therefore always runs.

This runs the REAL models — launch on HPC (si-gpu / sbatch), not in CI.
"""
Expand All @@ -16,9 +17,9 @@

from sumlens.eval.features import FIELDNAMES, feature_rows
from sumlens.eval.ragtruth import load_split
from sumlens.signals.attribution import attribute
from sumlens.signals.classifier import classify
from sumlens.signals.nli import entail, extract_claims
from sumlens.signals.support import support_attribution
from sumlens.types import AnalysisConfig


Expand All @@ -27,7 +28,6 @@ def main() -> None:
parser.add_argument("--data-dir", type=Path, default=Path("data/ragtruth"))
parser.add_argument("--split", default="train")
parser.add_argument("--out", type=Path, default=Path("features.csv"))
parser.add_argument("--with-attribution", action="store_true")
parser.add_argument("--limit", type=int, default=0, help="cap summaries (0 = all)")
args = parser.parse_args()

Expand All @@ -40,9 +40,9 @@ def main() -> None:
for document, summary, hallucinated in examples:
classifier_out = classify(document, summary, cfg)
nli_out = entail(extract_claims(summary), document, cfg)
attribution_out = attribute(document, summary, cfg) if args.with_attribution else {}
support_out = support_attribution(document, summary, cfg)
rows.extend(
feature_rows(summary, hallucinated, classifier_out, nli_out, attribution_out)
feature_rows(summary, hallucinated, classifier_out, nli_out, support_out)
)

with args.out.open("w", encoding="utf-8", newline="") as fh:
Expand Down
4 changes: 3 additions & 1 deletion scripts/jobs/run_eval.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
# after a crash/timeout resumes instead of redoing finished work) ---
echo ">>> extract features (train)"; [ -f features_train.csv ] || python scripts/extract_features.py --split train --data-dir data/ragtruth --out features_train.csv
echo ">>> extract features (test)"; [ -f features_test.csv ] || python scripts/extract_features.py --split test --data-dir data/ragtruth --out features_test.csv
echo ">>> train fusion"; [ -f models/fusion.pkl ] || python scripts/train_fusion.py --features features_train.csv --out-dir models
# train_fusion (live model) is intentionally skipped: this experiment compares
# signals via the ablation, which fits its own per-subset models. Promote a live
# fusion model only after the ablation shows the new attribution signals help.
echo ">>> ablation table"; [ -f ablation.csv ] || python scripts/evaluate.py --train features_train.csv --test features_test.csv --out ablation.csv

echo "=== DONE $(date) ==="
Expand Down
8 changes: 7 additions & 1 deletion scripts/train_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,19 @@
from sumlens.fuse import fit_fusion, fit_platt


def _num(value: str, impute: float = 0.5) -> float:
    """Parse one CSV cell as a float, substituting `impute` for an empty cell.

    Args:
        value: Raw cell text as read from the features CSV.
        impute: Value returned when the cell is the empty string.

    Returns:
        `impute` for an empty cell, otherwise `float(value)`.

    Raises:
        ValueError: If the cell is non-empty but not a valid float literal.
    """
    # An empty cell means the signal produced no score for that run; a neutral
    # stand-in keeps the row usable (the same idea the ablation applies).
    if value == "":
        return impute
    return float(value)


def _read(path: Path) -> tuple[list[list[float]], list[int]]:
features: list[list[float]] = []
grounded: list[int] = []
with path.open(encoding="utf-8") as fh:
for row in csv.DictReader(fh):
features.append(
[float(row["classifier"]), float(row["nli"]), float(row["attribution"])]
[_num(row["classifier"]), _num(row["nli"]), _num(row["attribution"])]
)
grounded.append(int(row["grounded"]))
return features, grounded
Expand Down
29 changes: 19 additions & 10 deletions sumlens/eval/ablation.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
"""Ablation over signal subsets — the report's centrepiece table.

For each non-empty subset of {classifier (A), NLI (B), attribution (C)} we fit a
fusion LogisticRegression on the train rows (using only that subset's columns),
predict on the test rows, and report detection precision/recall/F1 (positive class
= hallucinated) plus the calibration error of the grounding probability.

Rows are mappings with keys: classifier, nli, attribution (float or None/""),
and grounded (1 grounded / 0 hallucinated). Missing signal values are imputed.
For each non-empty subset of {classifier (A), NLI (B), attr_conc (C), attr_loo (D)}
we fit a fusion LogisticRegression on the train rows (using only that subset's
columns), predict on the test rows, and report detection precision/recall/F1
(positive class = hallucinated) plus the calibration error of the grounding
probability. C and D are the two scalars of the generator-agnostic support
attribution (signals/support.py).

Rows are mappings with keys: classifier, nli, attr_conc, attr_loo (float or
None/""), and grounded (1 grounded / 0 hallucinated). Missing values are imputed.
"""

from collections.abc import Mapping, Sequence
from itertools import combinations

from sumlens.eval.metrics import expected_calibration_error
from sumlens.eval.metrics import expected_calibration_error, pr_auc, roc_auc
from sumlens.fuse import fit_fusion

_SIGNALS = ("classifier", "nli", "attribution")
_LETTER = {"classifier": "A", "nli": "B", "attribution": "C"}
_SIGNALS = ("classifier", "nli", "attr_conc", "attr_loo")
_LETTER = {"classifier": "A", "nli": "B", "attr_conc": "C", "attr_loo": "D"}

Row = Mapping[str, object]

Expand Down Expand Up @@ -46,8 +48,15 @@ def _evaluate_combo(
true_hallucinated = [1 - g for g in y_test]
precision, recall, f1 = _prf(true_hallucinated, pred_hallucinated)

# Threshold-free detection metrics (positive class = hallucinated). f1 above
# is a single fixed-0.5 operating point and is misleading under the heavy
# hallucination class imbalance; roc_auc/pr_auc are the headline numbers.
proba_hallucinated = [1.0 - p for p in grounded_proba]

return {
"condition": "+".join(_LETTER[s] for s in combo),
"roc_auc": roc_auc(proba_hallucinated, true_hallucinated),
"pr_auc": pr_auc(proba_hallucinated, true_hallucinated),
"precision": precision,
"recall": recall,
"f1": f1,
Expand Down
35 changes: 24 additions & 11 deletions sumlens/eval/features.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,34 @@
"""Assemble fusion training rows from per-sentence signal outputs + gold labels.

One row per summary sentence: the three signal scores (None if a signal did not
run for that sentence) and the grounded label (1 if grounded, 0 if the RAGTruth
gold marks the sentence hallucinated). This pure function is the testable core of
`scripts/extract_features.py`, which supplies the real signal outputs.
One row per summary sentence: the signal scores (None if a signal did not run for
that sentence) and the grounded label (1 if grounded, 0 if the RAGTruth gold marks
the sentence hallucinated). Signal C is the generator-agnostic support attribution
(`signals/support.py`), which yields two scalars per sentence: attr_conc (support
concentration) and attr_loo (best-supporter necessity margin). This pure function
is the testable core of `scripts/extract_features.py`.
"""

from collections.abc import Mapping

from sumlens.types import Claim, Summary

FIELDNAMES = ["summary_id", "sentence_id", "classifier", "nli", "attribution", "grounded"]
FIELDNAMES = [
"summary_id",
"sentence_id",
"classifier",
"nli",
"attr_conc",
"attr_loo",
"grounded",
]


def feature_rows(
summary: Summary,
hallucinated_ids: list[str],
classifier_out: dict[str, tuple[float, list[tuple[int, int]]]],
nli_out: dict[str, tuple[float, list[Claim]]],
attribution_out: dict[str, tuple[float, list[str]]],
support_out: dict[str, tuple[float, float, list[str]]],
) -> list[dict[str, object]]:
hallucinated = set(hallucinated_ids)
rows: list[dict[str, object]] = []
Expand All @@ -27,15 +37,18 @@ def feature_rows(
{
"summary_id": summary.id,
"sentence_id": sentence.id,
"classifier": _score(classifier_out, sentence.id),
"nli": _score(nli_out, sentence.id),
"attribution": _score(attribution_out, sentence.id),
"classifier": _at(classifier_out, sentence.id, 0),
"nli": _at(nli_out, sentence.id, 0),
"attr_conc": _at(support_out, sentence.id, 0),
"attr_loo": _at(support_out, sentence.id, 1),
"grounded": 0 if sentence.id in hallucinated else 1,
}
)
return rows


def _score(signal_out: Mapping[str, tuple[float, object]], sentence_id: str) -> float | None:
def _at(
signal_out: Mapping[str, tuple[object, ...]], sentence_id: str, index: int
) -> float | None:
entry = signal_out.get(sentence_id)
return entry[0] if entry is not None else None
return float(entry[index]) if entry is not None else None # type: ignore[arg-type]
45 changes: 45 additions & 0 deletions sumlens/eval/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,51 @@ def sentence_f1(preds: dict[str, set[str]], golds: dict[str, set[str]]) -> dict[
return {"precision": precision, "recall": recall, "f1": f1}


def roc_auc(scores: list[float], labels: list[int]) -> float:
"""Threshold-free ROC-AUC (rank-based, ties averaged). `scores` rank the
positive class (label 1). Returns 0.0 if either class is absent."""
n_pos = sum(labels)
n_neg = len(labels) - n_pos
if not n_pos or not n_neg:
return 0.0
order = sorted(zip(scores, labels, strict=True), key=lambda p: p[0])
ranks = [0.0] * len(order)
i = 0
while i < len(order):
j = i
while j < len(order) and order[j][0] == order[i][0]:
j += 1
rank = (i + j - 1) / 2 + 1 # 1-based average rank for the tie group
for k in range(i, j):
ranks[k] = rank
i = j
rank_sum_pos = sum(r for r, (_, label) in zip(ranks, order, strict=True) if label == 1)
return (rank_sum_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)


def pr_auc(scores: list[float], labels: list[int]) -> float:
"""Average precision (area under precision-recall curve). `scores` rank the
positive class (label 1). Returns 0.0 if no positives. Better than ROC-AUC
under heavy class imbalance; floor is the positive base rate."""
n_pos = sum(labels)
if not n_pos:
return 0.0
order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
tp = fp = 0
ap = 0.0
prev_recall = 0.0
for i in order:
if labels[i] == 1:
tp += 1
else:
fp += 1
recall = tp / n_pos
precision = tp / (tp + fp)
ap += (recall - prev_recall) * precision
prev_recall = recall
return ap


def expected_calibration_error(
scores: list[float], labels: list[int], n_bins: int = 10
) -> float:
Expand Down
51 changes: 51 additions & 0 deletions sumlens/signals/support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Signal C (redesign) — generator-agnostic source attribution from an NLI matrix.

Inseq attribution (`attribution.py`) is gradient-based and needs the *generating*
model, so it is undefined for RAGTruth (external-model summaries). This signal
derives attribution from entailment alone, so it is defined for any (source,
summary) pair. For each summary sentence ``s`` we score entailment against every
source sentence ``j``, ``M[s][j] = P(src_j entails s)``, then collapse the row:

- ``attr_conc(s) = max_j M - mean_j M`` — support concentration. A grounded
sentence has one sharp supporter; a fabricated one has diffuse, flat-low support.
- ``attr_loo(s) = top1 - top2`` — necessity margin of the single best supporter.
- top-k source sentence ids — the UI heatmap (generator-free, no token offsets).

Reuses signal B's NLI model and batched call. Pure given the NLI boundary, which
tests mock via `_get_nli`. Consumed by `scripts/extract_features.py`.
"""

from sumlens.signals.nli import _entail_prob, _get_nli
from sumlens.types import AnalysisConfig, Document, Summary

_BATCH_SIZE = 64
_TOP_K = 5


def support_attribution(
    document: Document, summary: Summary, cfg: AnalysisConfig
) -> dict[str, tuple[float, float, list[str]]]:
    """Score every summary sentence against every source sentence via NLI.

    Returns a mapping from summary sentence id to
    ``(attr_conc, attr_loo, top_source_ids)`` where

    - ``attr_conc`` is the best entailment probability minus the row mean,
    - ``attr_loo`` is the best minus the second-best probability (the best
      itself when there is only one source sentence),
    - ``top_source_ids`` are the ids of the ``_TOP_K`` best-scoring source
      sentences, best first.

    If the document or the summary has no sentences, every summary sentence
    maps to ``(0.0, 0.0, [])`` and the NLI model is not loaded.
    """
    source_sents = document.sentences
    summary_sents = summary.sentences
    if not (summary_sents and source_sents):
        return {sent.id: (0.0, 0.0, []) for sent in summary_sents}

    model = _get_nli(cfg.nli_model)

    # Sentence-major layout: all source pairs for summary sentence 0, then all
    # for sentence 1, ... so each sentence owns one contiguous slice below.
    premise_hypothesis = []
    for sent in summary_sents:
        for src in source_sents:
            premise_hypothesis.append({"text": src.text, "text_pair": sent.text})
    raw = model(premise_hypothesis, top_k=None, batch_size=_BATCH_SIZE)

    width = len(source_sents)
    attributions: dict[str, tuple[float, float, list[str]]] = {}
    for idx, sent in enumerate(summary_sents):
        begin = idx * width
        probs = [_entail_prob(item) for item in raw[begin : begin + width]]
        ranking = sorted(range(width), key=lambda j: probs[j], reverse=True)
        best = probs[ranking[0]]
        runner_up = probs[ranking[1]] if width > 1 else 0.0
        attributions[sent.id] = (
            best - sum(probs) / width,
            best - runner_up,
            [source_sents[j].id for j in ranking[:_TOP_K]],
        )
    return attributions
21 changes: 14 additions & 7 deletions tests/test_ablation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,35 @@

from sumlens.eval.ablation import ablation_table

_GROUNDED = {"classifier": 0.9, "nli": 0.8, "attribution": 0.7, "grounded": 1}
_HALLUCINATED = {"classifier": 0.1, "nli": 0.2, "attribution": 0.3, "grounded": 0}
_GROUNDED = {"classifier": 0.9, "nli": 0.8, "attr_conc": 0.7, "attr_loo": 0.6, "grounded": 1}
_HALLUCINATED = {"classifier": 0.1, "nli": 0.2, "attr_conc": 0.3, "attr_loo": 0.2, "grounded": 0}
_ROWS = [_GROUNDED, _HALLUCINATED] * 10


def test_ablation_table_conditions_and_scores() -> None:
table = ablation_table(_ROWS, _ROWS)

conditions = {row["condition"] for row in table}
assert conditions == {"A", "B", "C", "A+B", "A+C", "B+C", "A+B+C"}
assert conditions == {
"A", "B", "C", "D",
"A+B", "A+C", "A+D", "B+C", "B+D", "C+D",
"A+B+C", "A+B+D", "A+C+D", "B+C+D",
"A+B+C+D",
}

for row in table:
for key in ("precision", "recall", "f1", "ece"):
for key in ("roc_auc", "pr_auc", "precision", "recall", "f1", "ece"):
assert isinstance(row[key], float)

fused = next(row for row in table if row["condition"] == "A+B+C")
fused = next(row for row in table if row["condition"] == "A+B+C+D")
assert fused["f1"] == 1.0 # perfectly separable -> perfect detection
assert fused["roc_auc"] == 1.0
assert fused["pr_auc"] == 1.0


def test_ablation_imputes_missing_attribution() -> None:
# attribution missing ("") on every row -> still runs via imputation
rows = [{**r, "attribution": ""} for r in _ROWS]
# attr_conc missing ("") on every row -> still runs via imputation
rows = [{**r, "attr_conc": ""} for r in _ROWS]
table = ablation_table(rows, rows)
c_only = next(row for row in table if row["condition"] == "C")
assert isinstance(c_only["f1"], float)
11 changes: 7 additions & 4 deletions tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,28 @@ def test_feature_rows_labels_and_missing_signals() -> None:
classifier_out = {"sum-0000": (0.1, []), "sum-0001": (0.9, [(0, 4)])}
failed = Claim(id="c", sentence_id="sum-0001", text="x")
nli_out = {"sum-0000": (0.8, []), "sum-0001": (0.2, [failed])}
attribution_out = {"sum-0001": (0.3, ["src-0000"])} # only the gated sentence has C
# support attribution: (attr_conc, attr_loo, top_source_ids); sum-0000 absent
support_out = {"sum-0001": (0.3, 0.15, ["src-0000"])}

rows = feature_rows(_summary(), ["sum-0001"], classifier_out, nli_out, attribution_out)
rows = feature_rows(_summary(), ["sum-0001"], classifier_out, nli_out, support_out)

assert rows == [
{
"summary_id": "r1",
"sentence_id": "sum-0000",
"classifier": 0.1,
"nli": 0.8,
"attribution": None, # C did not run for this sentence
"attr_conc": None, # C did not run for this sentence
"attr_loo": None,
"grounded": 1,
},
{
"summary_id": "r1",
"sentence_id": "sum-0001",
"classifier": 0.9,
"nli": 0.2,
"attribution": 0.3,
"attr_conc": 0.3,
"attr_loo": 0.15,
"grounded": 0, # marked hallucinated in gold
},
]
Loading
Loading