Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 48 additions & 1 deletion python/server_routes/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,47 @@
from compare_bundles import build_canonical_lexemes_report_tsv, build_compare_bundles
from concept_identity import identity_payload, load_concept_identity, write_concept_identity_override


def find_cognate_partition_violations(cognate_sets: _server.Any) -> set[tuple[str, str]]:
"""Return (concept, speaker) pairs assigned to multiple manual cognate groups."""
if not isinstance(cognate_sets, dict):
return set()
violations: set[tuple[str, str]] = set()
for concept_key, groups in cognate_sets.items():
if not isinstance(groups, dict):
continue
concept = str(concept_key).strip()
first_group_by_speaker: dict[str, str] = {}
for group_key, speakers in groups.items():
if not isinstance(speakers, list):
continue
group = str(group_key).strip()
seen_in_group: set[str] = set()
for raw_speaker in speakers:
speaker = str(raw_speaker).strip()
if not speaker or speaker in seen_in_group:
continue
seen_in_group.add(speaker)
previous_group = first_group_by_speaker.setdefault(speaker, group)
if previous_group != group:
violations.add((concept, speaker))
return violations


def _format_cognate_partition_error(violations: set[tuple[str, str]]) -> str:
details = ', '.join('{0}: {1}'.format(concept, speaker) for concept, speaker in sorted(violations))
return 'Invalid cognate partition: manual_overrides.cognate_sets lists speaker(s) in multiple groups for the same concept: {0}'.format(details)


def _manual_cognate_sets_from_payload(payload: _server.Any) -> _server.Any:
if not isinstance(payload, dict):
return None
manual_overrides = payload.get('manual_overrides')
if not isinstance(manual_overrides, dict):
return None
return manual_overrides.get('cognate_sets')


def _compute_cognates(job_id: str, payload: _server.Dict[str, _server.Any]) -> _server.Dict[str, _server.Any]:
if _server.cognate_compute_module is None:
raise RuntimeError('compare.cognate_compute is unavailable')
Expand Down Expand Up @@ -63,6 +104,9 @@ def _compute_cognates(job_id: str, payload: _server.Dict[str, _server.Any]) -> _
enrichments_payload = _server._read_json_file(output_path, _server._default_enrichments_payload())
enrichments_payload.update({'computed_at': _server._utc_now_iso(), 'config': {'contact_languages': list(contact_languages), 'speakers_included': speakers_included, 'concepts_included': [_server._concept_out_value(concept_id) for concept_id in selected_concept_ids], 'lexstat_threshold': round(float(threshold), 3)}, 'cognate_sets': cognate_sets, 'similarity': similarity})
enrichments_payload.setdefault('manual_overrides', {})
violations = find_cognate_partition_violations(_manual_cognate_sets_from_payload(enrichments_payload))
if violations:
raise RuntimeError(_format_cognate_partition_error(violations))
_server._set_job_progress(job_id, 92.0, message='Writing parse-enrichments.json')
_server._write_json_file(output_path, enrichments_payload)
return {'type': 'cognates', 'outputPath': str(output_path), 'computedAt': enrichments_payload['computed_at'], 'conceptCount': len(enrichments_payload['config']['concepts_included']), 'speakerCount': len(enrichments_payload['config']['speakers_included']), 'skippedFormCount': len(skipped_forms), 'skippedForms': skipped_forms[:50]}
Expand Down Expand Up @@ -103,6 +147,9 @@ def _api_post_enrichments(self) -> None:
incoming_manual = enrichments_payload.get('manual_overrides')
if isinstance(existing_manual, dict) and existing_manual and (not isinstance(incoming_manual, dict) or not incoming_manual):
merged_payload['manual_overrides'] = _server.copy.deepcopy(existing_manual)
violations = find_cognate_partition_violations(_manual_cognate_sets_from_payload(merged_payload))
if violations:
raise _server.ApiError(_server.HTTPStatus.BAD_REQUEST, _format_cognate_partition_error(violations))
_server._write_json_file(output_path, merged_payload)
self._send_json(_server.HTTPStatus.OK, {'success': True})

Expand Down Expand Up @@ -281,5 +328,5 @@ def _api_get_canonical_lexemes_report(self) -> None:
self.wfile.write(body)


__all__ = ['_compute_cognates', '_api_get_enrichments', '_api_post_enrichments', '_api_post_lexeme_note', '_api_post_lexeme_notes_import', '_api_get_lexeme_search', '_api_get_tags', '_api_post_tags_merge', '_api_get_concept_identity', '_api_post_concept_identity', '_api_get_compare_bundles', '_api_put_compare_canonical_lexeme', '_api_delete_compare_canonical_lexeme', '_api_get_canonical_lexemes_report']
__all__ = ['find_cognate_partition_violations', '_compute_cognates', '_api_get_enrichments', '_api_post_enrichments', '_api_post_lexeme_note', '_api_post_lexeme_notes_import', '_api_get_lexeme_search', '_api_get_tags', '_api_post_tags_merge', '_api_get_concept_identity', '_api_post_concept_identity', '_api_get_compare_bundles', '_api_put_compare_canonical_lexeme', '_api_delete_compare_canonical_lexeme', '_api_get_canonical_lexemes_report']

58 changes: 57 additions & 1 deletion python/test_compare_enrichments_manual_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent))

import server # noqa: E402
from server_routes import jobs # noqa: E402
from server_routes import compare, jobs # noqa: E402


@dataclass
Expand Down Expand Up @@ -113,6 +113,15 @@ def _install_compute_fakes(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPat
server._install_route_bindings()


def test_find_cognate_partition_violations_reports_cross_group_speaker_membership() -> None:
cognate_sets = {
"black": {"A": ["Kalh02", "Kalh02"], "B": ["Mand01", "Kalh02"]},
"white": {"A": ["Mand01"], "B": []},
}

assert compare.find_cognate_partition_violations(cognate_sets) == {("black", "Kalh02")}


@pytest.mark.parametrize("compute_type", ["cognates", "similarity"])
def test_compute_job_preserves_manual_overrides_and_unrelated_enrichment_keys(
tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch, compute_type: str
Expand Down Expand Up @@ -162,6 +171,21 @@ def test_compute_cognates_without_existing_enrichments_defaults_manual_overrides
assert written["cognate_sets"] == {"538": {"AUTO": ["Mand01", "Qasr01", "Saha01"]}}


def test_compute_cognates_rejects_malformed_existing_manual_cognate_partition(
tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
_install_compute_fakes(tmp_path, monkeypatch)
existing = {"manual_overrides": {"cognate_sets": {"black": {"A": ["Kalh02"], "B": ["Kalh02"]}}}}
_seed_enrichments(tmp_path, existing)

with pytest.raises(RuntimeError) as exc_info:
server._compute_cognates("job-1", {"threshold": 0.6})

assert "black" in str(exc_info.value)
assert "Kalh02" in str(exc_info.value)
assert _read_enrichments(tmp_path) == existing


@pytest.mark.parametrize("incoming_enrichments", [{"cognate_sets": {"538": {"AUTO": ["Mand01"]}}}, {"manual_overrides": {}}])
def test_post_enrichments_preserves_non_empty_disk_manual_overrides_when_client_omits_them(
tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch, incoming_enrichments: dict[str, Any]
Expand All @@ -187,6 +211,38 @@ def test_post_enrichments_preserves_non_empty_disk_manual_overrides_when_client_
assert written["manual_overrides"] == manual_overrides


def test_post_enrichments_rejects_duplicate_manual_cognate_membership_after_merge(
tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
existing = {"manual_overrides": {"cognate_sets": {"black": {"A": ["Kalh02"], "B": ["Mand01"]}}}}
_seed_enrichments(tmp_path, existing)
monkeypatch.setattr(server, "_project_root", lambda: tmp_path)
server._install_route_bindings()
handler = _Handler({"enrichments": {"manual_overrides": {"cognate_sets": {"black": {"B": ["Mand01", "Kalh02"]}}}}})

with pytest.raises(server.ApiError) as exc_info:
handler._api_post_enrichments()

assert exc_info.value.status == HTTPStatus.BAD_REQUEST
assert "black" in exc_info.value.message
assert "Kalh02" in exc_info.value.message
assert _read_enrichments(tmp_path) == existing


def test_post_enrichments_persists_single_membership_manual_cognate_partition(
tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
_seed_enrichments(tmp_path, {"manual_overrides": {"cognate_sets": {"black": {"A": ["Kalh02"], "B": []}}}})
monkeypatch.setattr(server, "_project_root", lambda: tmp_path)
server._install_route_bindings()
handler = _Handler({"enrichments": {"manual_overrides": {"cognate_sets": {"black": {"A": [], "B": ["Kalh02"]}}}}})

handler._api_post_enrichments()

assert handler.status == HTTPStatus.OK
assert _read_enrichments(tmp_path)["manual_overrides"]["cognate_sets"]["black"] == {"A": [], "B": ["Kalh02"]}


def test_post_enrichments_persists_emptied_cognate_group_from_full_snapshot(
tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
Expand Down
Loading