Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260622000000000000.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Fix incorrect deduplication of entities that share a title but have different types in finalize_entities (issue #1718)."
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,19 @@ async def finalize_entities(
) -> list[dict[str, Any]]:
"""Read entity rows, enrich with degree, and write back.

Streams through the entities table, deduplicates by title,
Streams through the entities table, deduplicates by ``(title, type)``,
assigns degree from the pre-computed degree map, and writes
each finalized row back to the same table (safe when using
truncate=True, which reads from the original and writes to
a temp file).

Deduplication is keyed on the ``(title, type)`` pair rather than on
``title`` alone. Distinct entities can share a title while having
different types (for example ``"IBM"`` extracted both as an
``ORGANIZATION`` and as a ``GEO``); keying on title alone silently
dropped every entity after the first for a given title, discarding
their descriptions and breaking their edges (see issue #1718).

Args
----
entities_table: Table
Expand All @@ -36,14 +43,17 @@ async def finalize_entities(
Sample of up to 5 entity rows for logging.
"""
sample_rows: list[dict[str, Any]] = []
seen_titles: set[str] = set()
seen_keys: set[tuple[str, Any]] = set()
human_readable_id = 0

async for row in entities_table:
title = row.get("title")
if not title or title in seen_titles:
if not title:
continue
key = (title, row.get("type"))
if key in seen_keys:
continue
seen_titles.add(title)
seen_keys.add(key)
row["degree"] = degree_map.get(title, 0)
row["human_readable_id"] = human_readable_id
row["id"] = str(uuid4())
Expand Down
30 changes: 30 additions & 0 deletions tests/unit/indexing/test_finalize_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,36 @@ async def test_deduplicates_by_title(self):
titles = [r["title"] for r in table.written]
assert titles == ["A", "B"]


async def test_keeps_same_title_different_type(self):
    """Same-titled entities with differing types both survive finalization (issue #1718)."""
    rows = [
        _make_entity_row("IBM", entity_type=kind)
        for kind in ("ORGANIZATION", "GEO")
    ]
    table = FakeTable(rows)
    await finalize_entities(table, {"IBM": 4})

    written = table.written
    assert len(written) == 2
    assert {row["type"] for row in written} == {"ORGANIZATION", "GEO"}
    # The degree map is keyed by title, so both rows receive the same degree.
    for row in written:
        assert row["degree"] == 4

async def test_deduplicates_same_title_and_type(self):
    """Rows that repeat both title AND type collapse into a single entity."""
    entity_types = ["ORGANIZATION", "ORGANIZATION", "GEO"]
    table = FakeTable(
        [_make_entity_row("IBM", entity_type=kind) for kind in entity_types]
    )
    await finalize_entities(table, {"IBM": 1})

    written = table.written
    assert len(written) == 2
    # The repeated ORGANIZATION row is dropped; the remaining rows keep
    # their first-seen order.
    assert [(row["title"], row["type"]) for row in written] == [
        ("IBM", "ORGANIZATION"),
        ("IBM", "GEO"),
    ]

async def test_skips_empty_title(self):
"""Rows with empty or missing title should be skipped."""
table = FakeTable([
Expand Down
Loading