diff --git a/.semversioner/next-release/patch-20260622000000000000.json b/.semversioner/next-release/patch-20260622000000000000.json new file mode 100644 index 0000000000..ec9bef7618 --- /dev/null +++ b/.semversioner/next-release/patch-20260622000000000000.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix incorrect deduplication of entities that share a title but have different types in finalize_entities (issue #1718)." +} diff --git a/packages/graphrag/graphrag/index/operations/finalize_entities.py b/packages/graphrag/graphrag/index/operations/finalize_entities.py index a0121a5b72..5b0ddd5d7b 100644 --- a/packages/graphrag/graphrag/index/operations/finalize_entities.py +++ b/packages/graphrag/graphrag/index/operations/finalize_entities.py @@ -17,12 +17,19 @@ async def finalize_entities( ) -> list[dict[str, Any]]: """Read entity rows, enrich with degree, and write back. - Streams through the entities table, deduplicates by title, + Streams through the entities table, deduplicates by ``(title, type)``, assigns degree from the pre-computed degree map, and writes each finalized row back to the same table (safe when using truncate=True, which reads from the original and writes to a temp file). + Deduplication is keyed on the ``(title, type)`` pair rather than on + ``title`` alone. Distinct entities can share a title while having + different types (for example ``"IBM"`` extracted both as an + ``ORGANIZATION`` and as a ``GEO``); keying on title alone silently + dropped every entity after the first for a given title, discarding + their descriptions and breaking their edges (see issue #1718). + Args ---- entities_table: Table @@ -36,14 +43,17 @@ async def finalize_entities( Sample of up to 5 entity rows for logging. """ sample_rows: list[dict[str, Any]] = [] - seen_titles: set[str] = set() + seen_keys: set[tuple[str, Any]] = set() human_readable_id = 0 async for row in entities_table: title = row.get("title") - if not title or title in seen_titles: + if not title: + continue + key = (title, row.get("type")) + if key in seen_keys: continue - seen_titles.add(title) + seen_keys.add(key) row["degree"] = degree_map.get(title, 0) row["human_readable_id"] = human_readable_id row["id"] = str(uuid4()) diff --git a/tests/unit/indexing/test_finalize_graph.py b/tests/unit/indexing/test_finalize_graph.py index 20daa49666..5e053267a8 100644 --- a/tests/unit/indexing/test_finalize_graph.py +++ b/tests/unit/indexing/test_finalize_graph.py @@ -205,6 +205,36 @@ async def test_deduplicates_by_title(self): titles = [r["title"] for r in table.written] assert titles == ["A", "B"] + + async def test_keeps_same_title_different_type(self): + """Entities sharing a title but with different types must all be kept (issue #1718).""" + table = FakeTable([ + _make_entity_row("IBM", entity_type="ORGANIZATION"), + _make_entity_row("IBM", entity_type="GEO"), + ]) + degree_map = {"IBM": 4} + await finalize_entities(table, degree_map) + + assert len(table.written) == 2 + types = {r["type"] for r in table.written} + assert types == {"ORGANIZATION", "GEO"} + # both retain the degree looked up by title + assert all(r["degree"] == 4 for r in table.written) + + async def test_deduplicates_same_title_and_type(self): + """True duplicates (same title AND type) are still collapsed to one.""" + table = FakeTable([ + _make_entity_row("IBM", entity_type="ORGANIZATION"), + _make_entity_row("IBM", entity_type="ORGANIZATION"), + _make_entity_row("IBM", entity_type="GEO"), + ]) + degree_map = {"IBM": 1} + await finalize_entities(table, degree_map) + + assert len(table.written) == 2 + pairs = [(r["title"], r["type"]) for r in table.written] + assert pairs == [("IBM", "ORGANIZATION"), ("IBM", "GEO")] + async def test_skips_empty_title(self): """Rows with empty or missing title should be skipped.""" table = FakeTable([