microsoft · HashwanthVen · Jun 22, 2026
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Fix finalize_entities dropping entities that share a title but have different types; deduplication is now keyed on (title, type) instead of title alone (issue #1718)."
+}
@@ -17,12 +17,19 @@ async def finalize_entities(
 ) -> list[dict[str, Any]]:
     """Read entity rows, enrich with degree, and write back.
 
-    Streams through the entities table, deduplicates by title,
+    Streams through the entities table, deduplicates by ``(title, type)``,
     assigns degree from the pre-computed degree map, and writes
     each finalized row back to the same table (safe when using
     truncate=True, which reads from the original and writes to
     a temp file).
 
+    Deduplication is keyed on the ``(title, type)`` pair, not ``title``
+    alone: distinct entities can share a title while having different
+    types (e.g. ``"IBM"`` as both ``ORGANIZATION`` and ``GEO``). Keying
+    on title alone silently dropped every entity after the first for a
+    title, losing its description and breaking its edges (issue #1718).
+    Degree is still looked up by ``title``, so such entities share it.
+
     Args
     ----
         entities_table: Table
@@ -36,14 +43,17 @@ async def finalize_entities(
             Sample of up to 5 entity rows for logging.
     """
     sample_rows: list[dict[str, Any]] = []
-    seen_titles: set[str] = set()
+    seen_keys: set[tuple[str, Any]] = set()
     human_readable_id = 0
 
     async for row in entities_table:
         title = row.get("title")
-        if not title or title in seen_titles:
+        if not title:
+            continue
+        key = (title, row.get("type"))
+        if key in seen_keys:
             continue
-        seen_titles.add(title)
+        seen_keys.add(key)
         row["degree"] = degree_map.get(title, 0)
         row["human_readable_id"] = human_readable_id
         row["id"] = str(uuid4())

@@ -205,6 +205,35 @@ async def test_deduplicates_by_title(self):
         titles = [r["title"] for r in table.written]
         assert titles == ["A", "B"]
 
+    async def test_keeps_same_title_different_type(self):
+        """Entities sharing a title but with different types must all be kept (issue #1718)."""
+        table = FakeTable([
+            _make_entity_row("IBM", entity_type="ORGANIZATION"),
+            _make_entity_row("IBM", entity_type="GEO"),
+        ])
+        degree_map = {"IBM": 4}
+        await finalize_entities(table, degree_map)
+
+        assert len(table.written) == 2
+        types = {r["type"] for r in table.written}
+        assert types == {"ORGANIZATION", "GEO"}
+        # degree_map is keyed by title only, so both rows get the same degree
+        assert all(r["degree"] == 4 for r in table.written)
+
+    async def test_deduplicates_same_title_and_type(self):
+        """True duplicates (same title AND type) are still collapsed to one."""
+        table = FakeTable([
+            _make_entity_row("IBM", entity_type="ORGANIZATION"),
+            _make_entity_row("IBM", entity_type="ORGANIZATION"),
+            _make_entity_row("IBM", entity_type="GEO"),
+        ])
+        degree_map = {"IBM": 1}
+        await finalize_entities(table, degree_map)
+
+        assert len(table.written) == 2
+        pairs = [(r["title"], r["type"]) for r in table.written]
+        assert pairs == [("IBM", "ORGANIZATION"), ("IBM", "GEO")]
+
     async def test_skips_empty_title(self):
         """Rows with empty or missing title should be skipped."""
         table = FakeTable([