Addressed review suggestions in issue hed-standard#1178

VisLab · VisLab · commit 6ae48c4ae267 · 2026-03-13T10:30:50.000-05:00
diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py
@@ -169,7 +169,12 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
 @hed_error(
     SchemaAttributeErrors.SCHEMA_HED_ID_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID
 )
-def schema_error_SCHEMA_HED_ID_INVALID(tag, new_id, old_id=None, valid_min=None, valid_max=None):
+def schema_error_SCHEMA_HED_ID_INVALID(tag, new_id, old_id=None, valid_min=None, valid_max=None, duplicate_tag=None):
+    if duplicate_tag:
+        return (
+            f"Tag '{tag}' has hedId '{new_id}' which is already used by '{duplicate_tag}'.  "
+            f"Each hedId must be unique across all schema sections."
+        )
     if old_id:
         return (
             f"Tag '{tag}' has an invalid hedId '{new_id}'.  "
diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py
@@ -248,9 +248,12 @@ def load_dataframes(filenames):
             elif os.path.exists(filename):
                 # Handle the extra files if they are present.
                 dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
-        except OSError:
-            # todo: consider if we want to report this error(we probably do)
-            pass  # We will use a blank one for this
+        except FileNotFoundError:
+            pass  # Missing section files are valid for partial/library schemas; caller gets an empty DataFrame
+        except OSError as e:
+            raise HedFileError(
+                HedExceptions.INVALID_FILE_FORMAT, f"Could not load schema file '{filename}': {e}", filename
+            ) from e
     return dataframes
 
 
diff --git a/hed/schema/schema_io/hed_id_util.py b/hed/schema/schema_io/hed_id_util.py
@@ -90,7 +90,7 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", assign_mis
         schema_name = schema.library
     # 1. Verify existing HED ids don't conflict between schema/dataframes
     for df_key, df in dataframes.items():
-        if df_key in constants.DF_SUFFIXES:
+        if df_key in constants.DF_EXTRAS:
             continue
         section_key = constants.section_mapping_hed_id.get(df_key)
         if not section_key:
@@ -173,7 +173,7 @@ def _verify_hedid_matches(section, df, unused_tag_ids):
             id_value = df_id.removeprefix("HED_")
             try:
                 id_int = int(id_value)
-                if id_int not in unused_tag_ids:
+                if unused_tag_ids and id_int not in unused_tag_ids:
                     hedid_errors += schema_util.format_error(
                         row_number,
                         row,
@@ -213,8 +213,7 @@ def assign_hed_ids_section(df, unused_tag_ids):
         # we already verified existing ones
         if hed_id:
             continue
-        hed_id = f"HED_{sorted_unused_ids.pop():07d}"
-        row[constants.hed_id] = hed_id
+        df.at[_row_number, constants.hed_id] = f"HED_{sorted_unused_ids.pop():07d}"
 
 
 def merge_dfs(dest_df, source_df):
diff --git a/hed/schema/schema_validation/compliance.py b/hed/schema/schema_validation/compliance.py
@@ -79,6 +79,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl
     issues += validator.check_invalid_characters()
     issues += validator.check_attributes()
     issues += validator.check_duplicate_names()
+    issues += validator.check_duplicate_hed_ids()
     issues += validator.check_extras_columns()
     issues += validator.check_annotation_attribute_values()
 
@@ -309,6 +310,35 @@ def check_attributes(self):
         self.summary.record_issues(len(issues))
         return issues
 
+    def check_duplicate_hed_ids(self):
+        """Check for duplicate hedId values within or across all schema sections."""
+        self.summary.start_check(
+            "duplicate_hed_ids",
+            "Check for duplicate hedId values within or across schema sections.",
+        )
+        issues = []
+        seen_ids: dict[str, str] = {}  # maps hedId string → first tag name that used it
+        for section_key in HedSectionKey:
+            for entry in self.hed_schema[section_key].values():
+                hed_id = entry.attributes.get(HedKey.HedID)
+                if not hed_id:
+                    continue
+                if hed_id in seen_ids:
+                    self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key))
+                    self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, entry.name)
+                    issues += self.error_handler.format_error_with_context(
+                        SchemaAttributeErrors.SCHEMA_HED_ID_INVALID,
+                        entry.name,
+                        new_id=hed_id,
+                        duplicate_tag=seen_ids[hed_id],
+                    )
+                    self.error_handler.pop_error_context()
+                    self.error_handler.pop_error_context()
+                else:
+                    seen_ids[hed_id] = entry.name
+        self.summary.record_issues(len(issues))
+        return issues
+
     def check_duplicate_names(self):
         """Check for duplicate entry names across library merges."""
         self.summary.start_check(
diff --git a/hed/schema/schema_validation/hed_id_validator.py b/hed/schema/schema_validation/hed_id_validator.py
@@ -96,7 +96,7 @@ def verify_tag_id(self, hed_schema, tag_entry, attribute_name):
             except ValueError:
                 return ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_HED_ID_INVALID, tag_entry.name, new_id)
         # Nothing to verify
-        if new_id is None and old_id is None:
+        if not new_id and old_id is None:
             return []
 
         issues = []
diff --git a/tests/schema/test_hed_id_util.py b/tests/schema/test_hed_id_util.py
@@ -97,6 +97,24 @@ def test_not_int(self):
         errors = _verify_hedid_matches(self.schema_82.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY))
         self.assertEqual(len(errors), 1)
 
+    def test_verify_unknown_library_skips_range_check(self):
+        """An unregistered library returns empty range — IDs should not be reported as out-of-range."""
+        empty_range = set()
+        df = pd.DataFrame([{"rdfs:label": "Event", "hedId": "HED_0012001"}])
+        # testlib has no library_data entry, so _get_hedid_range returns an empty range (modelled here by set())
+        errors = _verify_hedid_matches(self.schema_82.tags, df, empty_range)
+        self.assertEqual(len(errors), 0, "Unknown-library empty range should not trigger range errors")
+
+    def test_empty_unused_ids_no_crash(self):
+        """_verify_hedid_matches must not crash when unused_tag_ids is empty (covers min/max guard)."""
+        empty_range = set()
+        df = pd.DataFrame(
+            [{"rdfs:label": "Event", "hedId": "HED_0099999"}, {"rdfs:label": "Age-#", "hedId": "HED_0000001"}]
+        )
+        # Should complete without raising ValueError from min()/max()
+        errors = _verify_hedid_matches(self.schema_82.tags, df, empty_range)
+        self.assertEqual(len(errors), 0)
+
     def test_get_all_ids_exists(self):
         # Test when hedId column exists and has proper prefixed IDs
         df = pd.DataFrame({"hedId": ["HED_0000001", "HED_0000002", "HED_0000003"]})
@@ -156,29 +174,53 @@ def test_assign_hed_ids_section(self):
 
         self.assertTrue(df.equals(expected_result))
 
+    def test_assign_actually_mutates_df(self):
+        """assign_hed_ids_section must write IDs back into the original DataFrame."""
+        df = pd.DataFrame({"hedId": ["", "", ""], "label": ["A", "B", "C"]})
+        assign_hed_ids_section(df, {1, 2, 3})
+        # All rows should now have a non-empty hedId
+        self.assertTrue(all(df["hedId"].str.startswith("HED_")), "IDs were not written into the DataFrame")
+
+    def test_assign_preserves_existing_ids(self):
+        """assign_hed_ids_section must not overwrite rows that already have an ID."""
+        df = pd.DataFrame({"hedId": ["HED_0000005", "", "HED_0000010"], "label": ["A", "B", "C"]})
+        assign_hed_ids_section(df, {1, 2, 3, 4, 5, 10})
+        self.assertEqual(df.loc[0, "hedId"], "HED_0000005")
+        self.assertEqual(df.loc[2, "hedId"], "HED_0000010")
+        self.assertTrue(df.loc[1, "hedId"].startswith("HED_"))
+
 
 class TestUpdateDataframes(unittest.TestCase):
     def test_update_dataframes_from_schema(self):
-        # valid direction first
-        schema_dataframes = hed_schema_global.get_as_dataframes()
-        schema_83 = load_schema_version("8.3.0")
+        # Use matching schema + dataframes so the ID verification passes
+        schema = load_schema_version("8.4.0")
+        schema_dataframes = schema.get_as_dataframes()
         # Add a test column and ensure it stays around
         fixed_value = "test_column_value"
         for _key, df in schema_dataframes.items():
             df["test_column"] = fixed_value
 
-        updated_dataframes = update_dataframes_from_schema(schema_dataframes, schema_83)
+        updated_dataframes = update_dataframes_from_schema(schema_dataframes, schema)
 
         for key, df in updated_dataframes.items():
             if key not in constants.DF_EXTRAS:
                 self.assertTrue((df["test_column"] == fixed_value).all())
-        # this is expected to bomb horribly, since schema lacks many of the spreadsheet entries.
-        schema = load_schema_version("8.3.0")
-        schema_dataframes_new = load_schema_version("8.3.0").get_as_dataframes()
-        try:
-            update_dataframes_from_schema(schema_dataframes_new, schema)
-        except HedFileError as e:
-            self.assertEqual(len(e.issues), 115)
+
+    def test_conflict_detected(self):
+        """Bug #1 regression: verify HedFileError IS raised when a hedId in the dataframe mismatches the schema."""
+        schema = load_schema_version("8.4.0")
+        schema_dataframes = schema.get_as_dataframes()
+
+        # Corrupt a hedId in the Tag dataframe so it mismatches the schema
+        tag_df = schema_dataframes[constants.TAG_KEY]
+        # Change the first non-empty hedId to an out-of-range value
+        non_empty_mask = tag_df["hedId"].str.startswith("HED_", na=False)
+        first_idx = tag_df.index[non_empty_mask][0]
+        schema_dataframes[constants.TAG_KEY].loc[first_idx, "hedId"] = "HED_0000000"
+
+        with self.assertRaises(HedFileError) as ctx:
+            update_dataframes_from_schema(schema_dataframes, schema)
+        self.assertGreater(len(ctx.exception.issues), 0)
 
 
 if __name__ == "__main__":
diff --git a/tests/schema/test_schema_compliance.py b/tests/schema/test_schema_compliance.py
@@ -53,6 +53,7 @@ def test_has_all_checks(self):
             "invalid_characters",
             "attributes",
             "duplicate_names",
+            "duplicate_hed_ids",
             "extras_columns",
             "annotation_attributes",
         ]
diff --git a/tests/schema/test_schema_validator_hed_id.py b/tests/schema/test_schema_validator_hed_id.py
@@ -48,9 +48,19 @@ def test_verify_tag_id(self):
         id_validator = HedIDValidator(self.hed_schema84)
 
         issues = id_validator.verify_tag_id(self.hed_schema84, event_entry, HedKey.HedID)
-        self.assertTrue("It has changed", issues[0]["message"])
-        self.assertTrue("between 10000", issues[0]["message"])
+        self.assertGreater(len(issues), 0)
+        messages = [i["message"] for i in issues]
+        self.assertTrue(any("It has changed" in m for m in messages))
+        self.assertTrue(any("between 10000" in m for m in messages))
 
-        event_entry = self.hed_schema84.tags["Event"]
+    def test_verify_tag_id_invalid_format(self):
+        """A non-integer hedId should produce an INVALID format error."""
+        schema84 = copy.deepcopy(self.hed_schema)
+        schema84.header_attributes[hed_schema_constants.VERSION_ATTRIBUTE] = "8.4.0"
+        event_entry = schema84.tags["Event"]
         event_entry.attributes[HedKey.HedID] = "HED_XXXXXXX"
-        self.assertTrue("It must be an integer in the format", issues[0]["message"])
+
+        id_validator = HedIDValidator(schema84)
+        issues = id_validator.verify_tag_id(schema84, event_entry, HedKey.HedID)
+        self.assertGreater(len(issues), 0)
+        self.assertIn("It must be an integer in the format", issues[0]["message"])

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,7 @@ def test_has_all_checks(self):`
`53`	`53`	`"invalid_characters",`
`54`	`54`	`"attributes",`
`55`	`55`	`"duplicate_names",`
	`56`	`+ "duplicate_hed_ids",`
`56`	`57`	`"extras_columns",`
`57`	`58`	`"annotation_attributes",`
`58`	`59`	`]`