Skip to content

Commit 6ae48c4

Browse files
committed
Addressed review suggestions in issue hed-standard#1178
1 parent fd45e7c commit 6ae48c4

8 files changed

Lines changed: 114 additions & 24 deletions

File tree

hed/errors/schema_error_messages.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,12 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
169169
@hed_error(
170170
SchemaAttributeErrors.SCHEMA_HED_ID_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID
171171
)
172-
def schema_error_SCHEMA_HED_ID_INVALID(tag, new_id, old_id=None, valid_min=None, valid_max=None):
172+
def schema_error_SCHEMA_HED_ID_INVALID(tag, new_id, old_id=None, valid_min=None, valid_max=None, duplicate_tag=None):
173+
if duplicate_tag:
174+
return (
175+
f"Tag '{tag}' has hedId '{new_id}' which is already used by '{duplicate_tag}'. "
176+
f"Each hedId must be unique across all schema sections."
177+
)
173178
if old_id:
174179
return (
175180
f"Tag '{tag}' has an invalid hedId '{new_id}'. "

hed/schema/schema_io/df_util.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,9 +248,12 @@ def load_dataframes(filenames):
248248
elif os.path.exists(filename):
249249
# Handle the extra files if they are present.
250250
dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
251-
except OSError:
252-
# todo: consider if we want to report this error(we probably do)
253-
pass # We will use a blank one for this
251+
except FileNotFoundError:
252+
pass # Missing section files are valid for partial/library schemas; caller gets an empty DataFrame
253+
except OSError as e:
254+
raise HedFileError(
255+
HedExceptions.INVALID_FILE_FORMAT, f"Could not load schema file '{filename}': {e}", filename
256+
) from e
254257
return dataframes
255258

256259

hed/schema/schema_io/hed_id_util.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", assign_mis
9090
schema_name = schema.library
9191
# 1. Verify existing HED ids don't conflict between schema/dataframes
9292
for df_key, df in dataframes.items():
93-
if df_key in constants.DF_SUFFIXES:
93+
if df_key in constants.DF_EXTRAS:
9494
continue
9595
section_key = constants.section_mapping_hed_id.get(df_key)
9696
if not section_key:
@@ -173,7 +173,7 @@ def _verify_hedid_matches(section, df, unused_tag_ids):
173173
id_value = df_id.removeprefix("HED_")
174174
try:
175175
id_int = int(id_value)
176-
if id_int not in unused_tag_ids:
176+
if unused_tag_ids and id_int not in unused_tag_ids:
177177
hedid_errors += schema_util.format_error(
178178
row_number,
179179
row,
@@ -213,8 +213,7 @@ def assign_hed_ids_section(df, unused_tag_ids):
213213
# we already verified existing ones
214214
if hed_id:
215215
continue
216-
hed_id = f"HED_{sorted_unused_ids.pop():07d}"
217-
row[constants.hed_id] = hed_id
216+
df.at[_row_number, constants.hed_id] = f"HED_{sorted_unused_ids.pop():07d}"
218217

219218

220219
def merge_dfs(dest_df, source_df):

hed/schema/schema_validation/compliance.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl
7979
issues += validator.check_invalid_characters()
8080
issues += validator.check_attributes()
8181
issues += validator.check_duplicate_names()
82+
issues += validator.check_duplicate_hed_ids()
8283
issues += validator.check_extras_columns()
8384
issues += validator.check_annotation_attribute_values()
8485

@@ -309,6 +310,35 @@ def check_attributes(self):
309310
self.summary.record_issues(len(issues))
310311
return issues
311312

313+
def check_duplicate_hed_ids(self):
314+
"""Check for duplicate hedId values across all schema sections."""
315+
self.summary.start_check(
316+
"duplicate_hed_ids",
317+
"Check for duplicate hedId values within or across schema sections.",
318+
)
319+
issues = []
320+
seen_ids: dict[str, str] = {} # maps hedId string → first tag name that used it
321+
for section_key in HedSectionKey:
322+
for entry in self.hed_schema[section_key].values():
323+
hed_id = entry.attributes.get(HedKey.HedID)
324+
if not hed_id:
325+
continue
326+
if hed_id in seen_ids:
327+
self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key))
328+
self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, entry.name)
329+
issues += self.error_handler.format_error_with_context(
330+
SchemaAttributeErrors.SCHEMA_HED_ID_INVALID,
331+
entry.name,
332+
new_id=hed_id,
333+
duplicate_tag=seen_ids[hed_id],
334+
)
335+
self.error_handler.pop_error_context()
336+
self.error_handler.pop_error_context()
337+
else:
338+
seen_ids[hed_id] = entry.name
339+
self.summary.record_issues(len(issues))
340+
return issues
341+
312342
def check_duplicate_names(self):
313343
"""Check for duplicate entry names across library merges."""
314344
self.summary.start_check(

hed/schema/schema_validation/hed_id_validator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def verify_tag_id(self, hed_schema, tag_entry, attribute_name):
9696
except ValueError:
9797
return ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_HED_ID_INVALID, tag_entry.name, new_id)
9898
# Nothing to verify
99-
if new_id is None and old_id is None:
99+
if not new_id and old_id is None:
100100
return []
101101

102102
issues = []

tests/schema/test_hed_id_util.py

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,24 @@ def test_not_int(self):
9797
errors = _verify_hedid_matches(self.schema_82.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY))
9898
self.assertEqual(len(errors), 1)
9999

100+
def test_verify_unknown_library_skips_range_check(self):
101+
"""An unregistered library returns empty range — IDs should not be reported as out-of-range."""
102+
empty_range = set()
103+
df = pd.DataFrame([{"rdfs:label": "Event", "hedId": "HED_0012001"}])
104+
# testlib has no library_data entry, so _get_hedid_range returns {}
105+
errors = _verify_hedid_matches(self.schema_82.tags, df, empty_range)
106+
self.assertEqual(len(errors), 0, "Unknown-library empty range should not trigger range errors")
107+
108+
def test_empty_unused_ids_no_crash(self):
109+
"""_verify_hedid_matches must not crash when unused_tag_ids is empty (covers min/max guard)."""
110+
empty_range = set()
111+
df = pd.DataFrame(
112+
[{"rdfs:label": "Event", "hedId": "HED_0099999"}, {"rdfs:label": "Age-#", "hedId": "HED_0000001"}]
113+
)
114+
# Should complete without raising ValueError from min()/max()
115+
errors = _verify_hedid_matches(self.schema_82.tags, df, empty_range)
116+
self.assertEqual(len(errors), 0)
117+
100118
def test_get_all_ids_exists(self):
101119
# Test when hedId column exists and has proper prefixed IDs
102120
df = pd.DataFrame({"hedId": ["HED_0000001", "HED_0000002", "HED_0000003"]})
@@ -156,29 +174,53 @@ def test_assign_hed_ids_section(self):
156174

157175
self.assertTrue(df.equals(expected_result))
158176

177+
def test_assign_actually_mutates_df(self):
178+
"""assign_hed_ids_section must write IDs back into the original DataFrame."""
179+
df = pd.DataFrame({"hedId": ["", "", ""], "label": ["A", "B", "C"]})
180+
assign_hed_ids_section(df, {1, 2, 3})
181+
# All rows should now have a non-empty hedId
182+
self.assertTrue(all(df["hedId"].str.startswith("HED_")), "IDs were not written into the DataFrame")
183+
184+
def test_assign_preserves_existing_ids(self):
185+
"""assign_hed_ids_section must not overwrite rows that already have an ID."""
186+
df = pd.DataFrame({"hedId": ["HED_0000005", "", "HED_0000010"], "label": ["A", "B", "C"]})
187+
assign_hed_ids_section(df, {1, 2, 3, 4, 5, 10})
188+
self.assertEqual(df.loc[0, "hedId"], "HED_0000005")
189+
self.assertEqual(df.loc[2, "hedId"], "HED_0000010")
190+
self.assertTrue(df.loc[1, "hedId"].startswith("HED_"))
191+
159192

160193
class TestUpdateDataframes(unittest.TestCase):
161194
def test_update_dataframes_from_schema(self):
162-
# valid direction first
163-
schema_dataframes = hed_schema_global.get_as_dataframes()
164-
schema_83 = load_schema_version("8.3.0")
195+
# Use matching schema + dataframes so the ID verification passes
196+
schema = load_schema_version("8.4.0")
197+
schema_dataframes = schema.get_as_dataframes()
165198
# Add a test column and ensure it stays around
166199
fixed_value = "test_column_value"
167200
for _key, df in schema_dataframes.items():
168201
df["test_column"] = fixed_value
169202

170-
updated_dataframes = update_dataframes_from_schema(schema_dataframes, schema_83)
203+
updated_dataframes = update_dataframes_from_schema(schema_dataframes, schema)
171204

172205
for key, df in updated_dataframes.items():
173206
if key not in constants.DF_EXTRAS:
174207
self.assertTrue((df["test_column"] == fixed_value).all())
175-
# this is expected to bomb horribly, since schema lacks many of the spreadsheet entries.
176-
schema = load_schema_version("8.3.0")
177-
schema_dataframes_new = load_schema_version("8.3.0").get_as_dataframes()
178-
try:
179-
update_dataframes_from_schema(schema_dataframes_new, schema)
180-
except HedFileError as e:
181-
self.assertEqual(len(e.issues), 115)
208+
209+
def test_conflict_detected(self):
210+
"""Bug #1 regression: verify HedFileError IS raised when a hedId in the dataframe mismatches the schema."""
211+
schema = load_schema_version("8.4.0")
212+
schema_dataframes = schema.get_as_dataframes()
213+
214+
# Corrupt a hedId in the Tag dataframe so it mismatches the schema
215+
tag_df = schema_dataframes[constants.TAG_KEY]
216+
# Change the first non-empty hedId to an out-of-range value
217+
non_empty_mask = tag_df["hedId"].str.startswith("HED_", na=False)
218+
first_idx = tag_df.index[non_empty_mask][0]
219+
schema_dataframes[constants.TAG_KEY].loc[first_idx, "hedId"] = "HED_0000000"
220+
221+
with self.assertRaises(HedFileError) as ctx:
222+
update_dataframes_from_schema(schema_dataframes, schema)
223+
self.assertGreater(len(ctx.exception.issues), 0)
182224

183225

184226
if __name__ == "__main__":

tests/schema/test_schema_compliance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def test_has_all_checks(self):
5353
"invalid_characters",
5454
"attributes",
5555
"duplicate_names",
56+
"duplicate_hed_ids",
5657
"extras_columns",
5758
"annotation_attributes",
5859
]

tests/schema/test_schema_validator_hed_id.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,19 @@ def test_verify_tag_id(self):
4848
id_validator = HedIDValidator(self.hed_schema84)
4949

5050
issues = id_validator.verify_tag_id(self.hed_schema84, event_entry, HedKey.HedID)
51-
self.assertTrue("It has changed", issues[0]["message"])
52-
self.assertTrue("between 10000", issues[0]["message"])
51+
self.assertGreater(len(issues), 0)
52+
messages = [i["message"] for i in issues]
53+
self.assertTrue(any("It has changed" in m for m in messages))
54+
self.assertTrue(any("between 10000" in m for m in messages))
5355

54-
event_entry = self.hed_schema84.tags["Event"]
56+
def test_verify_tag_id_invalid_format(self):
57+
"""A non-integer hedId should produce an INVALID format error."""
58+
schema84 = copy.deepcopy(self.hed_schema)
59+
schema84.header_attributes[hed_schema_constants.VERSION_ATTRIBUTE] = "8.4.0"
60+
event_entry = schema84.tags["Event"]
5561
event_entry.attributes[HedKey.HedID] = "HED_XXXXXXX"
56-
self.assertTrue("It must be an integer in the format", issues[0]["message"])
62+
63+
id_validator = HedIDValidator(schema84)
64+
issues = id_validator.verify_tag_id(schema84, event_entry, HedKey.HedID)
65+
self.assertGreater(len(issues), 0)
66+
self.assertIn("It must be an integer in the format", issues[0]["message"])

0 commit comments

Comments
 (0)