Skip to content

Commit 29bb542

Browse files
authored
Merge pull request #1236 from VisLab/fix_extras
Add more compliance tests for the extras
2 parents 2448818 + 5660951 commit 29bb542

7 files changed

Lines changed: 609 additions & 13 deletions

File tree

hed/errors/error_types.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,14 @@ class SchemaAttributeErrors:
154154

155155
SCHEMA_GENERIC_ATTRIBUTE_VALUE_INVALID = "SCHEMA_GENERIC_ATTRIBUTE_VALUE_INVALID"
156156

157+
# Extras section errors
158+
SCHEMA_MISSING_EXTRA_VALUE = "SCHEMA_MISSING_EXTRA_VALUE"
159+
160+
# Annotation attribute errors
161+
SCHEMA_ANNOTATION_PREFIX_MISSING = "SCHEMA_ANNOTATION_PREFIX_MISSING"
162+
SCHEMA_ANNOTATION_EXTERNAL_MISSING = "SCHEMA_ANNOTATION_EXTERNAL_MISSING"
163+
SCHEMA_ANNOTATION_SOURCE_MISSING = "SCHEMA_ANNOTATION_SOURCE_MISSING"
164+
157165

158166
class DefinitionErrors:
159167
# These are all DEFINITION_INVALID errors

hed/errors/schema_error_messages.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,41 @@ def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character):
186186
@hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID)
187187
def schema_error_SCHEMA_IN_LIBRARY_INVALID(tag, bad_library):
188188
return f"Tag '{tag}' has an invalid inLibrary: '{bad_library}'. "
189+
190+
191+
@hed_error(SchemaAttributeErrors.SCHEMA_MISSING_EXTRA_VALUE, default_severity=ErrorSeverity.WARNING)
192+
def schema_error_SCHEMA_MISSING_EXTRA_VALUE(section_name, column_name, row_index):
193+
return f"Extras section '{section_name}' has an empty value in column '{column_name}' " f"at row {row_index}."
194+
195+
196+
@hed_error(
197+
SchemaAttributeErrors.SCHEMA_ANNOTATION_PREFIX_MISSING,
198+
default_severity=ErrorSeverity.WARNING,
199+
)
200+
def schema_error_annotation_prefix_missing(tag, annotation_value, prefix):
201+
return (
202+
f"Tag '{tag}' has annotation '{annotation_value}' with prefix '{prefix}' "
203+
f"that is not defined in the Prefixes section."
204+
)
205+
206+
207+
@hed_error(
208+
SchemaAttributeErrors.SCHEMA_ANNOTATION_EXTERNAL_MISSING,
209+
default_severity=ErrorSeverity.WARNING,
210+
)
211+
def schema_error_annotation_external_missing(tag, annotation_value, prefix, annotation_id):
212+
return (
213+
f"Tag '{tag}' has annotation '{annotation_value}' with prefix:id '{prefix}{annotation_id}' "
214+
f"that is not defined in the ExternalAnnotations section."
215+
)
216+
217+
218+
@hed_error(
219+
SchemaAttributeErrors.SCHEMA_ANNOTATION_SOURCE_MISSING,
220+
default_severity=ErrorSeverity.WARNING,
221+
)
222+
def schema_error_annotation_source_missing(tag, annotation_value, source_text):
223+
return (
224+
f"Tag '{tag}' has annotation '{annotation_value}' referencing dc:source "
225+
f"but '{source_text}' does not start with any name defined in the Sources section."
226+
)

hed/schema/schema_validation/compliance.py

Lines changed: 216 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from functools import partial
1717

18+
import pandas as pd
1819
from semantic_version import Version
1920

2021
from hed.errors.error_reporter import ErrorHandler, sort_issues
@@ -28,6 +29,7 @@
2829
from hed.schema import hed_cache
2930
from hed.schema.schema_validation import attribute_validators
3031
from hed.schema.hed_schema import HedSchema, HedKey, HedSectionKey
32+
from hed.schema.schema_io import df_constants
3133
from hed.schema.schema_validation.hed_id_validator import HedIDValidator
3234
from hed.schema.schema_validation.compliance_summary import ComplianceSummary
3335
from hed.schema.schema_validation.validation_util import (
@@ -77,6 +79,8 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl
7779
issues += validator.check_invalid_characters()
7880
issues += validator.check_attributes()
7981
issues += validator.check_duplicate_names()
82+
issues += validator.check_extras_columns()
83+
issues += validator.check_annotation_attribute_values()
8084

8185
error_handler.pop_error_context()
8286
issues = sort_issues(issues)
@@ -328,6 +332,218 @@ def check_duplicate_names(self):
328332
self.summary.record_issues(len(issues))
329333
return issues
330334

335+
def check_extras_columns(self):
336+
"""Validate that all extras DataFrames have non-empty values in required columns.
337+
338+
For each extras section (Sources, Prefixes, ExternalAnnotations), checks
339+
that every cell in the required columns defined in
340+
``df_constants.extras_column_dict`` has a non-empty value.
341+
342+
Note:
343+
Missing columns are automatically added with empty strings during
344+
schema loading (see ``base2schema.fix_extra``), so only value
345+
presence needs to be checked here.
346+
"""
347+
self.summary.start_check(
348+
"extras_columns",
349+
"Validate extras sections have non-empty values in required columns.",
350+
)
351+
self.summary.add_sub_check("non-empty cell values")
352+
353+
issues = []
354+
extras = getattr(self.hed_schema, "extras", {}) or {}
355+
for section_name, required_cols in df_constants.extras_column_dict.items():
356+
df = extras.get(section_name)
357+
if df is None or (isinstance(df, pd.DataFrame) and df.empty):
358+
# Empty extras are fine — nothing to validate
359+
continue
360+
361+
rows_checked = len(df)
362+
self.summary.record_section(section_name, rows_checked)
363+
364+
for col in required_cols:
365+
if col not in df.columns:
366+
continue
367+
mask = df[col].isna() | df[col].astype(str).str.strip().eq("")
368+
for row_idx in mask[mask].index:
369+
issues += ErrorHandler.format_error(
370+
SchemaAttributeErrors.SCHEMA_MISSING_EXTRA_VALUE,
371+
section_name=section_name,
372+
column_name=col,
373+
row_index=row_idx,
374+
)
375+
376+
self.error_handler.add_context_and_filter(issues)
377+
self.summary.record_issues(len(issues))
378+
return issues
379+
380+
def check_annotation_attribute_values(self):
381+
"""Validate that annotation attribute values reference valid prefixes, external annotations, and sources.
382+
383+
For each entry that has an ``annotation`` attribute, checks that:
384+
385+
1. The value starts with ``prefix:id`` where ``prefix:`` is defined in
386+
the Prefixes extras section and ``prefix:`` + ``id`` is a row in the
387+
ExternalAnnotations extras section.
388+
2. If the annotation references ``dc:source``, the remaining text after
389+
``dc:source `` must start with a name from the Sources extras section.
390+
"""
391+
self.summary.start_check(
392+
"annotation_attributes",
393+
"Validate annotation attribute values reference defined prefixes, external annotations, and sources.",
394+
)
395+
self.summary.add_sub_check("prefix defined in Prefixes")
396+
self.summary.add_sub_check("prefix:id in ExternalAnnotations")
397+
self.summary.add_sub_check("dc:source references valid Sources entry")
398+
399+
issues = []
400+
401+
# Build lookup sets from extras
402+
extras = getattr(self.hed_schema, "extras", {}) or {}
403+
defined_prefixes = self._get_extras_column_values(extras, df_constants.PREFIXES_KEY, df_constants.prefix)
404+
external_pairs = self._get_external_annotation_pairs(extras)
405+
defined_sources = self._get_extras_column_values(extras, df_constants.SOURCES_KEY, df_constants.source)
406+
407+
# Scan all entries in all sections for the "annotation" attribute
408+
entries_checked = 0
409+
for section_key in HedSectionKey:
410+
self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key))
411+
for entry in self.hed_schema[section_key].values():
412+
annotation_value = entry.attributes.get("annotation")
413+
if not annotation_value:
414+
continue
415+
entries_checked += 1
416+
self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, entry.name)
417+
# Annotation values can be comma-separated (multiple annotations)
418+
for single_annotation in annotation_value.split(","):
419+
single_annotation = single_annotation.strip()
420+
if single_annotation:
421+
issues += self._validate_annotation_value(
422+
entry, single_annotation, defined_prefixes, external_pairs, defined_sources
423+
)
424+
self.error_handler.pop_error_context()
425+
self.error_handler.pop_error_context()
426+
427+
self.summary.record_section("annotation_entries", entries_checked)
428+
self.summary.record_issues(len(issues))
429+
return issues
430+
431+
# -----------------------------------------------------------------------
432+
# Private helpers — extras / annotation validation
433+
# -----------------------------------------------------------------------
434+
435+
@staticmethod
436+
def _get_extras_column_values(extras, section_key, column_name):
437+
"""Return the set of values in a column of an extras DataFrame.
438+
439+
Parameters:
440+
extras (dict): The schema extras dictionary.
441+
section_key (str): Key into the extras dict (e.g. "Prefixes").
442+
column_name (str): The column whose values to collect.
443+
444+
Returns:
445+
set: The set of non-empty string values in that column.
446+
"""
447+
df = extras.get(section_key)
448+
if df is None or not isinstance(df, pd.DataFrame) or df.empty:
449+
return set()
450+
if column_name not in df.columns:
451+
return set()
452+
return {str(v).strip() for v in df[column_name] if pd.notna(v) and str(v).strip()}
453+
454+
@staticmethod
455+
def _get_external_annotation_pairs(extras):
456+
"""Return a set of (prefix, id) tuples from the ExternalAnnotations DataFrame.
457+
458+
Parameters:
459+
extras (dict): The schema extras dictionary.
460+
461+
Returns:
462+
set: Set of (prefix_str, id_str) tuples.
463+
"""
464+
df = extras.get(df_constants.EXTERNAL_ANNOTATION_KEY)
465+
if df is None or not isinstance(df, pd.DataFrame) or df.empty:
466+
return set()
467+
pairs = set()
468+
if df_constants.prefix in df.columns and df_constants.id in df.columns:
469+
for _, row in df.iterrows():
470+
p = str(row[df_constants.prefix]).strip() if pd.notna(row[df_constants.prefix]) else ""
471+
i = str(row[df_constants.id]).strip() if pd.notna(row[df_constants.id]) else ""
472+
if p and i:
473+
pairs.add((p, i))
474+
return pairs
475+
476+
def _validate_annotation_value(self, entry, annotation_value, defined_prefixes, external_pairs, defined_sources):
477+
"""Validate a single annotation attribute value.
478+
479+
Parameters:
480+
entry: The schema entry with the annotation attribute.
481+
annotation_value (str): The annotation value string.
482+
defined_prefixes (set): Valid prefixes from the Prefixes section.
483+
external_pairs (set): Valid (prefix, id) pairs from ExternalAnnotations.
484+
defined_sources (set): Valid source names from the Sources section.
485+
486+
Returns:
487+
list: A list of issue dicts.
488+
"""
489+
issues = []
490+
tag_name = entry.name
491+
492+
# Parse prefix:id from the annotation value
493+
# Expected format: "prefix:id rest_of_text" e.g. "dc:source Beniczky ea 2017 Table 2."
494+
colon_pos = annotation_value.find(":")
495+
if colon_pos < 1:
496+
# No colon found — cannot parse prefix:id
497+
issues += self.error_handler.format_error_with_context(
498+
SchemaAttributeErrors.SCHEMA_ANNOTATION_PREFIX_MISSING,
499+
tag_name,
500+
annotation_value=annotation_value,
501+
prefix="(none)",
502+
)
503+
return issues
504+
505+
ann_prefix = annotation_value[: colon_pos + 1] # e.g. "dc:"
506+
remainder = annotation_value[colon_pos + 1 :] # e.g. "source Beniczky ea 2017 Table 2."
507+
508+
# Split remainder into id and rest — id is the first whitespace-delimited token
509+
parts = remainder.split(None, 1) # split on whitespace, max 1 split
510+
ann_id = parts[0] if parts else remainder # e.g. "source"
511+
rest_text = parts[1] if len(parts) > 1 else "" # e.g. "Beniczky ea 2017 Table 2."
512+
513+
# Check 1: prefix must be in Prefixes
514+
if ann_prefix not in defined_prefixes:
515+
issues += self.error_handler.format_error_with_context(
516+
SchemaAttributeErrors.SCHEMA_ANNOTATION_PREFIX_MISSING,
517+
tag_name,
518+
annotation_value=annotation_value,
519+
prefix=ann_prefix,
520+
)
521+
522+
# Check 2: prefix:id must be in ExternalAnnotations
523+
if (ann_prefix, ann_id) not in external_pairs:
524+
issues += self.error_handler.format_error_with_context(
525+
SchemaAttributeErrors.SCHEMA_ANNOTATION_EXTERNAL_MISSING,
526+
tag_name,
527+
annotation_value=annotation_value,
528+
prefix=ann_prefix,
529+
annotation_id=ann_id,
530+
)
531+
532+
# Check 3: If dc:source, the rest_text must start with a defined source name
533+
if ann_prefix == "dc:" and ann_id == "source":
534+
rest_text_stripped = rest_text.strip() if rest_text else ""
535+
if not rest_text_stripped or not any(rest_text_stripped.startswith(src) for src in defined_sources):
536+
issues += self.error_handler.format_error_with_context(
537+
SchemaAttributeErrors.SCHEMA_ANNOTATION_SOURCE_MISSING,
538+
tag_name,
539+
annotation_value=annotation_value,
540+
source_text=rest_text_stripped,
541+
)
542+
543+
for issue in issues:
544+
issue["severity"] = ErrorSeverity.WARNING
545+
return issues
546+
331547
# -----------------------------------------------------------------------
332548
# Private helpers — attribute validation
333549
# -----------------------------------------------------------------------
@@ -388,8 +604,6 @@ def _run_validators(self, entry, attribute_name, validators):
388604
for validator in validators:
389605
self.error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name)
390606
new_issues = validator(self.hed_schema, entry, attribute_name)
391-
for issue in new_issues:
392-
issue["severity"] = ErrorSeverity.WARNING
393607
self.error_handler.add_context_and_filter(new_issues)
394608
issues += new_issues
395609
self.error_handler.pop_error_context()

hed/scripts/schema_script_util.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import os.path
22
from collections import defaultdict
33
from hed.schema import from_string, load_schema, from_dataframes
4-
from hed.errors import get_printable_issue_string, HedFileError, SchemaWarnings
4+
from hed.errors import get_printable_issue_string, HedFileError
5+
from hed.errors.error_types import ErrorSeverity
56
from hed.schema.schema_comparer import SchemaComparer
67

78
all_extensions = [".tsv", ".mediawiki", ".xml", ".json"]
@@ -23,7 +24,7 @@ def validate_schema_object(base_schema, schema_name):
2324
validation_issues = []
2425
try:
2526
issues = base_schema.check_compliance()
26-
issues = [issue for issue in issues if issue["code"] != SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED]
27+
issues = [issue for issue in issues if issue.get("severity", ErrorSeverity.ERROR) == ErrorSeverity.ERROR]
2728
if issues:
2829
error_message = get_printable_issue_string(issues, title=schema_name)
2930
validation_issues.append(error_message)

spec_tests/test_errors.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,14 @@ def _run_single_schema_test(self, info, error_code, all_codes, description, name
442442
try:
443443
loaded_schema = from_string(schema_string, schema_format=".mediawiki")
444444
issues = loaded_schema.check_compliance()
445+
# Filter annotation compliance warnings — these are noise from schemas
446+
# lacking complete ExternalAnnotations/Prefixes/Sources sections
447+
_ANNOTATION_CODES = {
448+
"SCHEMA_ANNOTATION_PREFIX_MISSING",
449+
"SCHEMA_ANNOTATION_EXTERNAL_MISSING",
450+
"SCHEMA_ANNOTATION_SOURCE_MISSING",
451+
}
452+
issues = [i for i in issues if i.get("code") not in _ANNOTATION_CODES]
445453
except HedFileError as e:
446454
issues = e.issues
447455
if not issues:

0 commit comments

Comments
 (0)