|
15 | 15 |
|
16 | 16 | from functools import partial |
17 | 17 |
|
| 18 | +import pandas as pd |
18 | 19 | from semantic_version import Version |
19 | 20 |
|
20 | 21 | from hed.errors.error_reporter import ErrorHandler, sort_issues |
|
28 | 29 | from hed.schema import hed_cache |
29 | 30 | from hed.schema.schema_validation import attribute_validators |
30 | 31 | from hed.schema.hed_schema import HedSchema, HedKey, HedSectionKey |
| 32 | +from hed.schema.schema_io import df_constants |
31 | 33 | from hed.schema.schema_validation.hed_id_validator import HedIDValidator |
32 | 34 | from hed.schema.schema_validation.compliance_summary import ComplianceSummary |
33 | 35 | from hed.schema.schema_validation.validation_util import ( |
@@ -77,6 +79,8 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl |
77 | 79 | issues += validator.check_invalid_characters() |
78 | 80 | issues += validator.check_attributes() |
79 | 81 | issues += validator.check_duplicate_names() |
| 82 | + issues += validator.check_extras_columns() |
| 83 | + issues += validator.check_annotation_attribute_values() |
80 | 84 |
|
81 | 85 | error_handler.pop_error_context() |
82 | 86 | issues = sort_issues(issues) |
@@ -328,6 +332,218 @@ def check_duplicate_names(self): |
328 | 332 | self.summary.record_issues(len(issues)) |
329 | 333 | return issues |
330 | 334 |
|
def check_extras_columns(self):
    """Validate that all extras DataFrames have non-empty values in required columns.

    For each extras section listed in ``df_constants.extras_column_dict``
    (Sources, Prefixes, ExternalAnnotations), checks that every cell in the
    required columns has a non-empty value. A cell counts as empty when it is
    missing (NaN/None) or is blank after stripping whitespace.

    Note:
        Missing columns are automatically added with empty strings during
        schema loading (see ``base2schema.fix_extra``), so only value
        presence needs to be checked here.

    Returns:
        list: A list of issue dicts, one ``SCHEMA_MISSING_EXTRA_VALUE`` issue
            per empty cell found in a required column.
    """
    self.summary.start_check(
        "extras_columns",
        "Validate extras sections have non-empty values in required columns.",
    )
    self.summary.add_sub_check("non-empty cell values")

    issues = []
    extras = getattr(self.hed_schema, "extras", {}) or {}
    for section_name, required_cols in df_constants.extras_column_dict.items():
        df = extras.get(section_name)
        # Use the same guard as the extras lookup helpers: a value that is
        # absent, not a DataFrame, or has no rows offers nothing to validate.
        # Without the isinstance check a non-DataFrame value would reach
        # ``df.columns`` below and raise AttributeError.
        if not isinstance(df, pd.DataFrame) or df.empty:
            continue

        self.summary.record_section(section_name, len(df))

        for col in required_cols:
            if col not in df.columns:
                continue
            column = df[col]
            # isna() catches NaN/None; the string comparison catches "" and
            # whitespace-only cells.
            mask = column.isna() | column.astype(str).str.strip().eq("")
            for row_idx in mask[mask].index:
                issues += ErrorHandler.format_error(
                    SchemaAttributeErrors.SCHEMA_MISSING_EXTRA_VALUE,
                    section_name=section_name,
                    column_name=col,
                    row_index=row_idx,
                )

    self.error_handler.add_context_and_filter(issues)
    self.summary.record_issues(len(issues))
    return issues
| 379 | + |
def check_annotation_attribute_values(self):
    """Check every ``annotation`` attribute against the schema's extras sections.

    Each schema entry carrying a non-empty ``annotation`` attribute has its
    value split on commas; every non-blank piece is passed to
    ``_validate_annotation_value`` together with lookup sets built from the
    Prefixes, ExternalAnnotations and Sources extras sections. Per the
    sub-checks registered below, the validation covers:

    1. The ``prefix:`` of the value is defined in the Prefixes section.
    2. The ``prefix:`` + ``id`` pair is a row of the ExternalAnnotations section.
    3. A ``dc:source`` annotation references an entry of the Sources section.

    Returns:
        list: The issue dicts produced for all annotation values.
    """
    summary = self.summary
    summary.start_check(
        "annotation_attributes",
        "Validate annotation attribute values reference defined prefixes, external annotations, and sources.",
    )
    for sub_check in (
        "prefix defined in Prefixes",
        "prefix:id in ExternalAnnotations",
        "dc:source references valid Sources entry",
    ):
        summary.add_sub_check(sub_check)

    # Lookup sets are built once, up front, from the extras tables.
    extras = getattr(self.hed_schema, "extras", {}) or {}
    prefixes = self._get_extras_column_values(extras, df_constants.PREFIXES_KEY, df_constants.prefix)
    pairs = self._get_external_annotation_pairs(extras)
    sources = self._get_extras_column_values(extras, df_constants.SOURCES_KEY, df_constants.source)

    handler = self.error_handler
    found = []
    annotated_count = 0
    for section_key in HedSectionKey:
        handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key))
        for entry in self.hed_schema[section_key].values():
            raw_value = entry.attributes.get("annotation")
            if raw_value:
                annotated_count += 1
                handler.push_error_context(ErrorContext.SCHEMA_TAG, entry.name)
                # One attribute may hold several comma-separated annotations.
                for piece in map(str.strip, raw_value.split(",")):
                    if piece:
                        found.extend(self._validate_annotation_value(entry, piece, prefixes, pairs, sources))
                handler.pop_error_context()
        handler.pop_error_context()

    summary.record_section("annotation_entries", annotated_count)
    summary.record_issues(len(found))
    return found
| 430 | + |
| 431 | + # ----------------------------------------------------------------------- |
| 432 | + # Private helpers — extras / annotation validation |
| 433 | + # ----------------------------------------------------------------------- |
| 434 | + |
@staticmethod
def _get_extras_column_values(extras, section_key, column_name):
    """Collect the distinct non-blank values found in one column of an extras table.

    Parameters:
        extras (dict): The schema extras dictionary.
        section_key (str): Key into the extras dict (e.g. "Prefixes").
        column_name (str): The column whose values to collect.

    Returns:
        set: Whitespace-stripped string forms of the column's values, leaving
            out missing (NaN/None) and blank cells. The set is empty when the
            section is absent, is not a DataFrame, has no rows, or lacks the
            requested column.
    """
    table = extras.get(section_key)
    usable = isinstance(table, pd.DataFrame) and not table.empty and column_name in table.columns
    if not usable:
        return set()

    collected = set()
    for cell in table[column_name]:
        if pd.isna(cell):
            continue
        text = str(cell).strip()
        if text:
            collected.add(text)
    return collected
| 453 | + |
@staticmethod
def _get_external_annotation_pairs(extras):
    """Return a set of (prefix, id) tuples from the ExternalAnnotations DataFrame.

    Parameters:
        extras (dict): The schema extras dictionary.

    Returns:
        set: Set of (prefix_str, id_str) tuples, both whitespace-stripped.
            Rows where either value is missing (NaN/None) or blank are left
            out. The set is empty when the section is absent, is not a
            DataFrame, has no rows, or lacks the prefix or id column.
    """
    df = extras.get(df_constants.EXTERNAL_ANNOTATION_KEY)
    if not isinstance(df, pd.DataFrame) or df.empty:
        return set()

    prefix_col, id_col = df_constants.prefix, df_constants.id
    if prefix_col not in df.columns or id_col not in df.columns:
        return set()

    def _clean(value):
        # Missing cells become "" so they fail the truthiness test below.
        return str(value).strip() if pd.notna(value) else ""

    pairs = set()
    # Walk the two columns in lockstep instead of using iterrows(): iterrows()
    # builds a Series per row and does not preserve each column's dtype.
    for raw_prefix, raw_id in zip(df[prefix_col], df[id_col]):
        prefix_text, id_text = _clean(raw_prefix), _clean(raw_id)
        if prefix_text and id_text:
            pairs.add((prefix_text, id_text))
    return pairs
| 475 | + |
def _validate_annotation_value(self, entry, annotation_value, defined_prefixes, external_pairs, defined_sources):
    """Validate a single annotation attribute value.

    The value is expected to look like ``prefix:id rest_of_text``, e.g.
    ``dc:source Beniczky ea 2017 Table 2.``. Every issue returned by this
    method is given WARNING severity, including the one reported when the
    value cannot be parsed at all.

    Parameters:
        entry: The schema entry with the annotation attribute.
        annotation_value (str): The annotation value string.
        defined_prefixes (set): Valid prefixes from the Prefixes section.
        external_pairs (set): Valid (prefix, id) pairs from ExternalAnnotations.
        defined_sources (set): Valid source names from the Sources section.

    Returns:
        list: A list of issue dicts.
    """
    issues = []
    tag_name = entry.name

    colon_pos = annotation_value.find(":")
    if colon_pos < 1:
        # Either there is no colon (-1) or the value starts with one (0); in
        # both cases there is no prefix to look up, so the remaining checks
        # are skipped.
        issues += self.error_handler.format_error_with_context(
            SchemaAttributeErrors.SCHEMA_ANNOTATION_PREFIX_MISSING,
            tag_name,
            annotation_value=annotation_value,
            prefix="(none)",
        )
    else:
        ann_prefix = annotation_value[: colon_pos + 1]  # e.g. "dc:"
        remainder = annotation_value[colon_pos + 1 :]  # e.g. "source Beniczky ea 2017 Table 2."

        # The id is the first whitespace-delimited token after the colon;
        # whatever follows it is free text.
        parts = remainder.split(None, 1)
        ann_id = parts[0] if parts else remainder  # e.g. "source"
        rest_text = parts[1] if len(parts) > 1 else ""  # e.g. "Beniczky ea 2017 Table 2."

        # Check 1: prefix must be in Prefixes
        if ann_prefix not in defined_prefixes:
            issues += self.error_handler.format_error_with_context(
                SchemaAttributeErrors.SCHEMA_ANNOTATION_PREFIX_MISSING,
                tag_name,
                annotation_value=annotation_value,
                prefix=ann_prefix,
            )

        # Check 2: prefix:id must be in ExternalAnnotations
        if (ann_prefix, ann_id) not in external_pairs:
            issues += self.error_handler.format_error_with_context(
                SchemaAttributeErrors.SCHEMA_ANNOTATION_EXTERNAL_MISSING,
                tag_name,
                annotation_value=annotation_value,
                prefix=ann_prefix,
                annotation_id=ann_id,
            )

        # Check 3: for dc:source, the free text must start with a defined source name
        if ann_prefix == "dc:" and ann_id == "source":
            source_text = rest_text.strip()
            # str.startswith accepts a tuple of candidates; an empty tuple never matches.
            if not source_text or not source_text.startswith(tuple(defined_sources)):
                issues += self.error_handler.format_error_with_context(
                    SchemaAttributeErrors.SCHEMA_ANNOTATION_SOURCE_MISSING,
                    tag_name,
                    annotation_value=annotation_value,
                    source_text=source_text,
                )

    # Single exit point: the severity is applied on every path, so the
    # unparsable-value issue is reported at the same level as the others.
    for issue in issues:
        issue["severity"] = ErrorSeverity.WARNING
    return issues
| 546 | + |
331 | 547 | # ----------------------------------------------------------------------- |
332 | 548 | # Private helpers — attribute validation |
333 | 549 | # ----------------------------------------------------------------------- |
@@ -388,8 +604,6 @@ def _run_validators(self, entry, attribute_name, validators): |
388 | 604 | for validator in validators: |
389 | 605 | self.error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) |
390 | 606 | new_issues = validator(self.hed_schema, entry, attribute_name) |
391 | | - for issue in new_issues: |
392 | | - issue["severity"] = ErrorSeverity.WARNING |
393 | 607 | self.error_handler.add_context_and_filter(new_issues) |
394 | 608 | issues += new_issues |
395 | 609 | self.error_handler.pop_error_context() |
|
0 commit comments