|
| 1 | +"""Utility functions for HED ID assignment and validation. |
| 2 | +
|
| 3 | +This module handles HED ID ranges, validation, and assignment for schema elements. |
| 4 | +For ontology/OMN conversion functionality, see the hed-ontology repository. |
| 5 | +""" |
| 6 | + |
| 7 | +import pandas as pd |
| 8 | + |
| 9 | +from hed.schema.schema_io import schema_util |
| 10 | +from hed.errors.exceptions import HedFileError |
| 11 | +from hed.schema.hed_schema_constants import HedKey |
| 12 | +from hed.schema.schema_io.df_util import remove_prefix |
| 13 | +from hed.schema.hed_cache import get_library_data |
| 14 | +from hed.schema.schema_io import df_constants as constants |
| 15 | + |
# Per-section id offsets, keyed by dataframe section key.
# Each value is a (start_offset, end_offset) pair that _get_hedid_range adds to the
# first id of a library's "id_range"; the resulting end is exclusive because the
# ids are produced with range(start, end).
# No section is assigned the offsets between 900 and 1300, or between 1700 and 2000.
object_type_id_offset = {
    constants.OBJECT_KEY: (100, 300),
    constants.DATA_KEY: (300, 500),
    constants.ANNOTATION_KEY: (500, 700),
    constants.ATTRIBUTE_PROPERTY_KEY: (700, 900),
    constants.VALUE_CLASS_KEY: (1300, 1400),
    constants.UNIT_MODIFIER_KEY: (1400, 1500),
    constants.UNIT_CLASS_KEY: (1500, 1600),
    constants.UNIT_KEY: (1600, 1700),
    constants.TAG_KEY: (2000, -1),  # -1 = go to end of range
}
| 27 | + |
| 28 | + |
def _get_hedid_range(schema_name, df_key):
    """Build the set of HED id numbers reserved for one section of a schema.

    Parameters:
        schema_name(str): The known schema name with an assigned id range.
        df_key(str): The dataframe section whose range is wanted; a key of object_type_id_offset.

    Returns:
        set: Every integer id in the requested range, or an empty set when no
            library data is found for schema_name.

    Raises:
        NotImplementedError: If df_key is the struct section.
    """
    if df_key == constants.STRUCT_KEY:
        raise NotImplementedError("Cannot assign hed_ids struct section")

    library_info = get_library_data(schema_name)
    if not library_info:
        return set()
    library_first_id, library_last_id = library_info["id_range"]

    offset_start, offset_end = object_type_id_offset[df_key]
    # We always skip 1 for tags
    tag_skip = 1 if df_key == constants.TAG_KEY else 0
    first_id = library_first_id + offset_start + tag_skip

    if offset_end == -1:
        # Add one since the versions on hed-schemas are set to max_value - 1
        stop_id = library_last_id + 1
    else:
        stop_id = library_first_id + offset_end
    return set(range(first_id, stop_id))
| 58 | + |
| 59 | + |
def get_all_ids(df):
    """Collect the numeric part of every hedId present in the dataframe.

    Parameters:
        df(pd.DataFrame): The dataframe to scan.

    Returns:
        Union[Set, None]: None if the dataframe has no hedId column, otherwise the set of
            all unique id numbers. Values that are not numeric after the "HED_" prefix
            is removed are dropped.
    """
    if constants.hed_id not in df.columns:
        return None

    stripped = df[constants.hed_id].apply(lambda value: remove_prefix(value, "HED_"))
    # Non-numeric leftovers (e.g. blank ids) become NaN and are discarded.
    numeric = pd.to_numeric(stripped, errors="coerce")
    id_numbers = numeric.dropna().astype(int)
    return set(id_numbers.unique())
| 74 | + |
| 75 | + |
def update_dataframes_from_schema(dataframes, schema, schema_name="", assign_missing_ids=False):
    """Write out schema as a dataframe, then merge in extra columns from dataframes.

    Parameters:
        dataframes(dict): A full set of schema spreadsheet formatted dataframes
        schema(HedSchema): The schema to write into the dataframes
        schema_name(str): The name to use to find the schema id range. Defaults to schema.library.
        assign_missing_ids(bool): If True, replace any blank (new) HedIds with valid ones

    Returns:
        dict[str:pd.DataFrames]: The updated dataframes. These dataframes can potentially have extra columns.

    Raises:
        HedFileError: If any hedId in the dataframes conflicts with the schema. The individual
            problems are available on the exception's issues.
    """
    hedid_errors = []
    if not schema_name:
        schema_name = schema.library

    # 1. Verify existing HED ids don't conflict between schema/dataframes
    for df_key, df in dataframes.items():
        # NOTE(review): steps 3 and 4 below filter on constants.DF_EXTRAS, while this step
        # filters on constants.DF_SUFFIXES -- confirm this is the intended set of keys to skip.
        if df_key in constants.DF_SUFFIXES:
            continue
        section_key = constants.section_mapping_hed_id.get(df_key)
        if not section_key:
            continue
        section = schema[section_key]

        unused_tag_ids = _get_hedid_range(schema_name, df_key)
        hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids)

    if hedid_errors:
        raise HedFileError(
            hedid_errors[0]["code"],
            f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues "
            f"parameter on this exception for more details.",
            schema.name,
            issues=hedid_errors,
        )

    # 2. Get the new schema as DFs
    from hed.schema.schema_io.schema2df import Schema2DF  # Late import as this is recursive

    output_dfs = Schema2DF().process_schema(schema, save_merged=False)

    if assign_missing_ids:
        # 3: Add any HED ID's as needed to these generated dfs
        for df_key, df in output_dfs.items():
            if df_key == constants.STRUCT_KEY or df_key in constants.DF_EXTRAS:
                continue
            unused_tag_ids = _get_hedid_range(schema_name, df_key)

            # If no errors, assign new HED ID's
            assign_hed_ids_section(df, unused_tag_ids)

    # 4: Merge the dataframes
    for df_key, out_df in output_dfs.items():
        if df_key in constants.DF_EXTRAS:
            continue
        source_df = dataframes.get(df_key)
        if source_df is None:
            # The caller supplied nothing for this section, so there are no extra
            # columns to merge; step 1 already tolerates partial input the same way.
            continue
        merge_dfs(out_df, source_df)

    return output_dfs
| 137 | + |
| 138 | + |
def _verify_hedid_matches(section, df, unused_tag_ids):
    """Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema

    Parameters:
        section(HedSchemaSection): The loaded schema section to compare ID's with
        df(pd.DataFrame): The loaded spreadsheet dataframe to compare with
        unused_tag_ids(set): The valid range of IDs for this df. May be empty, in which case
            every id in the dataframe is reported as out of range.

    Returns:
        list: A list of errors found matching IDs, as produced by schema_util.format_error.
    """
    hedid_errors = []

    # Describe the valid range once. min()/max() raise ValueError on an empty set, so
    # an empty range gets its own wording instead of being evaluated per error.
    if unused_tag_ids:
        valid_range_text = f"{min(unused_tag_ids)} to {max(unused_tag_ids)}"
    else:
        valid_range_text = "empty (no id range is available for this schema)"

    for row_number, row in df.iterrows():
        if not any(row):
            continue
        label = row[constants.name]
        # Spreadsheet placeholder rows end in "-#"; the schema stores them as "/#".
        if label.endswith("-#"):
            label = label.replace("-#", "/#")
        df_id = row[constants.hed_id]
        entry = section.get(label)
        if not entry:
            # Neither side has a hedID, so nothing to do.
            if not df_id:
                continue
            hedid_errors += schema_util.format_error(
                row_number, row, f"'{label}' does not exist in schema file only the spreadsheet."
            )
            continue
        entry_id = entry.attributes.get(HedKey.HedID)
        if df_id:
            if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")):
                hedid_errors += schema_util.format_error(
                    row_number, row, f"'{label}' has an improperly formatted hedID in dataframe."
                )
                continue
            id_value = remove_prefix(df_id, "HED_")
            # Only the int() conversion is guarded, so a ValueError can only mean
            # that the id is non-numeric.
            try:
                id_int = int(id_value)
            except ValueError:
                hedid_errors += schema_util.format_error(
                    row_number, row, f"'{label}' has a non-numeric hedID in the dataframe."
                )
                continue
            if id_int not in unused_tag_ids:
                hedid_errors += schema_util.format_error(
                    row_number,
                    row,
                    f"'{label}' has id {id_int} which is outside "
                    + "of the valid range for this type. Valid range is: "
                    + valid_range_text,
                )
                continue

        if entry_id and entry_id != df_id:
            hedid_errors += schema_util.format_error(
                row_number, row, f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema."
            )
            continue

    return hedid_errors
| 199 | + |
| 200 | + |
def assign_hed_ids_section(df, unused_tag_ids):
    """Adds missing HedIds to dataframe.

    Rows whose hedId cell is empty receive the lowest id still available. The new ids
    are written directly into df.

    Parameters:
        df(pd.DataFrame): The dataframe to add id's to. Modified in place.
        unused_tag_ids(set of int): The possible HED id's to assign from. Ids already
            present in df are removed from this set in place.

    Raises:
        IndexError: If there are more rows needing an id than ids left in unused_tag_ids.
    """
    # Remove already used ids
    unused_tag_ids -= get_all_ids(df)
    # Sorted descending so that pop() hands out the smallest remaining id in O(1).
    sorted_unused_ids = sorted(unused_tag_ids, reverse=True)

    id_column = df.columns.get_loc(constants.hed_id)
    # Iterate over a snapshot of the column and write through df.iat: the row objects
    # yielded by iterrows() may be copies, so assigning into them is not guaranteed
    # to update the dataframe.
    for position, hed_id in enumerate(df[constants.hed_id].tolist()):
        # we already verified existing ones
        if hed_id:
            continue
        if not sorted_unused_ids:
            raise IndexError("No unused HED ids remain in the valid range for this section")
        df.iat[position, id_column] = f"HED_{sorted_unused_ids.pop():07d}"
| 219 | + |
| 220 | + |
def merge_dfs(dest_df, source_df):
    """Merges extra columns from source_df into dest_df, adding the extra columns from the ontology to the schema df.

    Rows are matched on the 'rdfs:label' column. For each source row, the first dest_df
    row with the same label receives the values of every column that source_df has and
    dest_df did not have when the call started. Existing dest_df columns are never
    overwritten, and null labels never match. dest_df is modified in place.

    Parameters:
        dest_df (DataFrame): The dataframe to add extra columns to
        source_df (DataFrame): The dataframe to get extra columns from
    """
    label_column = "rdfs:label"

    # Both of these are loop invariants: the set of extra columns is fixed by the
    # columns dest_df has on entry, and the label column is never written to.
    extra_columns = [col for col in source_df.columns if col not in dest_df.columns]
    if source_df.empty or not extra_columns:
        return

    # Map each label to the index of its first row in dest_df, built once instead of
    # scanning the whole label column for every source row.
    first_index_for_label = {}
    for dest_index, dest_label in zip(dest_df.index, dest_df[label_column]):
        if pd.isna(dest_label):
            continue
        first_index_for_label.setdefault(dest_label, dest_index)

    for _index, row in source_df.iterrows():
        source_label = row[label_column]
        if source_label not in first_index_for_label:
            continue
        target_index = first_index_for_label[source_label]
        for col in extra_columns:
            dest_df.at[target_index, col] = row[col]
0 commit comments