Skip to content

Commit 016bb62

Browse files
committed
Removed the ontology-related code per issue hed-standard#1177
1 parent 0e1d76c commit 016bb62

8 files changed

Lines changed: 254 additions & 583 deletions

File tree

docs/user_guide.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -625,9 +625,9 @@ hedpy schema add-ids /path/to/hed-schemas score 2.2.0
625625
1. Validate schema thoroughly before adding IDs
626626
2. Convert to all formats and verify equivalence
627627
3. Add HED IDs only once - they should remain stable
628-
4. Generate ontology after IDs are added
629-
5. Verify that the created ontology is valid using [Protégé](https://protege.stanford.edu/)
630-
6. Commit changes to version control before moving to stable release
628+
4. Commit changes to version control before moving to stable release
629+
630+
**Note:** Ontology generation (OMN/OWL format) has been moved to the separate [hed-ontology](https://github.com/hed-standard/hed-ontology) repository.
631631

632632
______________________________________________________________________
633633

hed/schema/schema_attribute_validator_hed_id.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from hed.schema.schema_io.ontology_util import get_library_data
1+
from hed.schema.hed_cache import get_library_data
22
from hed.schema.schema_io.df_util import remove_prefix
33
from semantic_version import Version
44
from hed.schema.hed_schema_io import load_schema_version
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
"""Utility functions for HED ID assignment and validation.
2+
3+
This module handles HED ID ranges, validation, and assignment for schema elements.
4+
For ontology/OMN conversion functionality, see the hed-ontology repository.
5+
"""
6+
7+
import pandas as pd
8+
9+
from hed.schema.schema_io import schema_util
10+
from hed.errors.exceptions import HedFileError
11+
from hed.schema.hed_schema_constants import HedKey
12+
from hed.schema.schema_io.df_util import remove_prefix
13+
from hed.schema.hed_cache import get_library_data
14+
from hed.schema.schema_io import df_constants as constants
15+
16+
# Per-section HED ID offsets as (start, end) pairs. `_get_hedid_range` adds the
# first ID of a schema's assigned range to both values; an end of -1 means the
# section extends to the end of the schema's range instead.
object_type_id_offset = {
    # Property-style sections
    constants.OBJECT_KEY: (100, 300),
    constants.DATA_KEY: (300, 500),
    constants.ANNOTATION_KEY: (500, 700),
    constants.ATTRIBUTE_PROPERTY_KEY: (700, 900),
    # Value class / unit sections
    constants.VALUE_CLASS_KEY: (1300, 1400),
    constants.UNIT_MODIFIER_KEY: (1400, 1500),
    constants.UNIT_CLASS_KEY: (1500, 1600),
    constants.UNIT_KEY: (1600, 1700),
    # Tags take everything that is left
    constants.TAG_KEY: (2000, -1),  # -1 = go to end of range
}
27+
28+
29+
def _get_hedid_range(schema_name, df_key):
    """Return every HED ID available to one section type of a named schema.

    Parameters:
        schema_name (str): The known schema name with an assigned id range.
        df_key (str): The dataframe section whose range is wanted; a key of object_type_id_offset.

    Returns:
        set: All integer ids in the requested range. Empty if no library data is found for schema_name.

    Raises:
        NotImplementedError: If df_key is the struct section, which has no assignable ids.
    """
    if df_key == constants.STRUCT_KEY:
        raise NotImplementedError("Cannot assign hed_ids struct section")

    library_data = get_library_data(schema_name)
    if not library_data:
        return set()

    range_start, range_end = library_data["id_range"]
    offset_start, offset_end = object_type_id_offset[df_key]

    # We always skip 1 for tags
    tag_skip = 1 if df_key == constants.TAG_KEY else 0
    first_id = range_start + offset_start + tag_skip

    if offset_end == -1:
        # Add one since the versions on hed-schemas are set to max_value - 1
        stop_id = range_end + 1
    else:
        stop_id = range_start + offset_end

    return set(range(first_id, stop_id))
58+
59+
60+
def get_all_ids(df):
    """Collect the numeric part of every hedId present in a dataframe.

    Parameters:
        df (pd.DataFrame): The dataframe to scan.

    Returns:
        Union[Set, None]: None if the dataframe has no hedId column, otherwise the set of
            unique integer ids. Entries that are not numeric after removing the "HED_"
            prefix are ignored.
    """
    if constants.hed_id not in df.columns:
        return None

    stripped = df[constants.hed_id].apply(lambda value: remove_prefix(value, "HED_"))
    # errors="coerce" turns blanks and malformed ids into NaN so they can be dropped
    numeric_ids = pd.to_numeric(stripped, errors="coerce").dropna().astype(int)
    return set(numeric_ids.unique())
74+
75+
76+
def update_dataframes_from_schema(dataframes, schema, schema_name="", assign_missing_ids=False):
    """Regenerate the schema dataframes and merge in extra columns from existing ones.

    Parameters:
        dataframes (dict): A full set of schema spreadsheet formatted dataframes.
        schema (HedSchema): The schema to write into the dataframes.
        schema_name (str): The name used to find the schema id range. Defaults to schema.library.
        assign_missing_ids (bool): If True, replace any blank (new) hedIds with valid ones.

    Returns:
        dict[str, pd.DataFrame]: The updated dataframes. These can potentially have extra columns.

    Raises:
        HedFileError: If any hedId mismatch is found between the schema and the dataframes.
            The individual problems are passed as the exception's ``issues`` argument.
    """
    schema_name = schema_name or schema.library

    # 1. Verify existing HED ids don't conflict between schema/dataframes
    mismatch_issues = []
    for df_key, existing_df in dataframes.items():
        # NOTE(review): steps 3 and 4 skip constants.DF_EXTRAS, whereas this step skips
        # constants.DF_SUFFIXES - confirm that this is the intended set.
        if df_key in constants.DF_SUFFIXES:
            continue
        section_key = constants.section_mapping_hed_id.get(df_key)
        if not section_key:
            continue

        valid_ids = _get_hedid_range(schema_name, df_key)
        mismatch_issues += _verify_hedid_matches(schema[section_key], existing_df, valid_ids)

    if mismatch_issues:
        raise HedFileError(
            mismatch_issues[0]["code"],
            f"{len(mismatch_issues)} issues found with hedId mismatches. See the .issues "
            f"parameter on this exception for more details.",
            schema.name,
            issues=mismatch_issues,
        )

    # 2. Get the new schema as DFs
    from hed.schema.schema_io.schema2df import Schema2DF  # Late import as this is recursive

    output_dfs = Schema2DF().process_schema(schema, save_merged=False)

    # 3. Add any HED ID's as needed to these generated dfs (existing ones were verified above)
    if assign_missing_ids:
        for df_key, generated_df in output_dfs.items():
            if df_key == constants.STRUCT_KEY or df_key in constants.DF_EXTRAS:
                continue
            assign_hed_ids_section(generated_df, _get_hedid_range(schema_name, df_key))

    # 4. Merge the dataframes
    for df_key, generated_df in output_dfs.items():
        if df_key in constants.DF_EXTRAS:
            continue
        merge_dfs(generated_df, dataframes[df_key])

    return output_dfs
137+
138+
139+
def _verify_hedid_matches(section, df, unused_tag_ids):
    """Verify that hedIds in the dataframe agree with the schema section.

    Checks that every dataframe entry with a hedId exists in the schema, that each hedId
    is well formed and inside the valid range, and that it matches the schema's hedId
    when the schema has one.

    Parameters:
        section (HedSchemaSection): The loaded schema section to compare ID's with.
        df (pd.DataFrame): The loaded spreadsheet dataframe to compare with.
        unused_tag_ids (set): The valid range of IDs for this df. May be empty.

    Returns:
        list[dict]: The errors found while matching IDs, as produced by schema_util.format_error.
    """
    hedid_errors = []
    expected_id_length = len("HED_0000000")
    for row_number, row in df.iterrows():
        if not any(row):
            continue
        label = row[constants.name]
        if label.endswith("-#"):
            label = label.replace("-#", "/#")
        df_id = row[constants.hed_id]
        entry = section.get(label)
        if not entry:
            # Neither side has a hedID, so nothing to do.
            if not df_id:
                continue
            hedid_errors += schema_util.format_error(
                row_number, row, f"'{label}' does not exist in schema file only the spreadsheet."
            )
            continue
        entry_id = entry.attributes.get(HedKey.HedID)
        if df_id:
            if not (df_id.startswith("HED_") and len(df_id) == expected_id_length):
                hedid_errors += schema_util.format_error(
                    row_number, row, f"'{label}' has an improperly formatted hedID in dataframe."
                )
                continue
            id_value = remove_prefix(df_id, "HED_")
            # Keep the try body to the conversion only, so a ValueError raised elsewhere
            # (e.g. min() of an empty range) is never misreported as a non-numeric id.
            try:
                id_int = int(id_value)
            except ValueError:
                hedid_errors += schema_util.format_error(
                    row_number, row, f"'{label}' has a non-numeric hedID in the dataframe."
                )
                continue

            if id_int not in unused_tag_ids:
                if unused_tag_ids:
                    range_text = f"Valid range is: {min(unused_tag_ids)} to {max(unused_tag_ids)}"
                else:
                    # An empty range has no min/max to report.
                    range_text = "No valid range is defined for this schema and type."
                hedid_errors += schema_util.format_error(
                    row_number,
                    row,
                    f"'{label}' has id {id_int} which is outside "
                    + "of the valid range for this type. "
                    + range_text,
                )
                continue

        if entry_id and entry_id != df_id:
            hedid_errors += schema_util.format_error(
                row_number, row, f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema."
            )
            continue

    return hedid_errors
199+
200+
201+
def assign_hed_ids_section(df, unused_tag_ids):
    """Add missing hedIds to a dataframe in place.

    Rows whose hedId is blank receive the lowest ids still available, in row order.
    Rows that already have a hedId are left alone. Does nothing if the dataframe has
    no hedId column.

    Parameters:
        df (pd.DataFrame): The dataframe to add id's to. Modified in place.
        unused_tag_ids (set of int): The possible HED id's to assign from. Ids already
            present in df are removed from this set in place.

    Raises:
        IndexError: If there are more blank rows than available ids.
    """
    used_ids = get_all_ids(df)
    if used_ids is None:
        # No hedId column at all, so there is nothing to fill in.
        return

    # Remove already used ids
    unused_tag_ids -= used_ids
    available_ids = iter(sorted(unused_tag_ids))

    # Snapshot the column first so the dataframe is not written to while it is iterated.
    for row_index, current_id in list(df[constants.hed_id].items()):
        # we already verified existing ones
        if current_id:
            continue
        next_id = next(available_ids, None)
        if next_id is None:
            raise IndexError("No unused HED ids remain in the range available for this section.")
        # Write through the dataframe: the row objects yielded by iterrows() may be
        # copies, so assigning to them is not guaranteed to update df.
        df.at[row_index, constants.hed_id] = f"HED_{next_id:07d}"
219+
220+
221+
def merge_dfs(dest_df, source_df):
    """Copy columns that exist only in source_df into dest_df, matching rows on 'rdfs:label'.

    dest_df is modified in place. Columns already present in dest_df are never overwritten.
    When a label occurs more than once in dest_df only its first row is updated, and when
    it occurs more than once in source_df the last occurrence wins. Rows of dest_df
    without a matching label are left without a value (NaN) in the added columns.

    Parameters:
        dest_df (DataFrame): The dataframe to add extra columns to.
        source_df (DataFrame): The dataframe to get extra columns from.
    """
    label_column = "rdfs:label"
    original_columns = set(dest_df.columns)
    extra_columns = [col for col in source_df.columns if col not in original_columns]
    if not extra_columns or source_df.empty:
        return

    # Build the label -> first matching row index lookup once instead of rescanning
    # dest_df for every source row.
    first_index_by_label = {}
    for row_index, label in zip(dest_df.index, dest_df[label_column]):
        if pd.isna(label):
            continue  # missing labels never match anything
        first_index_by_label.setdefault(label, row_index)

    for _index, row in source_df.iterrows():
        label = row[label_column]
        if pd.isna(label) or label not in first_index_by_label:
            continue
        target_index = first_index_by_label[label]
        for col in extra_columns:
            dest_df.at[target_index, col] = row[col]

0 commit comments

Comments
 (0)