VisLab
diff --git a/docs/user_guide.md b/docs/user_guide.md
Lines changed: 3 additions & 3 deletions
diff --git a/hed/schema/schema_attribute_validator_hed_id.py b/hed/schema/schema_attribute_validator_hed_id.py
Lines changed: 1 addition & 1 deletion
diff --git a/hed/schema/schema_io/hed_id_util.py b/hed/schema/schema_io/hed_id_util.py
Lines changed: 236 additions & 0 deletions
@@ -625,9 +625,9 @@ hedpy schema add-ids /path/to/hed-schemas score 2.2.0
 1. Validate schema thoroughly before adding IDs
 2. Convert to all formats and verify equivalence
 3. Add HED IDs only once - they should remain stable
-4. Generate ontology after IDs are added
-5. Verify that the created ontology is valid using [Protégé](https://protege.stanford.edu/)
-6. Commit changes to version control before moving to stable release
+4. Commit changes to version control before moving to stable release
+
+**Note:** Ontology generation (OMN/OWL format) has been moved to the separate [hed-ontology](https://github.com/hed-standard/hed-ontology) repository.
 
 ______________________________________________________________________
 
 
@@ -1,4 +1,4 @@
-from hed.schema.schema_io.ontology_util import get_library_data
+from hed.schema.hed_cache import get_library_data
 from hed.schema.schema_io.df_util import remove_prefix
 from semantic_version import Version
 from hed.schema.hed_schema_io import load_schema_version
 
@@ -0,0 +1,236 @@
+"""Utility functions for HED ID assignment and validation.
+
+This module handles HED ID ranges, validation, and assignment for schema elements.
+For ontology/OMN conversion functionality, see the hed-ontology repository.
+"""
+
+import pandas as pd
+
+from hed.schema.schema_io import schema_util
+from hed.errors.exceptions import HedFileError
+from hed.schema.hed_schema_constants import HedKey
+from hed.schema.schema_io.df_util import remove_prefix
+from hed.schema.hed_cache import get_library_data
+from hed.schema.schema_io import df_constants as constants
+
+# HED id offsets for each dataframe section, keyed by the section's constant.
+# Each value is a (start, end) pair that _get_hedid_range adds to the library's
+# starting id; the resulting range is half-open, so the end value is excluded.
+# Offsets 900-1300 and 1700-2000 are not assigned to any section in this table.
+object_type_id_offset = {
+    constants.OBJECT_KEY: (100, 300),
+    constants.DATA_KEY: (300, 500),
+    constants.ANNOTATION_KEY: (500, 700),
+    constants.ATTRIBUTE_PROPERTY_KEY: (700, 900),
+    constants.VALUE_CLASS_KEY: (1300, 1400),
+    constants.UNIT_MODIFIER_KEY: (1400, 1500),
+    constants.UNIT_CLASS_KEY: (1500, 1600),
+    constants.UNIT_KEY: (1600, 1700),
+    constants.TAG_KEY: (2000, -1),  # -1 = go to end of range
+}
+
+
+def _get_hedid_range(schema_name, df_key):
+    """Get the set of HedId's for this object type/schema name.
+
+    Parameters:
+        schema_name(str): The known schema name with an assigned id range.
+        df_key(str): The dataframe range type we're interested in.  a key from constants.DF_SUFFIXES.
+
+    Returns:
+        set: A set of all id's in the requested range.
+    """
+    if df_key == constants.STRUCT_KEY:
+        raise NotImplementedError("Cannot assign hed_ids struct section")
+
+    library_data = get_library_data(schema_name)
+    if not library_data:
+        return set()
+    starting_id, ending_id = library_data["id_range"]
+
+    start_object_range, end_object_range = object_type_id_offset[df_key]
+    if df_key == constants.TAG_KEY:
+        initial_tag_adj = 1  # We always skip 1 for tags
+    else:
+        initial_tag_adj = 0
+    final_start = starting_id + start_object_range + initial_tag_adj
+    final_end = starting_id + end_object_range
+    if end_object_range == -1:
+        # Add one since the versions on hed-schemas are set to max_value - 1
+        final_end = ending_id + 1
+    return set(range(final_start, final_end))
+
+
+def get_all_ids(df):
+    """Returns a set of all unique hedIds in the dataframe
+
+    Parameters:
+        df(pd.DataFrame): The dataframe
+
+    Returns:
+        Union[Set, None]: None if this has no HED column, otherwise all unique numbers as a set.
+    """
+    if constants.hed_id in df.columns:
+        modified_df = df[constants.hed_id].apply(lambda x: remove_prefix(x, "HED_"))
+        modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int)
+        return set(modified_df.unique())
+    return None
+
+
+def update_dataframes_from_schema(dataframes, schema, schema_name="", assign_missing_ids=False):
+    """Write out schema as a dataframe, then merge in extra columns from dataframes.
+
+    Parameters:
+        dataframes(dict): A full set of schema spreadsheet formatted dataframes
+        schema(HedSchema): The schema to write into the dataframes:
+        schema_name(str): The name to use to find the schema id range.
+        assign_missing_ids(bool): If True, replacing any blank(new) HedIds with valid ones
+
+    Returns:
+        dict[str:pd.DataFrames]: The updated dataframes. These dataframes can potentially have extra columns.
+
+    """
+    hedid_errors = []
+    if not schema_name:
+        schema_name = schema.library
+    # 1. Verify existing HED ids don't conflict between schema/dataframes
+    for df_key, df in dataframes.items():
+        if df_key in constants.DF_SUFFIXES:
+            continue
+        section_key = constants.section_mapping_hed_id.get(df_key)
+        if not section_key:
+            continue
+        section = schema[section_key]
+
+        unused_tag_ids = _get_hedid_range(schema_name, df_key)
+        hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids)
+
+    if hedid_errors:
+        raise HedFileError(
+            hedid_errors[0]["code"],
+            f"{len(hedid_errors)} issues found with hedId mismatches.  See the .issues "
+            f"parameter on this exception for more details.",
+            schema.name,
+            issues=hedid_errors,
+        )
+
+    # 2. Get the new schema as DFs
+    from hed.schema.schema_io.schema2df import Schema2DF  # Late import as this is recursive
+
+    output_dfs = Schema2DF().process_schema(schema, save_merged=False)
+
+    if assign_missing_ids:
+        # 3: Add any HED ID's as needed to these generated dfs
+        for df_key, df in output_dfs.items():
+            if df_key == constants.STRUCT_KEY or df_key in constants.DF_EXTRAS:
+                continue
+            unused_tag_ids = _get_hedid_range(schema_name, df_key)
+
+            # If no errors, assign new HED ID's
+            assign_hed_ids_section(df, unused_tag_ids)
+
+    # 4: Merge the dataframes
+    for df_key in output_dfs.keys():
+        if df_key in constants.DF_EXTRAS:
+            continue
+        out_df = output_dfs[df_key]
+        df = dataframes[df_key]
+        merge_dfs(out_df, df)
+
+    return output_dfs
+
+
+def _verify_hedid_matches(section, df, unused_tag_ids):
+    """Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema
+
+    Parameters:
+        section(HedSchemaSection): The loaded schema section to compare ID's with
+        df(pd.DataFrame): The loaded spreadsheet dataframe to compare with
+        unused_tag_ids(set): The valid range of IDs for this df.
+
+    Returns:
+        list[str]: A list of errors found matching IDs.
+    """
+    hedid_errors = []
+    for row_number, row in df.iterrows():
+        if not any(row):
+            continue
+        label = row[constants.name]
+        if label.endswith("-#"):
+            label = label.replace("-#", "/#")
+        df_id = row[constants.hed_id]
+        entry = section.get(label)
+        if not entry:
+            # Neither side has a hedID, so nothing to do.
+            if not df_id:
+                continue
+            hedid_errors += schema_util.format_error(
+                row_number, row, f"'{label}' does not exist in schema file only the spreadsheet."
+            )
+            continue
+        entry_id = entry.attributes.get(HedKey.HedID)
+        if df_id:
+            if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")):
+                hedid_errors += schema_util.format_error(
+                    row_number, row, f"'{label}' has an improperly formatted hedID in dataframe."
+                )
+                continue
+            id_value = remove_prefix(df_id, "HED_")
+            try:
+                id_int = int(id_value)
+                if id_int not in unused_tag_ids:
+                    hedid_errors += schema_util.format_error(
+                        row_number,
+                        row,
+                        f"'{label}' has id {id_int} which is outside "
+                        + "of the valid range for this type.  Valid range is: "
+                        + f"{min(unused_tag_ids)} to {max(unused_tag_ids)}",
+                    )
+                    continue
+            except ValueError:
+                hedid_errors += schema_util.format_error(
+                    row_number, row, f"'{label}' has a non-numeric hedID in the dataframe."
+                )
+                continue
+
+        if entry_id and entry_id != df_id:
+            hedid_errors += schema_util.format_error(
+                row_number, row, f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema."
+            )
+            continue
+
+    return hedid_errors
+
+
+def assign_hed_ids_section(df, unused_tag_ids):
+    """Adds missing HedIds to dataframe.
+
+    Parameters:
+        df(pd.DataFrame): The dataframe to add id's to.
+        unused_tag_ids(set of int): The possible HED id's to assign from
+    """
+    # Remove already used ids
+    unused_tag_ids -= get_all_ids(df)
+    sorted_unused_ids = sorted(unused_tag_ids, reverse=True)
+
+    for _row_number, row in df.iterrows():
+        hed_id = row[constants.hed_id]
+        # we already verified existing ones
+        if hed_id:
+            continue
+        hed_id = f"HED_{sorted_unused_ids.pop():07d}"
+        row[constants.hed_id] = hed_id
+
+
+def merge_dfs(dest_df, source_df):
+    """Merges extra columns from source_df into dest_df, adding the extra columns from the ontology to the schema df.
+
+    Parameters:
+        dest_df (DataFrame): The dataframe to add extra columns to
+        source_df (DataFrame): The dataframe to get extra columns from
+    """
+    # todo: vectorize this at some point
+    save_df1_columns = dest_df.columns.copy()
+    for _index, row in source_df.iterrows():
+        # Find matching index in df1 based on 'rdfs:label'
+        match_index = dest_df[dest_df["rdfs:label"] == row["rdfs:label"]].index
+        if not match_index.empty:
+            for col in source_df.columns:
+                if col not in save_df1_columns:
+                    dest_df.at[match_index[0], col] = row[col]
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from hed.schema.schema_io.ontology_util import get_library_data`
	`1`	`+from hed.schema.hed_cache import get_library_data`
`2`	`2`	`from hed.schema.schema_io.df_util import remove_prefix`
`3`	`3`	`from semantic_version import Version`
`4`	`4`	`from hed.schema.hed_schema_io import load_schema_version`