Added handling of inLibrary for the extras sections (sources, prefixes, external annotations) -- XML format only

VisLab · VisLab · commit 28b1c38bcf76 · 2026-02-17T10:00:08.000-06:00
diff --git a/hed/schema/schema_io/df_constants.py b/hed/schema/schema_io/df_constants.py
@@ -80,6 +80,7 @@
 type = "Type"
 properties = "Properties"  # for the schema properties, this is the column name in the properties dataframe
 description = "description"
+in_library = "in_library"  # for extras sections, tracks which library an entry came from (internal metadata column; merged XML saves emit it as an inLibrary attribute)
 
 struct_columns = [hed_id, name, attributes, subclass_of, dcdescription]
 tag_columns = [hed_id, name, level, subclass_of, attributes, dcdescription]
diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py
@@ -26,13 +26,30 @@ def merge_dataframes(df1, df2, key):
     """
     if df2 is None or df2.empty:
         return df1
-    if set(df1.columns) != set(df2.columns):
-        raise HedFileError(
-            HedExceptions.BAD_COLUMN_NAMES,
-            f"Both dataframes corresponding to {key} to be merged must have the same columns.  "
-            f"df1 columns: {list(df1.columns)} df2 columns: {list(df2.columns)}",
-            "",
-        )
+
+    # Handle in_library column mismatch (one has it, the other doesn't)
+    # This can happen when merging extras from different schema formats
+    df1_cols = set(df1.columns)
+    df2_cols = set(df2.columns)
+
+    if df1_cols != df2_cols:
+        in_lib = constants.in_library
+        # If only difference is in_library column, add it to the one missing it
+        if in_lib in df1_cols and in_lib not in df2_cols and df1_cols - {in_lib} == df2_cols:
+            df2 = df2.copy()
+            df2[in_lib] = None
+        elif in_lib in df2_cols and in_lib not in df1_cols and df2_cols - {in_lib} == df1_cols:
+            df1 = df1.copy()
+            df1[in_lib] = None
+        elif df1_cols != df2_cols:
+            # Still different columns after handling in_library - this is an error
+            raise HedFileError(
+                HedExceptions.BAD_COLUMN_NAMES,
+                f"Both dataframes corresponding to {key} to be merged must have the same columns.  "
+                f"df1 columns: {list(df1.columns)} df2 columns: {list(df2.columns)}",
+                "",
+            )
+
     combined = pd.concat([df1, df2], ignore_index=True)
     combined = combined.sort_values(by=list(combined.columns))
     combined = combined.drop_duplicates()
diff --git a/hed/schema/schema_io/schema2xml.py b/hed/schema/schema_io/schema2xml.py
@@ -1,7 +1,8 @@
 """Allows output of HedSchema objects as .xml format"""
 
 from xml.etree.ElementTree import Element, SubElement
-from hed.schema.hed_schema_constants import HedSectionKey
+import pandas as pd
+from hed.schema.hed_schema_constants import HedSectionKey, HedKey
 from hed.schema.schema_io import xml_constants, df_constants as df_constants
 from hed.schema.schema_io.schema2base import Schema2Base
 
@@ -44,8 +45,19 @@ def _output_extras(self, hed_schema):
 
     def _output_sources(self, hed_schema):
         sources = hed_schema.get_extras(df_constants.SOURCES_KEY)
-        if sources is None:
+        if sources is None or sources.empty:
             return
+
+        # Filter for unmerged library schemas - only output library entries
+        if not self._save_merged and hed_schema.library and hed_schema.with_standard:
+            if df_constants.in_library in sources.columns:
+                sources = sources[sources[df_constants.in_library].notna()].copy()
+                if sources.empty:
+                    return
+            else:
+                # No in_library tracking, skip output for safety
+                return
+
         sources_node = SubElement(self.hed_node, xml_constants.SCHEMA_SOURCE_SECTION_ELEMENT)
         for _, row in sources.iterrows():
             source_node = SubElement(sources_node, xml_constants.SCHEMA_SOURCE_DEF_ELEMENT)
@@ -56,10 +68,29 @@ def _output_sources(self, hed_schema):
             description = SubElement(source_node, xml_constants.DESCRIPTION_ELEMENT)
             description.text = row[df_constants.description]
 
+            # Add inLibrary attribute in merged saves if present
+            if self._save_merged and df_constants.in_library in row.index and pd.notna(row[df_constants.in_library]):
+                attribute_node = SubElement(source_node, xml_constants.ATTRIBUTE_ELEMENT)
+                name_node = SubElement(attribute_node, xml_constants.NAME_ELEMENT)
+                name_node.text = HedKey.InLibrary
+                value_node = SubElement(attribute_node, xml_constants.VALUE_ELEMENT)
+                value_node.text = row[df_constants.in_library]
+
     def _output_prefixes(self, hed_schema):
         prefixes = hed_schema.get_extras(df_constants.PREFIXES_KEY)
-        if prefixes is None:
+        if prefixes is None or prefixes.empty:
             return
+
+        # Filter for unmerged library schemas - only output library entries
+        if not self._save_merged and hed_schema.library and hed_schema.with_standard:
+            if df_constants.in_library in prefixes.columns:
+                prefixes = prefixes[prefixes[df_constants.in_library].notna()].copy()
+                if prefixes.empty:
+                    return
+            else:
+                # No in_library tracking, skip output for safety
+                return
+
         prefixes_node = SubElement(self.hed_node, xml_constants.SCHEMA_PREFIX_SECTION_ELEMENT)
         for _, row in prefixes.iterrows():
             prefix_node = SubElement(prefixes_node, xml_constants.SCHEMA_PREFIX_DEF_ELEMENT)
@@ -69,11 +100,29 @@ def _output_prefixes(self, hed_schema):
             prefix_namespace.text = row[df_constants.namespace]
             prefix_description = SubElement(prefix_node, xml_constants.DESCRIPTION_ELEMENT)
             prefix_description.text = row[df_constants.description]
+            # Add inLibrary attribute in merged saves if present
+            if self._save_merged and df_constants.in_library in row.index and pd.notna(row[df_constants.in_library]):
+                attribute_node = SubElement(prefix_node, xml_constants.ATTRIBUTE_ELEMENT)
+                name_node = SubElement(attribute_node, xml_constants.NAME_ELEMENT)
+                name_node.text = HedKey.InLibrary
+                value_node = SubElement(attribute_node, xml_constants.VALUE_ELEMENT)
+                value_node.text = row[df_constants.in_library]
 
     def _output_external_annotations(self, hed_schema):
         externals = hed_schema.get_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
-        if externals is None:
+        if externals is None or externals.empty:
             return
+
+        # Filter for unmerged library schemas - only output library entries
+        if not self._save_merged and hed_schema.library and hed_schema.with_standard:
+            if df_constants.in_library in externals.columns:
+                externals = externals[externals[df_constants.in_library].notna()].copy()
+                if externals.empty:
+                    return
+            else:
+                # No in_library tracking, skip output for safety
+                return
+
         externals_node = SubElement(self.hed_node, xml_constants.SCHEMA_EXTERNAL_SECTION_ELEMENT)
         for _, row in externals.iterrows():
             external_node = SubElement(externals_node, xml_constants.SCHEMA_EXTERNAL_DEF_ELEMENT)
@@ -86,6 +135,14 @@ def _output_external_annotations(self, hed_schema):
             external_description = SubElement(external_node, xml_constants.DESCRIPTION_ELEMENT)
             external_description.text = row[df_constants.description]
 
+            # Add inLibrary attribute in merged saves if present
+            if self._save_merged and df_constants.in_library in row.index and pd.notna(row[df_constants.in_library]):
+                attribute_node = SubElement(external_node, xml_constants.ATTRIBUTE_ELEMENT)
+                name_node = SubElement(attribute_node, xml_constants.NAME_ELEMENT)
+                name_node.text = HedKey.InLibrary
+                value_node = SubElement(attribute_node, xml_constants.VALUE_ELEMENT)
+                value_node.text = row[df_constants.in_library]
+
     def _output_epilogue(self, epilogue):
         if epilogue:
             prologue_node = SubElement(self.hed_node, xml_constants.EPILOGUE_ELEMENT)
diff --git a/hed/schema/schema_io/xml2schema.py b/hed/schema/schema_io/xml2schema.py
@@ -110,7 +110,21 @@ def _read_sources(self):
             data.append(
                 {df_constants.source: source_name, df_constants.link: source_link, df_constants.description: description}
             )
-        self._schema.extras[df_constants.SOURCES_KEY] = pd.DataFrame(data, columns=df_constants.source_columns)
+        library_df = pd.DataFrame(data, columns=df_constants.source_columns)
+
+        # Add in_library column if this is a library schema
+        if self.library:
+            library_df[df_constants.in_library] = self.library
+
+        # Merge with standard schema extras if applicable
+        if self.appending_to_schema:
+            standard_df = self._schema.extras.get(df_constants.SOURCES_KEY, None)
+            if standard_df is not None and not standard_df.empty:
+                self._schema.extras[df_constants.SOURCES_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
+            else:
+                self._schema.extras[df_constants.SOURCES_KEY] = library_df
+        else:
+            self._schema.extras[df_constants.SOURCES_KEY] = library_df
 
     def _read_prefixes(self):
         prefix_elements = self._get_elements_by_name(xml_constants.SCHEMA_PREFIX_DEF_ELEMENT)
@@ -126,7 +140,21 @@ def _read_prefixes(self):
                     df_constants.description: prefix_description,
                 }
             )
-        self._schema.extras[df_constants.PREFIXES_KEY] = pd.DataFrame(data, columns=df_constants.prefix_columns)
+        library_df = pd.DataFrame(data, columns=df_constants.prefix_columns)
+
+        # Add in_library column if this is a library schema
+        if self.library:
+            library_df[df_constants.in_library] = self.library
+
+        # Merge with standard schema extras if applicable
+        if self.appending_to_schema:
+            standard_df = self._schema.extras.get(df_constants.PREFIXES_KEY, None)
+            if standard_df is not None and not standard_df.empty:
+                self._schema.extras[df_constants.PREFIXES_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
+            else:
+                self._schema.extras[df_constants.PREFIXES_KEY] = library_df
+        else:
+            self._schema.extras[df_constants.PREFIXES_KEY] = library_df
 
     def _read_external_annotations(self):
         external_elements = self._get_elements_by_name(xml_constants.SCHEMA_EXTERNAL_DEF_ELEMENT)
@@ -144,9 +172,23 @@ def _read_external_annotations(self):
                     df_constants.description: external_description,
                 }
             )
-        self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = pd.DataFrame(
-            data, columns=df_constants.external_annotation_columns
-        )
+        library_df = pd.DataFrame(data, columns=df_constants.external_annotation_columns)
+
+        # Add in_library column if this is a library schema
+        if self.library:
+            library_df[df_constants.in_library] = self.library
+
+        # Merge with standard schema extras if applicable
+        if self.appending_to_schema:
+            standard_df = self._schema.extras.get(df_constants.EXTERNAL_ANNOTATION_KEY, None)
+            if standard_df is not None and not standard_df.empty:
+                self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = pd.concat(
+                    [standard_df, library_df], ignore_index=True
+                )
+            else:
+                self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = library_df
+        else:
+            self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = library_df
 
     def _add_tags_recursive(self, new_tags, parent_tags):
         for tag_element in new_tags:
diff --git a/tests/schema/test_schema_extras_xml_roundtrip.py b/tests/schema/test_schema_extras_xml_roundtrip.py
diff --git a/tests/schema/test_schema_format_roundtrip.py b/tests/schema/test_schema_format_roundtrip.py