Skip to content

Commit 28b1c38

Browse files
committed
Added handling of inLibrary for extra sections -- xml only
1 parent 8c20dfe commit 28b1c38

6 files changed

Lines changed: 545 additions & 18 deletions

File tree

hed/schema/schema_io/df_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
type = "Type"
8181
properties = "Properties" # for the schema properties, this is the column name in the properties dataframe
8282
description = "description"
83+
in_library = "in_library" # for extras sections, tracks which library an entry came from (internal metadata, not serialized)
8384

8485
struct_columns = [hed_id, name, attributes, subclass_of, dcdescription]
8586
tag_columns = [hed_id, name, level, subclass_of, attributes, dcdescription]

hed/schema/schema_io/df_util.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,30 @@ def merge_dataframes(df1, df2, key):
2626
"""
2727
if df2 is None or df2.empty:
2828
return df1
29-
if set(df1.columns) != set(df2.columns):
30-
raise HedFileError(
31-
HedExceptions.BAD_COLUMN_NAMES,
32-
f"Both dataframes corresponding to {key} to be merged must have the same columns. "
33-
f"df1 columns: {list(df1.columns)} df2 columns: {list(df2.columns)}",
34-
"",
35-
)
29+
30+
# Handle in_library column mismatch (one has it, the other doesn't)
31+
# This can happen when merging extras from different schema formats
32+
df1_cols = set(df1.columns)
33+
df2_cols = set(df2.columns)
34+
35+
if df1_cols != df2_cols:
36+
in_lib = constants.in_library
37+
# If only difference is in_library column, add it to the one missing it
38+
if in_lib in df1_cols and in_lib not in df2_cols and df1_cols - {in_lib} == df2_cols:
39+
df2 = df2.copy()
40+
df2[in_lib] = None
41+
elif in_lib in df2_cols and in_lib not in df1_cols and df2_cols - {in_lib} == df1_cols:
42+
df1 = df1.copy()
43+
df1[in_lib] = None
44+
elif df1_cols != df2_cols:
45+
# Still different columns after handling in_library - this is an error
46+
raise HedFileError(
47+
HedExceptions.BAD_COLUMN_NAMES,
48+
f"Both dataframes corresponding to {key} to be merged must have the same columns. "
49+
f"df1 columns: {list(df1.columns)} df2 columns: {list(df2.columns)}",
50+
"",
51+
)
52+
3653
combined = pd.concat([df1, df2], ignore_index=True)
3754
combined = combined.sort_values(by=list(combined.columns))
3855
combined = combined.drop_duplicates()

hed/schema/schema_io/schema2xml.py

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""Allows output of HedSchema objects as .xml format"""
22

33
from xml.etree.ElementTree import Element, SubElement
4-
from hed.schema.hed_schema_constants import HedSectionKey
4+
import pandas as pd
5+
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
56
from hed.schema.schema_io import xml_constants, df_constants as df_constants
67
from hed.schema.schema_io.schema2base import Schema2Base
78

@@ -44,8 +45,19 @@ def _output_extras(self, hed_schema):
4445

4546
def _output_sources(self, hed_schema):
4647
sources = hed_schema.get_extras(df_constants.SOURCES_KEY)
47-
if sources is None:
48+
if sources is None or sources.empty:
4849
return
50+
51+
# Filter for unmerged library schemas - only output library entries
52+
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
53+
if df_constants.in_library in sources.columns:
54+
sources = sources[sources[df_constants.in_library].notna()].copy()
55+
if sources.empty:
56+
return
57+
else:
58+
# No in_library tracking, skip output for safety
59+
return
60+
4961
sources_node = SubElement(self.hed_node, xml_constants.SCHEMA_SOURCE_SECTION_ELEMENT)
5062
for _, row in sources.iterrows():
5163
source_node = SubElement(sources_node, xml_constants.SCHEMA_SOURCE_DEF_ELEMENT)
@@ -56,10 +68,29 @@ def _output_sources(self, hed_schema):
5668
description = SubElement(source_node, xml_constants.DESCRIPTION_ELEMENT)
5769
description.text = row[df_constants.description]
5870

71+
# Add inLibrary attribute in merged saves if present
72+
if self._save_merged and df_constants.in_library in row.index and pd.notna(row[df_constants.in_library]):
73+
attribute_node = SubElement(source_node, xml_constants.ATTRIBUTE_ELEMENT)
74+
name_node = SubElement(attribute_node, xml_constants.NAME_ELEMENT)
75+
name_node.text = HedKey.InLibrary
76+
value_node = SubElement(attribute_node, xml_constants.VALUE_ELEMENT)
77+
value_node.text = row[df_constants.in_library]
78+
5979
def _output_prefixes(self, hed_schema):
6080
prefixes = hed_schema.get_extras(df_constants.PREFIXES_KEY)
61-
if prefixes is None:
81+
if prefixes is None or prefixes.empty:
6282
return
83+
84+
# Filter for unmerged library schemas - only output library entries
85+
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
86+
if df_constants.in_library in prefixes.columns:
87+
prefixes = prefixes[prefixes[df_constants.in_library].notna()].copy()
88+
if prefixes.empty:
89+
return
90+
else:
91+
# No in_library tracking, skip output for safety
92+
return
93+
6394
prefixes_node = SubElement(self.hed_node, xml_constants.SCHEMA_PREFIX_SECTION_ELEMENT)
6495
for _, row in prefixes.iterrows():
6596
prefix_node = SubElement(prefixes_node, xml_constants.SCHEMA_PREFIX_DEF_ELEMENT)
@@ -69,11 +100,29 @@ def _output_prefixes(self, hed_schema):
69100
prefix_namespace.text = row[df_constants.namespace]
70101
prefix_description = SubElement(prefix_node, xml_constants.DESCRIPTION_ELEMENT)
71102
prefix_description.text = row[df_constants.description]
103+
# Add inLibrary attribute in merged saves if present
104+
if self._save_merged and df_constants.in_library in row.index and pd.notna(row[df_constants.in_library]):
105+
attribute_node = SubElement(prefix_node, xml_constants.ATTRIBUTE_ELEMENT)
106+
name_node = SubElement(attribute_node, xml_constants.NAME_ELEMENT)
107+
name_node.text = HedKey.InLibrary
108+
value_node = SubElement(attribute_node, xml_constants.VALUE_ELEMENT)
109+
value_node.text = row[df_constants.in_library]
72110

73111
def _output_external_annotations(self, hed_schema):
74112
externals = hed_schema.get_extras(df_constants.EXTERNAL_ANNOTATION_KEY)
75-
if externals is None:
113+
if externals is None or externals.empty:
76114
return
115+
116+
# Filter for unmerged library schemas - only output library entries
117+
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
118+
if df_constants.in_library in externals.columns:
119+
externals = externals[externals[df_constants.in_library].notna()].copy()
120+
if externals.empty:
121+
return
122+
else:
123+
# No in_library tracking, skip output for safety
124+
return
125+
77126
externals_node = SubElement(self.hed_node, xml_constants.SCHEMA_EXTERNAL_SECTION_ELEMENT)
78127
for _, row in externals.iterrows():
79128
external_node = SubElement(externals_node, xml_constants.SCHEMA_EXTERNAL_DEF_ELEMENT)
@@ -86,6 +135,14 @@ def _output_external_annotations(self, hed_schema):
86135
external_description = SubElement(external_node, xml_constants.DESCRIPTION_ELEMENT)
87136
external_description.text = row[df_constants.description]
88137

138+
# Add inLibrary attribute in merged saves if present
139+
if self._save_merged and df_constants.in_library in row.index and pd.notna(row[df_constants.in_library]):
140+
attribute_node = SubElement(external_node, xml_constants.ATTRIBUTE_ELEMENT)
141+
name_node = SubElement(attribute_node, xml_constants.NAME_ELEMENT)
142+
name_node.text = HedKey.InLibrary
143+
value_node = SubElement(attribute_node, xml_constants.VALUE_ELEMENT)
144+
value_node.text = row[df_constants.in_library]
145+
89146
def _output_epilogue(self, epilogue):
90147
if epilogue:
91148
prologue_node = SubElement(self.hed_node, xml_constants.EPILOGUE_ELEMENT)

hed/schema/schema_io/xml2schema.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,21 @@ def _read_sources(self):
110110
data.append(
111111
{df_constants.source: source_name, df_constants.link: source_link, df_constants.description: description}
112112
)
113-
self._schema.extras[df_constants.SOURCES_KEY] = pd.DataFrame(data, columns=df_constants.source_columns)
113+
library_df = pd.DataFrame(data, columns=df_constants.source_columns)
114+
115+
# Add in_library column if this is a library schema
116+
if self.library:
117+
library_df[df_constants.in_library] = self.library
118+
119+
# Merge with standard schema extras if applicable
120+
if self.appending_to_schema:
121+
standard_df = self._schema.extras.get(df_constants.SOURCES_KEY, None)
122+
if standard_df is not None and not standard_df.empty:
123+
self._schema.extras[df_constants.SOURCES_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
124+
else:
125+
self._schema.extras[df_constants.SOURCES_KEY] = library_df
126+
else:
127+
self._schema.extras[df_constants.SOURCES_KEY] = library_df
114128

115129
def _read_prefixes(self):
116130
prefix_elements = self._get_elements_by_name(xml_constants.SCHEMA_PREFIX_DEF_ELEMENT)
@@ -126,7 +140,21 @@ def _read_prefixes(self):
126140
df_constants.description: prefix_description,
127141
}
128142
)
129-
self._schema.extras[df_constants.PREFIXES_KEY] = pd.DataFrame(data, columns=df_constants.prefix_columns)
143+
library_df = pd.DataFrame(data, columns=df_constants.prefix_columns)
144+
145+
# Add in_library column if this is a library schema
146+
if self.library:
147+
library_df[df_constants.in_library] = self.library
148+
149+
# Merge with standard schema extras if applicable
150+
if self.appending_to_schema:
151+
standard_df = self._schema.extras.get(df_constants.PREFIXES_KEY, None)
152+
if standard_df is not None and not standard_df.empty:
153+
self._schema.extras[df_constants.PREFIXES_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
154+
else:
155+
self._schema.extras[df_constants.PREFIXES_KEY] = library_df
156+
else:
157+
self._schema.extras[df_constants.PREFIXES_KEY] = library_df
130158

131159
def _read_external_annotations(self):
132160
external_elements = self._get_elements_by_name(xml_constants.SCHEMA_EXTERNAL_DEF_ELEMENT)
@@ -144,9 +172,23 @@ def _read_external_annotations(self):
144172
df_constants.description: external_description,
145173
}
146174
)
147-
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = pd.DataFrame(
148-
data, columns=df_constants.external_annotation_columns
149-
)
175+
library_df = pd.DataFrame(data, columns=df_constants.external_annotation_columns)
176+
177+
# Add in_library column if this is a library schema
178+
if self.library:
179+
library_df[df_constants.in_library] = self.library
180+
181+
# Merge with standard schema extras if applicable
182+
if self.appending_to_schema:
183+
standard_df = self._schema.extras.get(df_constants.EXTERNAL_ANNOTATION_KEY, None)
184+
if standard_df is not None and not standard_df.empty:
185+
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = pd.concat(
186+
[standard_df, library_df], ignore_index=True
187+
)
188+
else:
189+
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = library_df
190+
else:
191+
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = library_df
150192

151193
def _add_tags_recursive(self, new_tags, parent_tags):
152194
for tag_element in new_tags:

0 commit comments

Comments
 (0)