Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def _parse_data(self):
)
extras = {key: self.input_data[key] for key in constants.DF_EXTRAS if key in self.input_data}
for key, _item in extras.items():
self._schema.extras[key] = df_util.merge_dataframes(extras[key], self._schema.extras.get(key, None), key)
self._schema.extras[key] = df_util.merge_extras_dataframes(extras[key], self._schema.extras.get(key, None))

def _get_prologue_epilogue(self, file_data):
prologue, epilogue = "", ""
Expand Down
26 changes: 26 additions & 0 deletions hed/schema/schema_io/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,32 @@ def merge_dataframes(df1, df2, key):
return combined


def merge_extras_dataframes(library_df, standard_df):
    """Merge library and standard extras DataFrames by combining and deduplicating.

    Rows of the standard DataFrame are placed before rows of the library
    DataFrame, exact duplicate rows are dropped, and the result is sorted by
    all of its columns (in column order) with a fresh 0..n-1 index.

    Parameters:
        library_df (pd.DataFrame or None): DataFrame from library schema extras section.
        standard_df (pd.DataFrame or None): DataFrame from standard schema extras section.

    Returns:
        pd.DataFrame: Combined DataFrame with duplicates removed and sorted.
            When neither input has any rows, an empty DataFrame is returned that
            keeps the column layout of the first input that defines columns
            (library first, then standard), or has no columns if neither does.
    """
    # Keep only inputs that actually contribute rows; standard goes first so the
    # concatenation order matches [standard, library].
    frames = [df for df in (standard_df, library_df) if df is not None and not df.empty]

    if not frames:
        # Nothing to merge. Preserve an existing column schema instead of
        # collapsing to a column-less frame, so callers that built an empty
        # DataFrame with explicit columns get those columns back.
        for df in (library_df, standard_df):
            if df is not None and len(df.columns) > 0:
                return df.reset_index(drop=True)
        return pd.DataFrame()

    combined = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
    combined = combined.drop_duplicates()
    return combined.sort_values(by=list(combined.columns)).reset_index(drop=True)


def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NAME):
"""Create a new dictionary of DataFrames where dict2 is merged into dict1.

Expand Down
47 changes: 4 additions & 43 deletions hed/schema/schema_io/json2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import json
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.schema.schema_io import json_constants
from hed.schema.schema_io import json_constants, df_util
from hed.schema.schema_io.base2schema import SchemaLoader


Expand Down Expand Up @@ -498,103 +498,64 @@ def _load_extras(self):
if json_constants.SOURCES_KEY in self._json_data:
sources_data = self._json_data[json_constants.SOURCES_KEY]
for source_data in sources_data:
# Parse inLibrary attribute from JSON if present (for merged JSON)
in_library_value = source_data.get(HedKey.InLibrary, None)
# If not found in JSON but this is a library schema, use self.library
if in_library_value is None and self.library:
in_library_value = self.library

sources_list.append(
{
df_constants.source: source_data.get("name", ""),
df_constants.link: source_data.get("link", ""),
df_constants.description: source_data.get(json_constants.DESCRIPTION_KEY, ""),
df_constants.in_library: in_library_value,
}
)
# Create DataFrame - if empty, use column specification to match XML/MEDIAWIKI behavior
if sources_list:
library_df = pd.DataFrame(sources_list).fillna("").astype(str)
else:
library_df = pd.DataFrame([], columns=df_constants.source_columns)
# Convert in_library None values to empty strings for consistency
if df_constants.in_library in library_df.columns:
library_df[df_constants.in_library] = library_df[df_constants.in_library].replace("None", "")

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(df_constants.SOURCES_KEY, None)
if standard_df is not None and not standard_df.empty:
self._schema.extras[df_constants.SOURCES_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
else:
self._schema.extras[df_constants.SOURCES_KEY] = library_df
self._schema.extras[df_constants.SOURCES_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)

# Load prefixes - always create DataFrame even if empty
prefixes_list = []
if json_constants.PREFIXES_KEY in self._json_data:
prefixes_data = self._json_data[json_constants.PREFIXES_KEY]
for prefix_data in prefixes_data:
# Parse inLibrary attribute from JSON if present (for merged JSON)
in_library_value = prefix_data.get(HedKey.InLibrary, None)
# If not found in JSON but this is a library schema, use self.library
if in_library_value is None and self.library:
in_library_value = self.library

prefixes_list.append(
{
df_constants.prefix: prefix_data.get("name", ""),
df_constants.namespace: prefix_data.get("namespace", ""),
df_constants.description: prefix_data.get(json_constants.DESCRIPTION_KEY, ""),
df_constants.in_library: in_library_value,
}
)
# Create DataFrame - if empty, use column specification to match XML/MEDIAWIKI behavior
if prefixes_list:
library_df = pd.DataFrame(prefixes_list).fillna("").astype(str)
else:
library_df = pd.DataFrame([], columns=df_constants.prefix_columns)
# Convert in_library None values to empty strings for consistency
if df_constants.in_library in library_df.columns:
library_df[df_constants.in_library] = library_df[df_constants.in_library].replace("None", "")

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(df_constants.PREFIXES_KEY, None)
if standard_df is not None and not standard_df.empty:
self._schema.extras[df_constants.PREFIXES_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
else:
self._schema.extras[df_constants.PREFIXES_KEY] = library_df
self._schema.extras[df_constants.PREFIXES_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)

# Load external annotations - always create DataFrame even if empty
externals_list = []
if json_constants.EXTERNAL_ANNOTATIONS_KEY in self._json_data:
externals_data = self._json_data[json_constants.EXTERNAL_ANNOTATIONS_KEY]
for external_data in externals_data:
# Parse inLibrary attribute from JSON if present (for merged JSON)
in_library_value = external_data.get(HedKey.InLibrary, None)
# If not found in JSON but this is a library schema, use self.library
if in_library_value is None and self.library:
in_library_value = self.library

externals_list.append(
{
df_constants.prefix: external_data.get("name", ""),
df_constants.id: external_data.get("id", ""),
df_constants.iri: external_data.get("iri", ""),
df_constants.description: external_data.get(json_constants.DESCRIPTION_KEY, ""),
df_constants.in_library: in_library_value,
}
)
# Create DataFrame - if empty, use column specification to match XML/MEDIAWIKI behavior
if externals_list:
library_df = pd.DataFrame(externals_list).fillna("").astype(str)
else:
library_df = pd.DataFrame([], columns=df_constants.external_annotation_columns)
# Convert in_library None values to empty strings for consistency
if df_constants.in_library in library_df.columns:
library_df[df_constants.in_library] = library_df[df_constants.in_library].replace("None", "")

# Merge with existing schema extras if present (from withStandard base schema)
standard_df = self._schema.extras.get(df_constants.EXTERNAL_ANNOTATION_KEY, None)
if standard_df is not None and not standard_df.empty:
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = pd.concat([standard_df, library_df], ignore_index=True)
else:
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = library_df
self._schema.extras[df_constants.EXTERNAL_ANNOTATION_KEY] = df_util.merge_extras_dataframes(library_df, standard_df)
53 changes: 0 additions & 53 deletions hed/schema/schema_io/schema2json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Allows output of HedSchema objects as .json format"""

import json
import pandas as pd
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.schema.schema_io import json_constants, df_constants
from hed.schema.schema_io.schema2base import Schema2Base
Expand Down Expand Up @@ -64,29 +63,13 @@ def _output_sources(self, hed_schema):
if sources is None or sources.empty:
return

# Filter for unmerged library schemas - only output library entries if tracking is available
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
if df_constants.in_library in sources.columns:
sources = sources[sources[df_constants.in_library].notna() & (sources[df_constants.in_library] != "")].copy()
if sources.empty:
return
# Otherwise fall back to writing all rows (assume all are library entries)

sources_list = []
for _, row in sources.iterrows():
source_dict = {
"name": row[df_constants.source],
"link": row[df_constants.link],
json_constants.DESCRIPTION_KEY: row[df_constants.description],
}
# Add inLibrary attribute in merged saves if present
if (
self._save_merged
and df_constants.in_library in row.index
and pd.notna(row[df_constants.in_library])
and row[df_constants.in_library] != ""
):
source_dict[HedKey.InLibrary] = row[df_constants.in_library]
sources_list.append(source_dict)

self.output[json_constants.SOURCES_KEY] = sources_list
Expand All @@ -101,31 +84,13 @@ def _output_prefixes(self, hed_schema):
if prefixes is None or prefixes.empty:
return

# Filter for unmerged library schemas - only output library entries if tracking is available
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
if df_constants.in_library in prefixes.columns:
prefixes = prefixes[
prefixes[df_constants.in_library].notna() & (prefixes[df_constants.in_library] != "")
].copy()
if prefixes.empty:
return
# Otherwise fall back to writing all rows (assume all are library entries)

prefixes_list = []
for _, row in prefixes.iterrows():
prefix_dict = {
"name": row[df_constants.prefix],
"namespace": row[df_constants.namespace],
json_constants.DESCRIPTION_KEY: row[df_constants.description],
}
# Add inLibrary attribute in merged saves if present
if (
self._save_merged
and df_constants.in_library in row.index
and pd.notna(row[df_constants.in_library])
and row[df_constants.in_library] != ""
):
prefix_dict[HedKey.InLibrary] = row[df_constants.in_library]
prefixes_list.append(prefix_dict)

self.output[json_constants.PREFIXES_KEY] = prefixes_list
Expand All @@ -140,16 +105,6 @@ def _output_external_annotations(self, hed_schema):
if externals is None or externals.empty:
return

# Filter for unmerged library schemas - only output library entries if tracking is available
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
if df_constants.in_library in externals.columns:
externals = externals[
externals[df_constants.in_library].notna() & (externals[df_constants.in_library] != "")
].copy()
if externals.empty:
return
# Otherwise fall back to writing all rows (assume all are library entries)

externals_list = []
for _, row in externals.iterrows():
external_dict = {
Expand All @@ -158,14 +113,6 @@ def _output_external_annotations(self, hed_schema):
"iri": row[df_constants.iri],
json_constants.DESCRIPTION_KEY: row[df_constants.description],
}
# Add inLibrary attribute in merged saves if present
if (
self._save_merged
and df_constants.in_library in row.index
and pd.notna(row[df_constants.in_library])
and row[df_constants.in_library] != ""
):
external_dict[HedKey.InLibrary] = row[df_constants.in_library]
externals_list.append(external_dict)

self.output[json_constants.EXTERNAL_ANNOTATIONS_KEY] = externals_list
Expand Down
20 changes: 3 additions & 17 deletions hed/schema/schema_io/schema2wiki.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Allows output of HedSchema objects as MEDIAWIKI format"""

import pandas as pd
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.schema.hed_schema_constants import HedSectionKey
from hed.schema.schema_io import wiki_constants, df_constants
from hed.schema.schema_io.schema2base import Schema2Base

Expand Down Expand Up @@ -59,33 +59,19 @@ def _output_extra(self, hed_schema, section_key, wiki_key):
wiki_key (string): The key in the wiki constants for the section.

"""
# In the base class, we do nothing, but subclasses can override this method.
extra = hed_schema.get_extras(section_key)
if extra is None or extra.empty:
return

# Filter for unmerged library schemas - only output library entries if tracking is available
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
if df_constants.in_library in extra.columns:
extra = extra[extra[df_constants.in_library].notna() & (extra[df_constants.in_library] != "")].copy()
if extra.empty:
return
# Otherwise fall back to writing all rows (assume all are library entries)

self._add_blank_line()
self.current_tag_string = wiki_key
self._flush_current_tag()
for _, row in extra.iterrows():
self.current_tag_string += "*"
# Build column string, excluding in_library column for output
# Build column string from all columns
column_strings = []
for col in extra.columns:
if col == df_constants.in_library:
# For merged saves, include inLibrary in the output
if self._save_merged and pd.notna(row[col]) and row[col] != "":
column_strings.append(f"{HedKey.InLibrary}={row[col]}")
# For unmerged saves, skip writing in_library
else:
if pd.notna(row[col]) and row[col] != "":
column_strings.append(f"{col}={row[col]}")
self.current_tag_extra = ",".join(column_strings)
self._flush_current_tag()
Expand Down
Loading