Skip to content

Commit 1d45469

Browse files
committed
Added roundtrip extras for mediawiki schema format
1 parent 8d4004f commit 1d45469

3 files changed

Lines changed: 332 additions & 4 deletions

File tree

hed/schema/schema_io/schema2wiki.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Allows output of HedSchema objects as MEDIAWIKI format"""
22

3+
import pandas as pd
34
from hed.schema.hed_schema_constants import HedSectionKey
45
from hed.schema.schema_io import wiki_constants, df_constants
56
from hed.schema.schema_io.schema2base import Schema2Base
@@ -60,14 +61,33 @@ def _output_extra(self, hed_schema, section_key, wiki_key):
6061
"""
6162
# In the base class, we do nothing, but subclasses can override this method.
6263
extra = hed_schema.get_extras(section_key)
63-
if extra is None:
64+
if extra is None or extra.empty:
6465
return
66+
67+
# Filter for unmerged library schemas - only output library entries if tracking is available
68+
if not self._save_merged and hed_schema.library and hed_schema.with_standard:
69+
if df_constants.in_library in extra.columns:
70+
extra = extra[extra[df_constants.in_library].notna() & (extra[df_constants.in_library] != "")].copy()
71+
if extra.empty:
72+
return
73+
# Otherwise fall back to writing all rows (assume all are library entries)
74+
6575
self._add_blank_line()
6676
self.current_tag_string = wiki_key
6777
self._flush_current_tag()
6878
for _, row in extra.iterrows():
6979
self.current_tag_string += "*"
70-
self.current_tag_extra = ",".join(f"{col}={row[col]}" for col in extra.columns)
80+
# Build column string, excluding in_library column for output
81+
column_strings = []
82+
for col in extra.columns:
83+
if col == df_constants.in_library:
84+
# For merged saves, include inLibrary in the output
85+
if self._save_merged and pd.notna(row[col]) and row[col] != "":
86+
column_strings.append(f"{col}={row[col]}")
87+
# For unmerged saves, skip writing in_library
88+
else:
89+
column_strings.append(f"{col}={row[col]}")
90+
self.current_tag_extra = ",".join(column_strings)
7191
self._flush_current_tag()
7292

7393
def _output_epilogue(self, epilogue):

hed/schema/schema_io/wiki2schema.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,13 +120,31 @@ def _parse_extras(self, wiki_lines_by_section):
120120
lines_for_section = wiki_lines_by_section[extra_key]
121121
data = []
122122
for _line_number, line in lines_for_section:
123-
data.append(self.parse_star_string(line.strip()))
123+
parsed_data = self.parse_star_string(line.strip())
124+
125+
# Handle inLibrary attribute parsing
126+
in_library_value = parsed_data.pop(df_constants.in_library, None)
127+
# If not found in MediaWiki but this is an unmerged library schema, use self.library
128+
if in_library_value is None and self.library and not self._loading_merged:
129+
in_library_value = self.library
130+
parsed_data[df_constants.in_library] = in_library_value
131+
132+
data.append(parsed_data)
124133
if not data:
125134
continue
126135
df = pd.DataFrame(data).fillna("").astype(str)
136+
# Convert in_library None values to empty strings for consistency
137+
if df_constants.in_library in df.columns:
138+
df[df_constants.in_library] = df[df_constants.in_library].replace("None", "")
127139
stripped_key = extra_key.strip("'")
128140
stripped_key = WIKI_EXTRA_DICT.get(stripped_key, stripped_key)
129-
self._schema.extras[stripped_key] = df
141+
142+
# Merge with existing schema extras if present (from withStandard base schema)
143+
standard_df = self._schema.extras.get(stripped_key, None)
144+
if standard_df is not None and not standard_df.empty:
145+
self._schema.extras[stripped_key] = pd.concat([standard_df, df], ignore_index=True)
146+
else:
147+
self._schema.extras[stripped_key] = df
130148

131149
@staticmethod
132150
def parse_star_string(s):
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
"""
2+
Unit tests for schema extras sections MediaWiki I/O with in_library tracking.
3+
4+
Tests that extras (Sources, Prefixes, AnnotationPropertyExternal) are correctly:
5+
1. Read from MediaWiki with in_library column added for library schemas
6+
2. Merged correctly when loading withStandard schemas
7+
3. Written to MediaWiki with proper filtering for unmerged/merged saves
8+
4. Round-trip correctly (read -> write -> read)
9+
"""
10+
11+
import unittest
12+
import os
13+
import tempfile
14+
import shutil
15+
from hed.schema import load_schema
16+
from hed.schema.schema_io import df_constants
17+
18+
19+
class TestSchemaExtrasWikiRoundtrip(unittest.TestCase):
    """Test extras sections MediaWiki I/O with in_library tracking.

    The fixture is the testlib 4.0.0 library schema, converted once per class
    from XML to an unmerged MediaWiki file inside a temporary directory. Each
    test loads that file and checks the Sources, Prefixes and external
    annotation extras dataframes returned by ``get_extras``.
    """

    @classmethod
    def setUpClass(cls):
        """Create the temporary directory and the MediaWiki fixture file."""
        cls.temp_dir = tempfile.mkdtemp(prefix="hed_extras_wiki_test_")

        try:
            # Path to testlib 4.0.0 XML - we'll convert it to MediaWiki for testing
            cls.testlib_4_xml_path = os.path.normpath(
                os.path.join(os.path.dirname(__file__), "../data/schema_tests/test_merge/HED_testlib_4.0.0.xml")
            )

            # Create MediaWiki version for testing
            schema = load_schema(cls.testlib_4_xml_path)
            cls.testlib_4_wiki_path = os.path.join(cls.temp_dir, "HED_testlib_4.0.0.mediawiki")
            schema.save_as_mediawiki(cls.testlib_4_wiki_path, save_merged=False)
        except Exception:
            # unittest does not call tearDownClass when setUpClass raises, so
            # remove the directory here to avoid leaking it, then re-raise.
            shutil.rmtree(cls.temp_dir, ignore_errors=True)
            raise

    @classmethod
    def tearDownClass(cls):
        """Clean up temporary directory."""
        if os.path.exists(cls.temp_dir):
            shutil.rmtree(cls.temp_dir)

    @staticmethod
    def _library_rows(extras_df):
        """Return the rows whose in_library value is neither missing nor an empty string."""
        in_library = extras_df[df_constants.in_library]
        return extras_df[in_library.notna() & (in_library != "")]

    @staticmethod
    def _sorted_for_compare(extras_df):
        """Return extras_df sorted by all of its columns with a fresh 0..n-1 index."""
        return extras_df.sort_values(by=list(extras_df.columns)).reset_index(drop=True)

    def test_read_unmerged_library_extras_has_in_library_column(self):
        """Test that reading unmerged library schema adds in_library column to extras."""
        schema = load_schema(self.testlib_4_wiki_path)

        # Verify schema properties
        self.assertEqual(schema.library, "testlib")
        self.assertEqual(schema.version_number, "4.0.0")
        self.assertEqual(schema.with_standard, "8.4.0")
        self.assertFalse(schema.merged)  # unmerged=True in MediaWiki

        # (extras key, label used in the section messages, label used in the per-entry message)
        sections = [
            (df_constants.SOURCES_KEY, "Sources", "Sources"),
            (df_constants.PREFIXES_KEY, "Prefixes", "Prefixes"),
            (df_constants.EXTERNAL_ANNOTATION_KEY, "External annotations", "External annotation"),
        ]
        for extras_key, label, entry_label in sections:
            extras_df = schema.get_extras(extras_key)
            self.assertIsNotNone(extras_df, f"{label} should not be None")
            self.assertFalse(extras_df.empty, f"{label} should not be empty")
            self.assertIn(df_constants.in_library, extras_df.columns, f"{label} should have in_library column")
            # Verify all entries have in_library = 'testlib'
            self.assertTrue(
                (extras_df[df_constants.in_library] == "testlib").all(),
                f"All {entry_label} entries should have in_library='testlib'",
            )

    def test_read_merged_schema_has_mixed_in_library(self):
        """Test that merged library schema properly tracks library entries with in_library column.

        Note: Standard schema 8.4.0 may not have extras sections, so we only verify library entries exist.
        """
        # Load from MediaWiki (auto-merges with standard 8.4.0)
        schema = load_schema(self.testlib_4_wiki_path)

        sections = [
            (df_constants.SOURCES_KEY, "Should have at least one library Source"),
            (df_constants.PREFIXES_KEY, "Should have at least one library Prefix"),
            (df_constants.EXTERNAL_ANNOTATION_KEY, "Should have at least one library External annotation"),
        ]
        for extras_key, message in sections:
            extras_df = schema.get_extras(extras_key)
            # Sections that are absent or empty are not checked
            if extras_df is None or extras_df.empty:
                continue

            # At minimum, library entries should be present with in_library column
            self.assertIn(df_constants.in_library, extras_df.columns)
            self.assertGreater(len(self._library_rows(extras_df)), 0, message)

    def test_write_unmerged_only_outputs_library_extras(self):
        """Test that saving unmerged only outputs extras with in_library column (merged schema saved as unmerged)."""
        # Load schema - it will auto-merge with standard 8.4.0
        merged_schema = load_schema(self.testlib_4_wiki_path)

        # Save the MERGED schema as unmerged - should only output library entries
        output_path = os.path.join(self.temp_dir, "testlib_merged_saved_as_unmerged.mediawiki")
        merged_schema.save_as_mediawiki(output_path, save_merged=False)

        # Reload and verify
        reloaded_schema = load_schema(output_path)

        sections = [
            (df_constants.SOURCES_KEY, "Unmerged save should only contain library Sources"),
            (df_constants.PREFIXES_KEY, "Unmerged save should only contain library Prefixes"),
            (df_constants.EXTERNAL_ANNOTATION_KEY, "Unmerged save should only contain library External annotations"),
        ]
        for extras_key, message in sections:
            extras_df = reloaded_schema.get_extras(extras_key)
            if extras_df is None or extras_df.empty:
                continue

            # All entries should have in_library column
            self.assertIn(df_constants.in_library, extras_df.columns)
            # All should be library entries (standard entries filtered out)
            self.assertTrue((extras_df[df_constants.in_library] == "testlib").all(), message)

    def test_write_merged_outputs_all_extras(self):
        """Test that saving merged outputs all extras with inLibrary attributes."""
        # Load schema - auto-merges with standard 8.4.0
        merged_schema = load_schema(self.testlib_4_wiki_path)

        # Save as merged
        output_path = os.path.join(self.temp_dir, "testlib_merged.mediawiki")
        merged_schema.save_as_mediawiki(output_path, save_merged=True)

        # Reload and verify
        reloaded_schema = load_schema(output_path)

        # Should have all extras (library + standard if present)
        sections = [
            (df_constants.SOURCES_KEY, "Merged save should contain library Sources"),
            (df_constants.PREFIXES_KEY, "Merged save should contain library Prefixes"),
            (df_constants.EXTERNAL_ANNOTATION_KEY, "Merged save should contain library External annotations"),
        ]
        for extras_key, message in sections:
            extras_df = reloaded_schema.get_extras(extras_key)
            if extras_df is None or extras_df.empty:
                continue

            self.assertIn(df_constants.in_library, extras_df.columns)
            # Should have library entries with in_library='testlib'
            library_count = len(extras_df[extras_df[df_constants.in_library] == "testlib"])
            self.assertGreater(library_count, 0, message)

    def test_roundtrip_unmerged_preserves_library_extras(self):
        """Test that unmerged roundtrip preserves all library extras."""
        # Load original
        original_schema = load_schema(self.testlib_4_wiki_path)

        # Save as unmerged
        temp_path = os.path.join(self.temp_dir, "roundtrip_unmerged.mediawiki")
        original_schema.save_as_mediawiki(temp_path, save_merged=False)

        # Reload
        roundtrip_schema = load_schema(temp_path)

        # Compare extras
        for extras_key in [df_constants.SOURCES_KEY, df_constants.PREFIXES_KEY, df_constants.EXTERNAL_ANNOTATION_KEY]:
            orig_df = original_schema.get_extras(extras_key)
            roundtrip_df = roundtrip_schema.get_extras(extras_key)

            if orig_df is None or orig_df.empty:
                continue

            self.assertIsNotNone(roundtrip_df, f"{extras_key} should not be None after roundtrip")
            self.assertFalse(roundtrip_df.empty, f"{extras_key} should not be empty after roundtrip")

            # Compare content (drop in_library for comparison as it's set automatically)
            orig_compare = orig_df.drop(columns=[df_constants.in_library], errors="ignore").fillna("")
            roundtrip_compare = roundtrip_df.drop(columns=[df_constants.in_library], errors="ignore").fillna("")

            # Sort for consistent comparison
            orig_compare = self._sorted_for_compare(orig_compare)
            roundtrip_compare = self._sorted_for_compare(roundtrip_compare)

            self.assertTrue(
                orig_compare.equals(roundtrip_compare), f"{extras_key} content should match after unmerged roundtrip"
            )

    def test_roundtrip_merged_preserves_all_extras(self):
        """Test that merged roundtrip preserves all extras with inLibrary tracking."""
        # Load original (auto-merges)
        original_schema = load_schema(self.testlib_4_wiki_path)

        # Save as merged
        temp_path = os.path.join(self.temp_dir, "roundtrip_merged.mediawiki")
        original_schema.save_as_mediawiki(temp_path, save_merged=True)

        # Reload
        roundtrip_schema = load_schema(temp_path)

        # Compare extras
        for extras_key in [df_constants.SOURCES_KEY, df_constants.PREFIXES_KEY, df_constants.EXTERNAL_ANNOTATION_KEY]:
            orig_df = original_schema.get_extras(extras_key)
            roundtrip_df = roundtrip_schema.get_extras(extras_key)

            if orig_df is None or orig_df.empty:
                continue

            self.assertIsNotNone(roundtrip_df, f"{extras_key} should not be None after roundtrip")

            # Compare including in_library column
            orig_compare = orig_df.fillna("").astype(str)
            roundtrip_compare = roundtrip_df.fillna("").astype(str)

            # Sort for consistent comparison
            orig_compare = self._sorted_for_compare(orig_compare)
            roundtrip_compare = self._sorted_for_compare(roundtrip_compare)

            self.assertTrue(
                orig_compare.equals(roundtrip_compare), f"{extras_key} content should match after merged roundtrip"
            )

    def test_merged_wiki_contains_inLibrary_attribute(self):
        """Test that merged MediaWiki output contains inLibrary= in extras sections.

        The Sources section of the saved file is located textually. For every
        distinct in_library value carried by the loaded schema's library Sources
        rows, the text ``<in_library column name>=<value>`` must appear in that
        section. The test is reported as skipped, rather than silently passing,
        when there is no Sources section or no library-tracked row to check.
        """
        # Load and merge
        merged_schema = load_schema(self.testlib_4_wiki_path)

        # Save as merged
        output_path = os.path.join(self.temp_dir, "testlib_merged_check.mediawiki")
        merged_schema.save_as_mediawiki(output_path, save_merged=True)

        # Read file and check for inLibrary in extras sections
        with open(output_path, "r", encoding="utf-8") as f:
            content = f.read()

        sources_header = "'''Sources'''"
        sources_start = content.find(sources_header)
        if sources_start == -1:
            self.skipTest("Merged output has no Sources section to check")

        # The section runs up to the next ''' marker, or to end of file if there is none
        sources_end = content.find("'''", sources_start + len(sources_header))
        if sources_end == -1:
            sources_end = len(content)
        sources_section = content[sources_start:sources_end]

        sources_df = merged_schema.get_extras(df_constants.SOURCES_KEY)
        if sources_df is None or sources_df.empty or df_constants.in_library not in sources_df.columns:
            self.skipTest("Loaded schema has no library-tracked Sources to check")

        library_values = self._library_rows(sources_df)[df_constants.in_library].unique()
        if len(library_values) == 0:
            # All entries are standard entries, so no inLibrary attribute is expected
            self.skipTest("Loaded schema has no library Sources entries")

        for library_value in library_values:
            self.assertIn(
                f"{df_constants.in_library}={library_value}",
                sources_section,
                "Merged save should write the inLibrary attribute for library Sources",
            )
287+
288+
289+
if __name__ == "__main__":
    # Run the tests in this module when the file is executed as a script.
    unittest.main()

0 commit comments

Comments
 (0)