Skip to content

Commit f2c52f4

Browse files
committed
Deal consistently with line endings on schemas
1 parent 5309486 commit f2c52f4

5 files changed

Lines changed: 328 additions & 6 deletions

File tree

hed/models/base_input.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,9 @@ def to_csv(self, file=None) -> Union[str, None]:
262262
OSError: If the file cannot be opened.
263263
"""
264264
dataframe = self._dataframe
265-
csv_string_if_filename_none = dataframe.to_csv(file, sep="\t", index=False, header=self._has_column_names)
265+
csv_string_if_filename_none = dataframe.to_csv(
266+
file, sep="\t", index=False, header=self._has_column_names, lineterminator="\n"
267+
)
266268
return csv_string_if_filename_none
267269

268270
@property

hed/schema/hed_schema.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ def save_as_mediawiki(self, filename, save_merged=False):
354354
OSError: File cannot be saved for some reason.
355355
"""
356356
output_strings = Schema2Wiki().process_schema(self, save_merged)
357-
with open(filename, mode="w", encoding="utf-8") as opened_file:
357+
with open(filename, mode="w", encoding="utf-8", newline="") as opened_file:
358358
for string in output_strings:
359359
opened_file.write(string)
360360
opened_file.write("\n")
@@ -371,7 +371,7 @@ def save_as_xml(self, filename, save_merged=True):
371371
OSError: File cannot be saved for some reason.
372372
"""
373373
xml_tree = Schema2XML().process_schema(self, save_merged)
374-
with open(filename, mode="w", encoding="utf-8") as opened_file:
374+
with open(filename, mode="w", encoding="utf-8", newline="") as opened_file:
375375
xml_string = schema_util.xml_element_2_str(xml_tree)
376376
opened_file.write(xml_string)
377377

@@ -388,7 +388,7 @@ def save_as_json(self, filename, save_merged=True):
388388
"""
389389
converter = Schema2JSON()
390390
converter.process_schema(self, save_merged)
391-
with open(filename, mode="w", encoding="utf-8") as opened_file:
391+
with open(filename, mode="w", encoding="utf-8", newline="") as opened_file:
392392
opened_file.write(converter.to_json_string(indent=2))
393393

394394
def save_as_dataframes(self, base_filename, save_merged=False):

hed/schema/schema_io/df_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def save_dataframes(base_filename, dataframe_dict):
113113
os.makedirs(base_dir, exist_ok=True)
114114
for suffix, dataframe in dataframe_dict.items():
115115
filename = f"{base}_{suffix}.tsv"
116-
with open(filename, mode="w", encoding="utf-8") as opened_file:
116+
with open(filename, mode="w", encoding="utf-8", newline="") as opened_file:
117117
dataframe.to_csv(opened_file, sep="\t", index=False, header=True, quoting=csv.QUOTE_NONE, lineterminator="\n")
118118

119119

hed/schema/schema_io/schema_util.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ def xml_element_2_str(elem):
8989
"""
9090
rough_string = ElementTree.tostring(elem, method="xml")
9191
parsed = minidom.parseString(rough_string)
92-
return parsed.toprettyxml(indent=" ")
92+
xml_string = parsed.toprettyxml(indent=" ", newl="\n")
93+
return xml_string
9394

9495

9596
def schema_version_greater_equal(hed_schema, target_version):

tests/schema/test_hed_schema_io_df.py

Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,325 @@ def test_unit_columns_with_has_unit_class(self):
351351
for value in has_unit_class_values:
352352
self.assertFalse(value.startswith("hed:HED_"), f"hasUnitClass should contain names, not IDs: {value}")
353353

354+
def test_tsv_output_uses_lf_line_endings(self):
    """Verify that TSV files written by save_as_dataframes use LF (\\n), never CRLF (\\r\\n)."""
    import tempfile
    from tests.schema.util_create_schemas import load_schema1

    source_schema = load_schema1()

    with tempfile.TemporaryDirectory() as work_dir:
        base_path = os.path.join(work_dir, "test_schema.tsv")
        source_schema.save_as_dataframes(base_path)

        # The tag table is expected next to the base name with a "_Tag" suffix.
        tag_path = base_path.replace(".tsv", "_Tag.tsv")
        self.assertTrue(os.path.exists(tag_path), "Tag TSV file should exist")

        # Binary mode performs no newline translation, so these are the bytes on disk.
        with open(tag_path, "rb") as handle:
            raw_bytes = handle.read()

        self.assertNotIn(b"\r\n", raw_bytes, "File should not contain CRLF line endings")
        self.assertIn(b"\n", raw_bytes, "File should contain LF line endings")
377+
378+
def test_tsv_reading_handles_both_line_endings(self):
    """Test that TSV schemas load identically from LF and CRLF files.

    The schema is saved once (LF endings), every TSV file that the save
    produced is copied with LF rewritten to CRLF under a second base name,
    and both sets of files are then loaded and compared for equality.
    """
    from tests.schema.util_create_schemas import load_schema1
    from hed.schema import load_schema
    import tempfile

    schema = load_schema1()

    with tempfile.TemporaryDirectory() as tmpdir:
        # Save the schema with LF endings (our standard)
        lf_path = os.path.join(tmpdir, "lf_schema.tsv")
        schema.save_as_dataframes(lf_path)
        crlf_path = os.path.join(tmpdir, "crlf_schema.tsv")

        # Mirror every file the save actually produced instead of a fixed
        # suffix list, so no table can be silently left out of the CRLF copy.
        lf_prefix = "lf_schema_"
        lf_names = sorted(
            name for name in os.listdir(tmpdir) if name.startswith(lf_prefix) and name.endswith(".tsv")
        )
        self.assertIn("lf_schema_Tag.tsv", lf_names, "Tag TSV file should exist")

        for lf_name in lf_names:
            crlf_name = "crlf_schema_" + lf_name[len(lf_prefix):]
            # Binary mode on both sides: no newline translation may interfere.
            with open(os.path.join(tmpdir, lf_name), "rb") as f:
                lf_content = f.read()
            with open(os.path.join(tmpdir, crlf_name), "wb") as f:
                f.write(lf_content.replace(b"\n", b"\r\n"))

        # Guard against a vacuous comparison: the copy must really use CRLF.
        with open(crlf_path.replace(".tsv", "_Tag.tsv"), "rb") as f:
            self.assertIn(b"\r\n", f.read(), "CRLF copy should contain CRLF line endings")

        # Both should load successfully
        lf_schema = load_schema(lf_path)
        crlf_schema = load_schema(crlf_path)

        # And they should be equivalent
        self.assertEqual(lf_schema, crlf_schema, "Schemas with different line endings should be equivalent")
433+
434+
def test_xml_output_uses_lf_line_endings(self):
    """Check that a schema saved as XML contains LF line endings and no CRLF."""
    import tempfile

    with tempfile.TemporaryDirectory() as work_dir:
        target = os.path.join(work_dir, "test_schema.xml")
        self.schema.save_as_xml(target)

        # Inspect raw bytes so that no newline translation can hide a CRLF.
        with open(target, "rb") as handle:
            raw_bytes = handle.read()

        self.assertNotIn(b"\r\n", raw_bytes, "XML file should not contain CRLF line endings")
        self.assertIn(b"\n", raw_bytes, "XML file should contain LF line endings")
452+
453+
def test_mediawiki_output_uses_lf_line_endings(self):
    """Check that a schema saved as MediaWiki contains LF line endings and no CRLF."""
    import tempfile

    with tempfile.TemporaryDirectory() as work_dir:
        target = os.path.join(work_dir, "test_schema.mediawiki")
        self.schema.save_as_mediawiki(target)

        # Inspect raw bytes so that no newline translation can hide a CRLF.
        with open(target, "rb") as handle:
            raw_bytes = handle.read()

        self.assertNotIn(b"\r\n", raw_bytes, "MediaWiki file should not contain CRLF line endings")
        self.assertIn(b"\n", raw_bytes, "MediaWiki file should contain LF line endings")
471+
472+
def test_json_output_uses_lf_line_endings(self):
    """Check that a schema saved as JSON contains LF line endings and no CRLF."""
    import tempfile

    with tempfile.TemporaryDirectory() as work_dir:
        target = os.path.join(work_dir, "test_schema.json")
        self.schema.save_as_json(target)

        # Inspect raw bytes so that no newline translation can hide a CRLF.
        with open(target, "rb") as handle:
            raw_bytes = handle.read()

        self.assertNotIn(b"\r\n", raw_bytes, "JSON file should not contain CRLF line endings")
        self.assertIn(b"\n", raw_bytes, "JSON file should contain LF line endings")
490+
491+
def test_xml_library_schema_uses_lf(self):
    """Test that library schemas saved as XML use LF line endings.

    Both the merged and the unmerged save are exercised. Each variant runs
    in its own subTest so a failure in one does not hide the other.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        # Load a library schema
        lib_schema = load_schema_version("testlib_3.0.0")

        for save_merged in [True, False]:
            with self.subTest(save_merged=save_merged):
                xml_path = os.path.join(tmpdir, f"testlib_merged_{save_merged}.xml")
                lib_schema.save_as_xml(xml_path, save_merged=save_merged)

                # Binary read: the bytes exactly as written to disk.
                with open(xml_path, "rb") as f:
                    content = f.read()

                self.assertNotIn(
                    b"\r\n",
                    content,
                    f"XML library schema (save_merged={save_merged}) should not contain CRLF",
                )
                self.assertIn(b"\n", content, "XML file should contain LF line endings")
513+
514+
def test_mediawiki_library_schema_uses_lf(self):
    """Test that library schemas saved as MediaWiki use LF line endings.

    Both the merged and the unmerged save are exercised. Each variant runs
    in its own subTest so a failure in one does not hide the other.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        # Load a library schema
        lib_schema = load_schema_version("testlib_3.0.0")

        for save_merged in [True, False]:
            with self.subTest(save_merged=save_merged):
                wiki_path = os.path.join(tmpdir, f"testlib_merged_{save_merged}.mediawiki")
                lib_schema.save_as_mediawiki(wiki_path, save_merged=save_merged)

                # Binary read: the bytes exactly as written to disk.
                with open(wiki_path, "rb") as f:
                    content = f.read()

                self.assertNotIn(
                    b"\r\n",
                    content,
                    f"MediaWiki library schema (save_merged={save_merged}) should not contain CRLF",
                )
                self.assertIn(b"\n", content, "MediaWiki file should contain LF line endings")
536+
537+
def test_json_library_schema_uses_lf(self):
    """Test that library schemas saved as JSON use LF line endings.

    Both the merged and the unmerged save are exercised. Each variant runs
    in its own subTest so a failure in one does not hide the other.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        # Load a library schema
        lib_schema = load_schema_version("testlib_3.0.0")

        for save_merged in [True, False]:
            with self.subTest(save_merged=save_merged):
                json_path = os.path.join(tmpdir, f"testlib_merged_{save_merged}.json")
                lib_schema.save_as_json(json_path, save_merged=save_merged)

                # Binary read: the bytes exactly as written to disk.
                with open(json_path, "rb") as f:
                    content = f.read()

                self.assertNotIn(
                    b"\r\n", content, f"JSON library schema (save_merged={save_merged}) should not contain CRLF"
                )
                self.assertIn(b"\n", content, "JSON file should contain LF line endings")
555+
556+
def test_tsv_library_schema_uses_lf(self):
    """Test that library schemas saved as TSV use LF line endings.

    Both the merged and the unmerged save are exercised, each in its own
    subTest. Every TSV file written for a variant is inspected, and the
    test fails if the save produced no TSV files at all.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        # Load a library schema
        lib_schema = load_schema_version("testlib_3.0.0")

        for save_merged in [True, False]:
            with self.subTest(save_merged=save_merged):
                tsv_path = os.path.join(tmpdir, f"testlib_merged_{save_merged}.tsv")
                lib_schema.save_as_dataframes(tsv_path, save_merged=save_merged)

                # Both variants share tmpdir, so select this variant's files by prefix.
                prefix = f"testlib_merged_{save_merged}_"
                tsv_names = sorted(
                    name for name in os.listdir(tmpdir) if name.startswith(prefix) and name.endswith(".tsv")
                )
                self.assertTrue(tsv_names, f"No TSV files were written (save_merged={save_merged})")

                for tsv_name in tsv_names:
                    suffix = tsv_name[len(prefix):-len(".tsv")]
                    with open(os.path.join(tmpdir, tsv_name), "rb") as f:
                        content = f.read()

                    self.assertNotIn(
                        b"\r\n",
                        content,
                        f"TSV library schema {suffix} file (save_merged={save_merged}) should not contain CRLF",
                    )
                    self.assertIn(b"\n", content, "TSV file should contain LF line endings")
581+
582+
def test_all_formats_roundtrip_with_lf(self):
    """Test that all formats can be saved and reloaded with LF line endings preserved.

    For XML, MediaWiki, JSON and TSV the schema is saved, reloaded, compared
    with the original, and the saved bytes are checked for CRLF. For TSV,
    every file produced by the save is checked.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        schema = load_schema_version("8.3.0")

        # Test XML
        xml_path = os.path.join(tmpdir, "test.xml")
        schema.save_as_xml(xml_path)
        reloaded_xml = load_schema(xml_path)
        self.assertEqual(schema, reloaded_xml, "XML schema should round-trip correctly")

        # Verify LF in saved file
        with open(xml_path, "rb") as f:
            self.assertNotIn(b"\r\n", f.read(), "Saved XML should use LF")

        # Test MediaWiki
        wiki_path = os.path.join(tmpdir, "test.mediawiki")
        schema.save_as_mediawiki(wiki_path)
        reloaded_wiki = load_schema(wiki_path)
        self.assertEqual(schema, reloaded_wiki, "MediaWiki schema should round-trip correctly")

        # Verify LF in saved file
        with open(wiki_path, "rb") as f:
            self.assertNotIn(b"\r\n", f.read(), "Saved MediaWiki should use LF")

        # Test JSON
        json_path = os.path.join(tmpdir, "test.json")
        schema.save_as_json(json_path)
        reloaded_json = load_schema(json_path)
        self.assertEqual(schema, reloaded_json, "JSON schema should round-trip correctly")

        # Verify LF in saved file
        with open(json_path, "rb") as f:
            self.assertNotIn(b"\r\n", f.read(), "Saved JSON should use LF")

        # Test TSV
        tsv_path = os.path.join(tmpdir, "test.tsv")
        schema.save_as_dataframes(tsv_path)
        reloaded_tsv = load_schema(tsv_path)
        self.assertEqual(schema, reloaded_tsv, "TSV schema should round-trip correctly")

        # Verify LF in every TSV file the save produced, not a fixed suffix
        # list, so a table cannot be skipped silently.
        tsv_prefix = "test_"
        tsv_names = sorted(
            name for name in os.listdir(tmpdir) if name.startswith(tsv_prefix) and name.endswith(".tsv")
        )
        self.assertTrue(tsv_names, "Saving as dataframes should produce TSV files")
        for tsv_name in tsv_names:
            suffix = tsv_name[len(tsv_prefix):-len(".tsv")]
            with open(os.path.join(tmpdir, tsv_name), "rb") as f:
                self.assertNotIn(b"\r\n", f.read(), f"Saved TSV {suffix} should use LF")
631+
632+
def test_no_carriage_return_anywhere_in_output(self):
    """Ensure no schema output format contains a single carriage-return byte."""
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        schema = load_schema_version("8.4.0")

        # Single-file formats, each paired with the bound save method that writes it.
        single_file_savers = (
            ("xml", schema.save_as_xml),
            ("mediawiki", schema.save_as_mediawiki),
            ("json", schema.save_as_json),
        )
        for ext, save_func in single_file_savers:
            file_path = os.path.join(tmpdir, f"test.{ext}")
            save_func(file_path)

            with open(file_path, "rb") as handle:
                cr_count = handle.read().count(b"\r")

            # Counting bare b"\r" also catches a lone CR, not only CRLF pairs.
            self.assertEqual(cr_count, 0, f"Format {ext} should have ZERO carriage return characters, found {cr_count}")

        # The TSV format writes several files derived from one base name.
        schema.save_as_dataframes(os.path.join(tmpdir, "test.tsv"))

        for file in os.listdir(tmpdir):
            if not (file.startswith("test") and file.endswith(".tsv")):
                continue
            with open(os.path.join(tmpdir, file), "rb") as handle:
                cr_count = handle.read().count(b"\r")

            self.assertEqual(
                cr_count, 0, f"TSV file {file} should have ZERO carriage return characters, found {cr_count}"
            )
672+
354673

355674
if __name__ == "__main__":
356675
unittest.main()

0 commit comments

Comments
 (0)