racking-data-conversion/disaggregate_racking.py at main · laccore/racking-data-conversion · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# disaggregate_racking.py
#
# Convert tabular racking data with multiple core sections per row
# to a single section per row, preserving the cores' Location and Tube IDs.
#
# Input:
# CSV with Location, Tube/Container ID, Core 1, Core 2... headers in row 1.
# Each data row must have non-empty values for Location and Core 1. Tube/Container ID can be empty.
# A row may have additional non-empty values in Core 2 and beyond.
#
# Output:
# CSV with Location, Tube ID, Core headers in row 1, followed by corresponding data rows.

import csv, os, sys

def is_valid_row(row, row_number):
    """Return True if a racking data row can be disaggregated.

    A usable row has at least three cells (Location, Tube/Container ID,
    Core 1) with a non-empty Location and a non-empty Core 1. The
    Tube/Container ID cell may be empty. A message is printed for every
    row that is rejected.

    Args:
        row: List of cell strings for one CSV row.
        row_number: Row number to show in the printed messages.

    Returns:
        True if the row is usable, False if it should be skipped.

    Raises:
        AssertionError: If Core 1 is empty but a later core cell is not.
    """
    if len(row) < 3:
        print(f"Skipping row {row_number}: must have Location, Tube ID, and 1+ Core columns")
        return False
    if not row[0]:
        print(f"Skipping row {row_number}: Empty location")
        return False
    # row[1] aka Tube/Container ID can be empty
    if not row[2]:
        print(f"Skipping row {row_number}: Empty Core 1")
        # Bail on goofy formatting. Raised explicitly instead of with an
        # `assert` statement so the check still runs under `python -O`,
        # which strips asserts; the exception type is kept the same.
        if any(row[3:]):
            raise AssertionError(f"Empty Core 1 but valid cores exist beyond: {row[3:]}")
        return False
    return True

def create_single_section_rows(row):
    """Split one multi-core row into single-core rows.

    Args:
        row: List of cells: Location, Tube/Container ID, then one cell
            per core.

    Returns:
        A list of [location, tube_id, core] lists, one per non-empty
        core cell, in the order the cores appear in the row.
    """
    place = row[0]
    container = row[1]
    # Empty core cells are dropped; Location and Tube ID repeat on every
    # output row.
    return [[place, container, core] for core in row[2:] if len(core) != 0]

def disaggregate(inputfile, outputfile):
    """Convert one multi-core-per-row CSV into a one-core-per-row CSV.

    The first row of the input is treated as a header and skipped. Each
    remaining row is checked with is_valid_row(); usable rows are expanded
    with create_single_section_rows(). The output gets a fixed
    'Location', 'Tube ID', 'Core' header followed by the expanded rows.

    Args:
        inputfile: Path of the CSV file to read.
        outputfile: Path of the CSV file to write (UTF-8 with BOM).
    """
    all_rows = []
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires, so line breaks inside quoted cells are read
    # correctly.
    with open(inputfile, 'rt', newline='') as f:
        reader = csv.reader(f)
        for row_number, row in enumerate(reader, start=1):  # 1-based row numbers
            if row_number == 1:
                continue  # skip header
            if is_valid_row(row, row_number):
                all_rows.extend(create_single_section_rows(row))

    # The output is opened only after the whole input has been read, so an
    # error raised while reading leaves no partial output file behind.
    with open(outputfile, 'wt', newline='', encoding='utf-8-sig') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['Location', 'Tube ID', 'Core'])
        writer.writerows(all_rows)

def convert_dirs(input_dir, output_dir):
    """Disaggregate every .csv file in input_dir into output_dir.

    For each file name ending in '.csv' in input_dir, writes
    '<name without extension>_one_per_row.csv' to output_dir using
    disaggregate(). A progress line is printed per file.

    Args:
        input_dir: Directory containing the input .csv files.
        output_dir: Directory in which the output .csv files are written.
    """
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
    input_abspath = os.path.abspath(input_dir)
    output_abspath = os.path.abspath(output_dir)

    for cf in csv_files:
        print(f'Disaggregating racking data in {cf}...')
        # Strip only the final extension. Cutting at the first dot would
        # map 'rack.1.csv' and 'rack.2.csv' to the same output name, and
        # one would overwrite the other.
        stem = os.path.splitext(cf)[0]
        output_file = f"{stem}_one_per_row.csv"
        disaggregate(os.path.join(input_abspath, cf), os.path.join(output_abspath, output_file))

if __name__ == "__main__":
    # Two arguments are expected: the input directory and the output
    # directory. Any further arguments are ignored.
    if len(sys.argv) >= 3:
        convert_dirs(sys.argv[1], sys.argv[2])
    else:
        print("Usage:\npython disaggregate_racking.py [dir containing input .csv files] [dir to write .csv output files]")