|
1 | | -""" Column type for a column in a ColumnMapper. """ |
2 | | -from enum import Enum |
3 | | -from hed.errors.error_types import SidecarErrors |
4 | | -import pandas as pd |
5 | | -import copy |
6 | | - |
7 | | - |
class ColumnType(Enum):
    """ How a column mapper should treat a single column.

    Used mostly by column mapper related code rather than by end users.

    Members:
        Unknown: No type has been determined; its value is None.
        Ignore: The column is not returned at all.
        Categorical: The column holds category values, each replaced by a HED string.
        Value: The column holds a value (e.g. a filename) substituted for the # sign in a HED tag.
        HEDTags: The column already holds HED tags and is returned exactly as given.
    """
    # Type not determined.
    Unknown = None
    # Column is dropped from the output.
    Ignore = "ignore"
    # Each possible cell value maps to its own HED string.
    Categorical = "categorical"
    # The cell value replaces the # placeholder of a single HED string.
    Value = "value"
    # The cell content is HED tags and is passed through unchanged.
    HEDTags = "hed_tags"
22 | | - |
23 | | - |
class ColumnMetadata:
    """ Column in a ColumnMapper.

    The HED information comes from a source that is either a plain HED string (or None),
    or a loaded json sidecar dict that is indexed by the column name.
    """

    def __init__(self, column_type=None, name=None, source=None):
        """ A single column entry in the column mapper.

        Parameters:
            column_type (ColumnType or None): How to treat this column when reading data.
                If None, the type is detected from the source.
            name (str, int, or None): The column_name or column number identifying this column.
                If name is a string, you'll need to use a column map to set the number later.
            source (dict or str or None): Either the entire loaded json sidecar or a single HED string.
        """
        self.column_name = name
        self._source = source
        if column_type is None:
            column_type = self._detect_column_type(self.source_dict)
        self.column_type = column_type

    @property
    def hed_dict(self):
        """ The HED strings for any given entry.

        Returns:
            dict, str, or None: The source itself when it is a string or None, otherwise the "HED"
                entry of this column in the sidecar (an empty dict if that entry is missing).

        """
        if self._source is None or isinstance(self._source, str):
            return self._source
        return self._source[self.column_name].get("HED", {})

    @property
    def source_dict(self):
        """ The raw dict for this entry(if it exists).

        Returns:
            dict: The sidecar entry for this column.  A string or None source is
                wrapped as {"HED": source} so callers always get the same shape.
        """
        if self._source is None or isinstance(self._source, str):
            return {"HED": self._source}
        return self._source[self.column_name]

    def get_hed_strings(self):
        """ Return the HED strings for this entry as a series.

        Returns:
            pd.Series: The HED strings for this series.(potentially empty).
        """
        if not self.column_type:
            return pd.Series(dtype=str)

        return pd.Series(self.hed_dict, dtype=str)

    def set_hed_strings(self, new_strings):
        """ Set the HED strings for this entry.

        Parameters:
            new_strings(pd.Series, dict, or str): The HED strings to set.
                This should generally be the return value from get_hed_strings.

        Returns:
            bool: True if the strings were stored, False if nothing was changed.
        """
        if new_strings is None:
            return False

        if not self.column_type:
            return False

        if isinstance(new_strings, pd.Series):
            if self.column_type == ColumnType.Categorical:
                new_strings = new_strings.to_dict()
            elif new_strings.empty:
                return False
            else:
                # Non-categorical columns carry a single string.
                new_strings = new_strings.iloc[0]

        if self._source is None or isinstance(self._source, str):
            # There is no sidecar entry to write into: the source itself is the HED string
            # (see hed_dict), so only another plain string can replace it.
            if not isinstance(new_strings, str):
                return False
            self._source = new_strings
            return True

        self._source[self.column_name]["HED"] = new_strings

        return True

    @staticmethod
    def _detect_column_type(dict_for_entry, basic_validation=True):
        """ Determine the ColumnType of a given json entry.

        Parameters:
            dict_for_entry (dict): The loaded json entry a specific column.
                Generally has a "HED" entry among other optional ones.
            basic_validation (bool): If False, does not verify past "HED" exists and the type
                This is used to issue more precise errors that are normally just silently ignored,
                but also not crash.
        Returns:
            ColumnType: The determined type of given column.  Returns None if unknown.

        """
        # Nothing usable at all, or no "HED" key: the column is ignored.
        if not dict_for_entry or not isinstance(dict_for_entry, dict):
            return ColumnType.Ignore

        if "HED" not in dict_for_entry:
            return ColumnType.Ignore

        hed_entry = dict_for_entry["HED"]
        if isinstance(hed_entry, dict):
            # A dict of HED strings is a categorical column; with validation every value must be a string.
            if basic_validation and not all(isinstance(entry, str) for entry in hed_entry.values()):
                return None
            return ColumnType.Categorical

        if not isinstance(hed_entry, str):
            return None

        # With validation, a value column must contain the # placeholder.
        if basic_validation and "#" not in hed_entry:
            return None

        return ColumnType.Value

    @staticmethod
    def expected_pound_sign_count(column_type):
        """ Return how many pound signs a column string should have.

        Parameters:
            column_type(ColumnType): The type of the column.

        Returns:
            tuple:
                expected_count(int): The expected count.  0 or 1.
                error_type(str or None): The type of the error we should issue.
                    None for column types other than Value, HEDTags and Categorical.
        """
        if column_type == ColumnType.Value:
            return 1, SidecarErrors.INVALID_POUND_SIGNS_VALUE
        if column_type == ColumnType.HEDTags or column_type == ColumnType.Categorical:
            return 0, SidecarErrors.INVALID_POUND_SIGNS_CATEGORY
        return 0, None

    def _get_unvalidated_data(self):
        """Returns a copy with less preliminary validation done(such as verifying all data types)"""
        return_copy = copy.deepcopy(self)
        # Re-detect the type on the copy with the stricter checks switched off.
        return_copy.column_type = ColumnMetadata._detect_column_type(dict_for_entry=return_copy.source_dict,
                                                                     basic_validation=False)
        return return_copy
| 1 | +""" Column type for a column in a ColumnMapper. """ |
| 2 | +from enum import Enum |
| 3 | +from typing import Union |
| 4 | + |
| 5 | +from hed.errors.error_types import SidecarErrors |
| 6 | +import pandas as pd |
| 7 | +import copy |
| 8 | + |
| 9 | + |
class ColumnType(Enum):
    """ How a column mapper should treat a single column.

    Used mostly by column mapper related code rather than by end users.

    Members:
        Unknown: No type has been determined; its value is None.
        Ignore: The column is not returned at all.
        Categorical: The column holds category values, each replaced by a HED string.
        Value: The column holds a value (e.g. a filename) substituted for the # sign in a HED tag.
        HEDTags: The column already holds HED tags and is returned exactly as given.
    """
    # Type not determined.
    Unknown = None
    # Column is dropped from the output.
    Ignore = "ignore"
    # Each possible cell value maps to its own HED string.
    Categorical = "categorical"
    # The cell value replaces the # placeholder of a single HED string.
    Value = "value"
    # The cell content is HED tags and is passed through unchanged.
    HEDTags = "hed_tags"
| 24 | + |
| 25 | + |
class ColumnMetadata:
    """ Column in a ColumnMapper.

    The HED information comes from a source that is either a plain HED string (or None),
    or a loaded json sidecar dict that is indexed by the column name.
    """

    def __init__(self, column_type=None, name=None, source=None):
        """ A single column entry in the column mapper.

        Parameters:
            column_type (ColumnType or None): How to treat this column when reading data.
                If None, the type is detected from the source.
            name (str, int, or None): The column_name or column number identifying this column.
                If name is a string, you'll need to use a column map to set the number later.
            source (dict or str or None): Either the entire loaded json sidecar or a single HED string.
        """
        self.column_name = name
        self._source = source
        if column_type is None:
            column_type = self._detect_column_type(self.source_dict)
        self.column_type = column_type

    @property
    def hed_dict(self) -> Union[dict, str, None]:
        """ The HED strings for any given entry.

        Returns:
            Union[dict, str, None]: The source itself when it is a string or None, otherwise the "HED"
                entry of this column in the sidecar (an empty dict if that entry is missing).

        """
        if self._source is None or isinstance(self._source, str):
            return self._source
        return self._source[self.column_name].get("HED", {})

    @property
    def source_dict(self) -> dict:
        """ The raw dict for this entry(if it exists).

        Returns:
            dict: The sidecar entry for this column.  A string or None source is
                wrapped as {"HED": source} so callers always get the same shape.
        """
        if self._source is None or isinstance(self._source, str):
            return {"HED": self._source}
        return self._source[self.column_name]

    def get_hed_strings(self) -> pd.Series:
        """ Return the HED strings for this entry as a series.

        Returns:
            pd.Series: The HED strings for this series.(potentially empty).
        """
        if not self.column_type:
            return pd.Series(dtype=str)

        return pd.Series(self.hed_dict, dtype=str)

    def set_hed_strings(self, new_strings) -> bool:
        """ Set the HED strings for this entry.

        Parameters:
            new_strings (pd.Series, dict, or str): The HED strings to set.
                This should generally be the return value from get_hed_strings.

        Returns:
            bool: True if the strings were successfully set, False otherwise.
        """
        if new_strings is None:
            return False

        if not self.column_type:
            return False

        if isinstance(new_strings, pd.Series):
            if self.column_type == ColumnType.Categorical:
                new_strings = new_strings.to_dict()
            elif new_strings.empty:
                return False
            else:
                # Non-categorical columns carry a single string.
                new_strings = new_strings.iloc[0]

        if self._source is None or isinstance(self._source, str):
            # There is no sidecar entry to write into: the source itself is the HED string
            # (see hed_dict), so only another plain string can replace it.
            if not isinstance(new_strings, str):
                return False
            self._source = new_strings
            return True

        self._source[self.column_name]["HED"] = new_strings

        return True

    @staticmethod
    def _detect_column_type(dict_for_entry, basic_validation=True):
        """ Determine the ColumnType of a given json entry.

        Parameters:
            dict_for_entry (dict): The loaded json entry a specific column.
                Generally has a "HED" entry among other optional ones.
            basic_validation (bool): If False, does not verify past "HED" exists and the type
                This is used to issue more precise errors that are normally just silently ignored,
                but also not crash.
        Returns:
            ColumnType: The determined type of given column.  Returns None if unknown.

        """
        # Nothing usable at all, or no "HED" key: the column is ignored.
        if not dict_for_entry or not isinstance(dict_for_entry, dict):
            return ColumnType.Ignore

        if "HED" not in dict_for_entry:
            return ColumnType.Ignore

        hed_entry = dict_for_entry["HED"]
        if isinstance(hed_entry, dict):
            # A dict of HED strings is a categorical column; with validation every value must be a string.
            if basic_validation and not all(isinstance(entry, str) for entry in hed_entry.values()):
                return None
            return ColumnType.Categorical

        if not isinstance(hed_entry, str):
            return None

        # With validation, a value column must contain the # placeholder.
        if basic_validation and "#" not in hed_entry:
            return None

        return ColumnType.Value

    @staticmethod
    def expected_pound_sign_count(column_type) -> tuple[int, Union[str, None]]:
        """ Return how many pound signs a column string should have.

        Parameters:
            column_type (ColumnType): The type of the column.

        Returns:
            tuple[int, Union[str, None]]:
            - The expected count: 0 or 1.
            - The type of the error we should issue, or None for column types
              other than Value, HEDTags and Categorical.
        """
        if column_type == ColumnType.Value:
            return 1, SidecarErrors.INVALID_POUND_SIGNS_VALUE
        if column_type == ColumnType.HEDTags or column_type == ColumnType.Categorical:
            return 0, SidecarErrors.INVALID_POUND_SIGNS_CATEGORY
        return 0, None

    def _get_unvalidated_data(self):
        """Returns a copy with less preliminary validation done(such as verifying all data types)"""
        return_copy = copy.deepcopy(self)
        # Re-detect the type on the copy with the stricter checks switched off.
        return_copy.column_type = ColumnMetadata._detect_column_type(dict_for_entry=return_copy.source_dict,
                                                                     basic_validation=False)
        return return_copy
0 commit comments