Skip to content

Commit ca39e8d

Browse files
committed
Revert "fix(utils): fix the regexp matching with problematic unicode casing check in extract_relevant_sections()"
This reverts commit c11b338. Related to #2684.
1 parent 0216bdf commit ca39e8d

1 file changed

Lines changed: 6 additions & 12 deletions

File tree

wikidict/utils.py

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1025,30 +1025,24 @@ def extract_keywords_from(parts: list[str]) -> defaultdict[str, str]:
10251025

10261026

10271027
def extract_relevant_sections(wikitext: str, locale: str) -> str:
1028-
"""Extract relevant sections for the chosen locale from a given wikitext.
1029-
1030-
Process: we check all lines in lower case but keep relevant ones in their original casing.
1031-
This is to workaround unicode diacritics being lost when working on Turkish, for instance.
1032-
See https://stackoverflow.com/q/79169550/1117028 for more details.
1033-
"""
1028+
"""Extract relevant sections for the chosen locale from a given wikitext."""
10341029
level = lang.section_level[locale]
10351030
equals = "=" * level
10361031

10371032
interesting_sections = [
1038-
re.compile(rf"{equals}[ ]*{re.escape(section)}[ ]*{equals}") for section in lang.head_sections[locale]
1033+
re.compile(rf"{equals}[ ]*{re.escape(section)}[ ]*{equals}", flags=re.IGNORECASE)
1034+
for section in lang.head_sections[locale]
10391035
]
10401036

10411037
cleaned: list[str] = []
10421038
in_expected_section = False
1043-
raw_lines = wikitext.splitlines()
1044-
raw_lines_lower = wikitext.lower().splitlines()
1045-
for idx, raw_line_lower in enumerate(raw_lines_lower):
1046-
if not (line := raw_line_lower.strip()):
1039+
for raw_line in wikitext.splitlines():
1040+
if not (line := raw_line.strip()):
10471041
continue
10481042
if line.startswith(equals) and line[level] != "=":
10491043
in_expected_section = any(pattern.match(line) for pattern in interesting_sections)
10501044
if in_expected_section:
1051-
cleaned.append(raw_lines[idx])
1045+
cleaned.append(line)
10521046
return "\n".join(cleaned) if cleaned else ""
10531047

10541048

0 commit comments

Comments
 (0)