@@ -1025,30 +1025,24 @@ def extract_keywords_from(parts: list[str]) -> defaultdict[str, str]:
10251025
10261026
def extract_relevant_sections(wikitext: str, locale: str) -> str:
    """Extract relevant sections for the chosen locale from a given wikitext.

    A line is treated as a section heading when it starts with exactly
    ``lang.section_level[locale]`` equals signs followed by some other
    character. Each heading switches extraction on or off depending on
    whether it matches one of ``lang.head_sections[locale]``; headings with
    more equals signs do not change the current state.

    Args:
        wikitext: The raw wikitext to filter.
        locale: Key used to look up the section level and the head sections.

    Returns:
        The kept lines (whitespace-stripped, blank lines dropped) joined with
        newlines, or an empty string when nothing matched.
    """
    level = lang.section_level[locale]
    equals = "=" * level

    # Section titles are matched case-insensitively; kept lines retain their
    # original casing because the text itself is never lower-cased.
    interesting_sections = [
        re.compile(rf"{equals}[ ]*{re.escape(section)}[ ]*{equals}", flags=re.IGNORECASE)
        for section in lang.head_sections[locale]
    ]

    cleaned: list[str] = []
    in_expected_section = False
    for raw_line in wikitext.splitlines():
        if not (line := raw_line.strip()):
            continue
        # The length guard prevents an IndexError on a line made only of
        # `level` equals signs (e.g. a stray "=="); such a line is not a
        # heading and is handled like any other content line.
        if len(line) > level and line.startswith(equals) and line[level] != "=":
            in_expected_section = any(pattern.match(line) for pattern in interesting_sections)
        if in_expected_section:
            cleaned.append(line)
    # Joining an empty list already yields "", so no special case is needed.
    return "\n".join(cleaned)
10531047
10541048
0 commit comments