From 33a8990b87beb9c777b9ac50138565a56c81470d Mon Sep 17 00:00:00 2001 From: ashb155 Date: Tue, 14 Apr 2026 14:23:10 +0530 Subject: [PATCH 1/7] invalid-parameter-default fix --- pyproject.toml | 1 - src/scribe_data/cli/contracts/filter.py | 2 +- src/scribe_data/cli/convert.py | 6 +++--- src/scribe_data/cli/get.py | 12 +++++------ src/scribe_data/cli/list.py | 4 ++-- src/scribe_data/cli/total.py | 6 +++--- .../unicode/generate_emoji_keywords.py | 2 +- src/scribe_data/wikidata/parse_dump.py | 20 +++++++++---------- src/scribe_data/wikidata/query_data.py | 7 +++---- 9 files changed, 29 insertions(+), 31 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cfb415141..cec5d0442 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,7 +122,6 @@ include = ["src"] [tool.ty.rules] unresolved-import = "warn" -invalid-parameter-default = "warn" invalid-return-type = "warn" invalid-argument-type = "warn" invalid-assignment = "warn" diff --git a/src/scribe_data/cli/contracts/filter.py b/src/scribe_data/cli/contracts/filter.py index 41d33b6b4..85ee9c2ca 100644 --- a/src/scribe_data/cli/contracts/filter.py +++ b/src/scribe_data/cli/contracts/filter.py @@ -225,7 +225,7 @@ def filter_exported_data( def export_data_filtered_by_contracts( - contracts_dir: str = None, input_dir: str = None, output_dir: str = None + contracts_dir: str | None = None, input_dir: str | None = None, output_dir: str | None = None ) -> None: """ Export contract-filtered data to a new directory with a standardized structure. 
diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index a226d6993..c1664c986 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -25,7 +25,7 @@ def convert_to_json( data_type: Union[str, List[str]], output_type: str, input_file: str, - output_dir: str = None, + output_dir: str | None = None, overwrite: bool = False, identifier_case: str = "camel", ) -> None: @@ -180,7 +180,7 @@ def convert_to_csv_or_tsv( data_type: Union[str, List[str]], output_type: str, input_file: str, - output_dir: str = None, + output_dir: str | None = None, overwrite: bool = False, identifier_case: str = "camel", ) -> None: @@ -373,7 +373,7 @@ def convert_wrapper( data_types: Union[str, List[str]], output_type: str, input_files: Union[str, List[str]], - output_dir: str = None, + output_dir: str | None = None, overwrite: bool = False, identifier_case: str = "camel", all: bool = False, diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 7cf9e0965..feb7d9f0d 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -30,16 +30,16 @@ def get_data( - language: str = None, - data_type: Union[str, List[str]] = None, - output_type: str = None, - output_dir: str = None, + language: str | None = None, + data_type: str | list[str] | None = None, + output_type: str | None = None, + output_dir: str | None = None, overwrite: bool = False, - outputs_per_entry: int = None, + outputs_per_entry: int | None = None, all_bool: bool = False, interactive: bool = False, identifier_case: str = "camel", - wikidata_dump: str = None, + wikidata_dump: str | None = None, ) -> None: """ Function for controlling the data get process for the CLI. 
diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index e95e45d7c..6c3983cf6 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -50,7 +50,7 @@ def list_languages() -> None: print() -def list_data_types(language: str = None) -> None: +def list_data_types(language: str | None = None) -> None: """ List all data types or those available for a given language. @@ -168,7 +168,7 @@ def list_languages_for_data_type(data_type: str) -> None: def list_wrapper( - language: str = None, data_type: str = None, all_bool: bool = False + language: str | None = None, data_type: str | None = None, all_bool: bool = False ) -> None: """ Conditionally provides the full functionality of the list command. diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index ec9a35e08..b073c72b2 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -350,10 +350,10 @@ def get_total_lexemes( def total_wrapper( - language: Union[str, List[str]] = None, - data_type: Union[str, List[str]] = None, + language: str | list[str] | None = None, + data_type: str | list[str] | None = None, all_bool: bool = False, - wikidata_dump: Union[str, bool] = None, + wikidata_dump: str | bool | None = None, ) -> None: """ Conditionally provides the full functionality of the total command. diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index 09d623fd8..bb0f0e860 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -17,7 +17,7 @@ EMOJI_KEYWORDS_DICT = 9 -def generate_emoji(language: str, output_dir: str = None) -> None: +def generate_emoji(language: str, output_dir: str | None = None) -> None: """ Generate emoji keywords for a specified language. 
diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index 28ec72a9e..b02f958b3 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -7,7 +7,6 @@ import time from collections import Counter, defaultdict from pathlib import Path -from typing import List, Union import orjson import questionary @@ -48,9 +47,10 @@ class LexemeProcessor: def __init__( self, - target_lang: Union[str, List[str]] = None, - parse_type: List[str] = None, - data_types: List[str] = None, + target_lang: str | list[str] | None = None, + parse_type: list[str] | None = None, + data_types: list[str] | None = None, + ) -> None: """ Use to derive information on lexeme dump entries. @@ -589,7 +589,7 @@ def _print_total_summary(self) -> None: # MARK: Export Translations - def export_translations_json(self, filepath: str, language_iso: str = None) -> None: + def export_translations_json(self, filepath: str, language_iso: str | None = None) -> None: """ Save translations_index to file, optionally filtering by language_iso. @@ -658,7 +658,7 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N # MARK: Export Forms def export_forms_json( - self, filepath: str, language_iso: str = None, data_type: str = None + self, filepath: str, language_iso: str | None = None, data_type: str | None = None ) -> None: """ Export grammatical forms to a JSON file with readable feature labels. 
@@ -759,11 +759,11 @@ def export_forms_json( def parse_dump( - language: Union[str, List[str]] = None, - parse_type: List[str] = None, - data_types: List[str] = None, + language: str | list[str] | None = None, + parse_type: list[str] | None = None, + data_types: list[str] | None = None, file_path: str = "latest-lexemes.json.bz2", - output_dir: str = None, + output_dir: str | None = None, overwrite_all: bool = False, ) -> None: """ diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index 1f8ecb407..a92f2268f 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -9,7 +9,6 @@ import subprocess import sys from pathlib import Path -from typing import List from urllib.error import HTTPError from tqdm.auto import tqdm @@ -80,9 +79,9 @@ def execute_formatting_script(output_dir: str, language: str, data_type: str) -> def query_data( - languages: List[str] = None, - data_type: List[str] = None, - output_dir: str = None, + languages: list[str] | None = None, + data_type: list[str] | None = None, + output_dir: str | None = None, overwrite: bool = False, interactive: bool = False, ) -> None: From e0797fb0d855c6c094a0ac0a6abb289347ce6c50 Mon Sep 17 00:00:00 2001 From: ashb155 Date: Tue, 14 Apr 2026 14:23:27 +0530 Subject: [PATCH 2/7] invalid-parameter-default fix 1 --- src/scribe_data/wikidata/wikidata_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index e36568a2c..271ef7087 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -19,11 +19,11 @@ def parse_wd_lexeme_dump( - language: Union[str, List[str]] = None, - wikidata_dump_type: List[str] = None, - data_types: List[str] = None, - type_output_dir: str = None, - wikidata_dump_path: str = None, + language: str | list[str] | None = None, + wikidata_dump_type: 
list[str] | None = None, + data_types: list[str] | None = None, + type_output_dir: str | None = None, + wikidata_dump_path: str | None = None, overwrite_all: bool = False, interactive_mode: bool = False, ) -> None: From 1c6fe95e627484b50d6d71775e67331fd5b4beb2 Mon Sep 17 00:00:00 2001 From: ashb155 Date: Tue, 14 Apr 2026 14:42:38 +0530 Subject: [PATCH 3/7] ruff formatting fix --- src/scribe_data/cli/get.py | 1 - src/scribe_data/cli/total.py | 1 - src/scribe_data/wikidata/wikidata_utils.py | 1 - 3 files changed, 3 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index feb7d9f0d..7471bc91b 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -8,7 +8,6 @@ import urllib.error from http.client import IncompleteRead from pathlib import Path -from typing import List, Union from urllib.error import URLError import questionary diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index b073c72b2..67e85d56c 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -4,7 +4,6 @@ """ from http.client import IncompleteRead -from typing import List, Union from urllib.error import HTTPError from SPARQLWrapper import JSON diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 271ef7087..38f2805e6 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -4,7 +4,6 @@ """ from pathlib import Path -from typing import List, Union from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper From 24b55791878a74709151d755ea9045e54181f6e8 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sun, 3 May 2026 22:21:43 +0200 Subject: [PATCH 4/7] Revert changes to allow for a merge of main --- src/scribe_data/cli/contracts/filter.py | 2 +- src/scribe_data/cli/convert.py | 6 +++--- src/scribe_data/cli/get.py | 13 +++++++------ src/scribe_data/cli/list.py | 4 ++-- 
src/scribe_data/cli/total.py | 7 ++++--- .../unicode/generate_emoji_keywords.py | 2 +- src/scribe_data/wikidata/parse_dump.py | 17 +++++++++++------ src/scribe_data/wikidata/query_data.py | 7 ++++--- src/scribe_data/wikidata/wikidata_utils.py | 11 ++++++----- 9 files changed, 39 insertions(+), 30 deletions(-) diff --git a/src/scribe_data/cli/contracts/filter.py b/src/scribe_data/cli/contracts/filter.py index 85ee9c2ca..41d33b6b4 100644 --- a/src/scribe_data/cli/contracts/filter.py +++ b/src/scribe_data/cli/contracts/filter.py @@ -225,7 +225,7 @@ def filter_exported_data( def export_data_filtered_by_contracts( - contracts_dir: str | None = None, input_dir: str | None = None, output_dir: str | None = None + contracts_dir: str = None, input_dir: str = None, output_dir: str = None ) -> None: """ Export contract-filtered data to a new directory with a standardized structure. diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index c1664c986..a226d6993 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -25,7 +25,7 @@ def convert_to_json( data_type: Union[str, List[str]], output_type: str, input_file: str, - output_dir: str | None = None, + output_dir: str = None, overwrite: bool = False, identifier_case: str = "camel", ) -> None: @@ -180,7 +180,7 @@ def convert_to_csv_or_tsv( data_type: Union[str, List[str]], output_type: str, input_file: str, - output_dir: str | None = None, + output_dir: str = None, overwrite: bool = False, identifier_case: str = "camel", ) -> None: @@ -373,7 +373,7 @@ def convert_wrapper( data_types: Union[str, List[str]], output_type: str, input_files: Union[str, List[str]], - output_dir: str | None = None, + output_dir: str = None, overwrite: bool = False, identifier_case: str = "camel", all: bool = False, diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 7471bc91b..7cf9e0965 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -8,6 +8,7 @@ 
import urllib.error from http.client import IncompleteRead from pathlib import Path +from typing import List, Union from urllib.error import URLError import questionary @@ -29,16 +30,16 @@ def get_data( - language: str | None = None, - data_type: str | list[str] | None = None, - output_type: str | None = None, - output_dir: str | None = None, + language: str = None, + data_type: Union[str, List[str]] = None, + output_type: str = None, + output_dir: str = None, overwrite: bool = False, - outputs_per_entry: int | None = None, + outputs_per_entry: int = None, all_bool: bool = False, interactive: bool = False, identifier_case: str = "camel", - wikidata_dump: str | None = None, + wikidata_dump: str = None, ) -> None: """ Function for controlling the data get process for the CLI. diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 6c3983cf6..e95e45d7c 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -50,7 +50,7 @@ def list_languages() -> None: print() -def list_data_types(language: str | None = None) -> None: +def list_data_types(language: str = None) -> None: """ List all data types or those available for a given language. @@ -168,7 +168,7 @@ def list_languages_for_data_type(data_type: str) -> None: def list_wrapper( - language: str | None = None, data_type: str | None = None, all_bool: bool = False + language: str = None, data_type: str = None, all_bool: bool = False ) -> None: """ Conditionally provides the full functionality of the list command. 
diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 67e85d56c..138b1250b 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -4,6 +4,7 @@ """ from http.client import IncompleteRead +from typing import List, Union from urllib.error import HTTPError from SPARQLWrapper import JSON @@ -349,10 +350,10 @@ def get_total_lexemes( def total_wrapper( - language: str | list[str] | None = None, - data_type: str | list[str] | None = None, + language: Union[str, List[str]] = None, + data_type: Union[str, List[str]] = None, all_bool: bool = False, - wikidata_dump: str | bool | None = None, + wikidata_dump: Union[str, List[str]] = None, ) -> None: """ Conditionally provides the full functionality of the total command. diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index bb0f0e860..09d623fd8 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -17,7 +17,7 @@ EMOJI_KEYWORDS_DICT = 9 -def generate_emoji(language: str, output_dir: str | None = None) -> None: +def generate_emoji(language: str, output_dir: str = None) -> None: """ Generate emoji keywords for a specified language. 
diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index b02f958b3..efa4cb951 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -7,6 +7,7 @@ import time from collections import Counter, defaultdict from pathlib import Path +from typing import List, Union import orjson import questionary @@ -47,10 +48,9 @@ class LexemeProcessor: def __init__( self, - target_lang: str | list[str] | None = None, - parse_type: list[str] | None = None, - data_types: list[str] | None = None, - + target_lang: Union[str, List[str]] = None, + parse_type: list[str] = None, + data_types: list[str] = None, ) -> None: """ Use to derive information on lexeme dump entries. @@ -589,7 +589,9 @@ def _print_total_summary(self) -> None: # MARK: Export Translations - def export_translations_json(self, filepath: str, language_iso: str | None = None) -> None: + def export_translations_json( + self, filepath: str, language_iso: str | None = None + ) -> None: """ Save translations_index to file, optionally filtering by language_iso. @@ -658,7 +660,10 @@ def export_translations_json(self, filepath: str, language_iso: str | None = Non # MARK: Export Forms def export_forms_json( - self, filepath: str, language_iso: str | None = None, data_type: str | None = None + self, + filepath: str, + language_iso: str | None = None, + data_type: str | None = None, ) -> None: """ Export grammatical forms to a JSON file with readable feature labels. 
diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index a92f2268f..1f8ecb407 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -9,6 +9,7 @@ import subprocess import sys from pathlib import Path +from typing import List from urllib.error import HTTPError from tqdm.auto import tqdm @@ -79,9 +80,9 @@ def execute_formatting_script(output_dir: str, language: str, data_type: str) -> def query_data( - languages: list[str] | None = None, - data_type: list[str] | None = None, - output_dir: str | None = None, + languages: List[str] = None, + data_type: List[str] = None, + output_dir: str = None, overwrite: bool = False, interactive: bool = False, ) -> None: diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 38f2805e6..e36568a2c 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -4,6 +4,7 @@ """ from pathlib import Path +from typing import List, Union from rich import print as rprint from SPARQLWrapper import JSON, POST, SPARQLWrapper @@ -18,11 +19,11 @@ def parse_wd_lexeme_dump( - language: str | list[str] | None = None, - wikidata_dump_type: list[str] | None = None, - data_types: list[str] | None = None, - type_output_dir: str | None = None, - wikidata_dump_path: str | None = None, + language: Union[str, List[str]] = None, + wikidata_dump_type: List[str] = None, + data_types: List[str] = None, + type_output_dir: str = None, + wikidata_dump_path: str = None, overwrite_all: bool = False, interactive_mode: bool = False, ) -> None: From d586cc286cdd18f492c76ab88c21e5ae8d08520b Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sun, 3 May 2026 22:23:52 +0200 Subject: [PATCH 5/7] Revert changes to allow for a merge of main - second commit --- src/scribe_data/cli/total.py | 2 +- src/scribe_data/wikidata/parse_dump.py | 20 +++++++++----------- 2 files changed, 10 
insertions(+), 12 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 138b1250b..ec9a35e08 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -353,7 +353,7 @@ def total_wrapper( language: Union[str, List[str]] = None, data_type: Union[str, List[str]] = None, all_bool: bool = False, - wikidata_dump: Union[str, List[str]] = None, + wikidata_dump: Union[str, bool] = None, ) -> None: """ Conditionally provides the full functionality of the total command. diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index efa4cb951..1b5425855 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -49,8 +49,8 @@ class LexemeProcessor: def __init__( self, target_lang: Union[str, List[str]] = None, - parse_type: list[str] = None, - data_types: list[str] = None, + parse_type: List[str] = None, + data_types: List[str] = None, ) -> None: """ Use to derive information on lexeme dump entries. @@ -589,9 +589,7 @@ def _print_total_summary(self) -> None: # MARK: Export Translations - def export_translations_json( - self, filepath: str, language_iso: str | None = None - ) -> None: + def export_translations_json(self, filepath: str, language_iso: str = None) -> None: """ Save translations_index to file, optionally filtering by language_iso. @@ -662,8 +660,8 @@ def export_translations_json( def export_forms_json( self, filepath: str, - language_iso: str | None = None, - data_type: str | None = None, + language_iso: str = None, + data_type: str = None, ) -> None: """ Export grammatical forms to a JSON file with readable feature labels. 
@@ -764,11 +762,11 @@ def export_forms_json( def parse_dump( - language: str | list[str] | None = None, - parse_type: list[str] | None = None, - data_types: list[str] | None = None, + language: Union[str, List[str]] = None, + parse_type: List[str] = None, + data_types: List[str] = None, file_path: str = "latest-lexemes.json.bz2", - output_dir: str | None = None, + output_dir: str = None, overwrite_all: bool = False, ) -> None: """ From 708fdc860fcee54b0dfdf4c2f3ee4bed16481778 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sun, 3 May 2026 22:26:13 +0200 Subject: [PATCH 6/7] Remove all ty warnings --- pyproject.toml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e7f19dc4a..72ee697e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,18 +122,6 @@ omit = ["*/tests/*", "*/__init__.py"] [tool.ty.src] include = ["src"] -[tool.ty.rules] -unresolved-import = "warn" -invalid-return-type = "warn" -invalid-argument-type = "warn" -invalid-assignment = "warn" -unsupported-operator = "warn" -unresolved-attribute = "warn" -missing-argument = "warn" -unknown-argument = "warn" -not-subscriptable = "warn" -possibly-missing-submodule = "warn" - [tool.numpydoc_validation] checks = [ "all", # report on all checks, except the below From 36ff3ec81a5eb986d1ee826fa2c24c47118332b0 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sun, 3 May 2026 22:37:24 +0200 Subject: [PATCH 7/7] Misc docstring fixes --- .../check_missing_forms.py | 16 +++++------ .../check_missing_forms/normalize_forms.py | 12 ++++---- .../check/check_project_metadata.py | 9 +++--- src/scribe_data/check/check_pyicu.py | 6 ++-- src/scribe_data/check/check_query_forms.py | 12 ++++---- src/scribe_data/cli/convert.py | 6 ++-- src/scribe_data/cli/download.py | 6 ++-- src/scribe_data/cli/get.py | 8 +++--- src/scribe_data/cli/interactive.py | 2 +- src/scribe_data/cli/total.py | 2 +- src/scribe_data/load/data_to_sqlite.py | 28 +++++++++---------- 
src/scribe_data/unicode/process_unicode.py | 4 +-- src/scribe_data/unicode/unicode_utils.py | 4 ++- src/scribe_data/utils.py | 8 +++--- src/scribe_data/wikidata/check_query/check.py | 24 ++++++++-------- src/scribe_data/wikidata/parse_dump.py | 12 ++++---- src/scribe_data/wikidata/query_data.py | 6 ++-- .../wiktionary/parse_translations.py | 2 +- tests/cli/contracts/test_contracts_export.py | 3 +- tests/wikidata/test_check_query.py | 10 +++---- 20 files changed, 93 insertions(+), 87 deletions(-) diff --git a/src/scribe_data/check/check_missing_forms/check_missing_forms.py b/src/scribe_data/check/check_missing_forms/check_missing_forms.py index cf146c70e..89ef5ad08 100644 --- a/src/scribe_data/check/check_missing_forms/check_missing_forms.py +++ b/src/scribe_data/check/check_missing_forms/check_missing_forms.py @@ -53,8 +53,8 @@ def execute_sparql_query(query: str, max_retries: int = 3) -> Optional[list]: query : str The SPARQL query to execute. - max_retries : int, optional - Maximum number of retry attempts (default: 3). + max_retries : int, optional, default=3 + Maximum number of retry attempts. Returns ------- @@ -183,8 +183,8 @@ def get_forms_from_sparql_service( Minimum frequency threshold for including form combinations. Default is 0 (include all combinations). - max_results : int, optional - Maximum number of results to return (default: 1000). + max_results : int, optional, default=1000 + Maximum number of results to return. Helps prevent timeout for very large datasets. language_name : str, optional @@ -288,8 +288,8 @@ def get_forms_from_sparql_service_all_languages( Minimum frequency threshold for including form combinations. Default is 0 (include all combinations). - max_results : int, optional - Maximum results per query to prevent timeouts (default: 1000). + max_results : int, optional, default=1000 + Maximum results per query to prevent timeouts. 
Returns ------- @@ -384,8 +384,8 @@ def get_features_from_sparql_service( Minimum frequency threshold for including form combinations. Default is 0 (include all combinations). - max_results : int, optional - Maximum results per query to prevent timeouts (default: 1000). + max_results : int, optional, default=1000 + Maximum results per query to prevent timeouts. Returns ------- diff --git a/src/scribe_data/check/check_missing_forms/normalize_forms.py b/src/scribe_data/check/check_missing_forms/normalize_forms.py index 517777e0b..c99f6e2fe 100644 --- a/src/scribe_data/check/check_missing_forms/normalize_forms.py +++ b/src/scribe_data/check/check_missing_forms/normalize_forms.py @@ -3,11 +3,13 @@ Order QID from a missing_unique_forms based on lexeme_form_metadata.yaml. """ +from typing import List + from scribe_data.utils import lexeme_form_metadata # Precompute QID positions mapping only once when the module is imported. -def sort_qids_in_list(qids_lists: list[list[str]]) -> list[list[str]]: +def sort_qids_in_list(qids_lists: List[List[str]]) -> List[List[str]]: """ Sort QIDs within each sublist based on their predefined positions. @@ -16,7 +18,7 @@ def sort_qids_in_list(qids_lists: list[list[str]]) -> list[list[str]]: Parameters ---------- - qids_lists : list[list[str]] + qids_lists : List[List[str]] A list of lists, where each sublist contains QIDs. Returns @@ -37,17 +39,17 @@ def sort_qids_in_list(qids_lists: list[list[str]]) -> list[list[str]]: ] -def sort_qids_by_position(nested_qids: list[list[str]]) -> list[list[str]]: +def sort_qids_by_position(nested_qids: List[List[str]]) -> List[List[str]]: """ Sort lists of QIDs based on their predefined positions and sublist length. - This function sorts the sublists within `nested_qids` based on two criteria: + This function sorts the sub-lists within `nested_qids` based on two criteria: 1. The length of the sublist (shorter lists come first). 2. 
The positions of the QIDs within each sublist, as defined in `lexeme_form_metadata`. Parameters ---------- - nested_qids : list[list[str]] + nested_qids : List[List[str]] A list of lists, where each sublist contains QIDs. Returns diff --git a/src/scribe_data/check/check_project_metadata.py b/src/scribe_data/check/check_project_metadata.py index 2529339af..f1cb569a5 100644 --- a/src/scribe_data/check/check_project_metadata.py +++ b/src/scribe_data/check/check_project_metadata.py @@ -9,6 +9,7 @@ import difflib import sys +from typing import List from scribe_data.utils import ( WIKIDATA_QUERIES_ALL_DATA_DIR, @@ -19,13 +20,13 @@ all_data_types = tuple(data_type_metadata.keys()) -def get_available_languages() -> dict[str, list[str]]: +def get_available_languages() -> dict[str, List[str]]: """ Get available languages from the data extraction folder. Returns ------- - dict[str, list[str]] + dict[str, List[str]] A dictionary with the language name as the key and a list of its sub-languages (if available). """ available_languages = {} @@ -64,7 +65,7 @@ def get_available_languages() -> dict[str, list[str]]: def get_missing_languages( reference_languages: dict, target_languages: dict -) -> list[str]: +) -> List[str]: """ Compare two language dictionaries and return a list of languages and sub-languages that exist. @@ -78,7 +79,7 @@ def get_missing_languages( Returns ------- - list[str] + List[str] A list of languages and sub-languages that are in target_languages but not in reference_languages. 
""" missing_languages = [] diff --git a/src/scribe_data/check/check_pyicu.py b/src/scribe_data/check/check_pyicu.py index 92f590851..e7b37dedf 100644 --- a/src/scribe_data/check/check_pyicu.py +++ b/src/scribe_data/check/check_pyicu.py @@ -13,7 +13,7 @@ import subprocess import sys from pathlib import Path -from typing import Optional +from typing import List, Optional import questionary import requests @@ -55,7 +55,7 @@ def get_python_version_and_architecture() -> tuple[str, str]: return python_version, architecture -def fetch_wheel_releases() -> tuple[list[tuple[str, str]], float]: +def fetch_wheel_releases() -> tuple[List[tuple[str, str]], float]: """ Fetch the release data for PyICU from GitHub with error handling for rate limits. @@ -114,7 +114,7 @@ def download_wheel_file(wheel_url: str, output_dir: Path) -> str: def find_matching_wheel( - wheels: list[tuple[str, str]], python_version: str, architecture: str + wheels: List[tuple[str, str]], python_version: str, architecture: str ) -> Optional[str]: """ Find the matching wheel file based on Python version and architecture. diff --git a/src/scribe_data/check/check_query_forms.py b/src/scribe_data/check/check_query_forms.py index a56f207d6..87743703d 100644 --- a/src/scribe_data/check/check_query_forms.py +++ b/src/scribe_data/check/check_query_forms.py @@ -135,7 +135,7 @@ def decompose_label_features(label: str) -> list: # MARK: Extract QIDs -def extract_form_qids(form_text: str) -> Optional[list[str]]: +def extract_form_qids(form_text: str) -> Optional[List[str]]: """ Extract all QIDs from an optional query form. @@ -146,7 +146,7 @@ def extract_form_qids(form_text: str) -> Optional[list[str]]: Returns ------- - list[str] + Optional[List[str]] All QIDS that make up the form. """ qids_pattern = r"wikibase:grammaticalFeature .+ \." 
@@ -224,13 +224,13 @@ def check_query_formatting(form_text: str) -> bool: # MARK: Correct Label -def return_correct_form_label(qids: list[str]) -> str: +def return_correct_form_label(qids: List[str]) -> str: """ Return the correct label for a lexeme form representation given the QIDs that compose it. Parameters ---------- - qids : list[str] + qids : List[str] All QIDS that make up the form. Returns @@ -447,12 +447,12 @@ def compare_key(components: List[str]) -> List[str | int | float]: Parameters ---------- - components : list[str] + components : List[str] The components that can make up the form identifier. Returns ------- - list[str] + List[str | int | float] The list of component parts to compare against. """ return [order_map.get(c, float("inf")) for c in components] diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 219211884..473c019fb 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -397,13 +397,13 @@ def convert_wrapper( output_type : str The desired output format. Can be 'json', 'csv', 'tsv', or 'sqlite'. - overwrite : bool, optional (default=False) + overwrite : bool, optional, default=False Whether to overwrite existing output files. - identifier_case : str, optional (default='camel') + identifier_case : str, optional, default='camel' The case format for identifiers. - all : bool, optional (default=False) + all : bool, optional, default=False Convert all languages and data types. 
Returns diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py index f89cfed6f..708469789 100644 --- a/src/scribe_data/cli/download.py +++ b/src/scribe_data/cli/download.py @@ -9,7 +9,7 @@ from collections.abc import Callable from datetime import date, datetime from pathlib import Path -from typing import Optional +from typing import List, Optional import questionary import requests @@ -298,7 +298,7 @@ def wd_lexeme_dump_download_wrapper( def download_wiktionary_dumps( output_dir: Path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, - language_isos: list[str] = ["en"], + language_isos: List[str] = ["en"], dump_snapshot: Optional[str] = "latest", ) -> Optional[Path]: """ @@ -309,7 +309,7 @@ def download_wiktionary_dumps( output_dir : Path, optional, default=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR Directory to save the dump. Defaults to DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR. - language_isos : list[str], optional, default=['en'] + language_isos : List[str], optional, default=['en'] A list of ISO-2 codes for desired Wiktionary dumps. dump_snapshot : str, optional, default='latest' diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 20715c70b..1f39b6314 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -48,10 +48,10 @@ def get_data( Parameters ---------- - languages : list[str] + languages : List[str] The language(s) to get. - data_types : list[str] + data_types : List[str] The data type(s) to get. output_type : str @@ -60,7 +60,7 @@ def get_data( output_dir : Path The output directory path for results. - overwrite : bool (default: False) + overwrite : bool, default=False Whether to overwrite existing files. outputs_per_entry : str @@ -69,7 +69,7 @@ def get_data( all_bool : bool Get all languages and data types. - interactive : bool (default: False) + interactive : bool, default=False Whether it's running in interactive mode.
identifier_case : str diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index bddfecdf8..6d1996919 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -103,7 +103,7 @@ def create_word_completer( Parameters ---------- - options : list[str] + options : List[str] The options that could complete the current input. include_all : bool diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 026bfa42d..336040a31 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -58,7 +58,7 @@ def get_datatype_list(language: str) -> list | dict: Returns ------- - list[str] + list | dict A list of the corresponding data types. """ language_key = language.strip().lower() # normalize input diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 63cb759c6..5b3a3d345 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -25,7 +25,7 @@ def create_table( - cursor: sqlite3.Cursor, identifier_case: str, data_type: str, cols: list[str] + cursor: sqlite3.Cursor, identifier_case: str, data_type: str, cols: List[str] ) -> None: """ Create a table in the language database. @@ -102,10 +102,10 @@ def translations_to_sqlite( A list of current languages. identifier_case : str, optional The identifier case. Default is "snake". - input_file : str, optional - The input JSON export directory (default=DEFAULT_JSON_EXPORT_DIR). - output_file : str, optional - The output SQLite export directory (default=DEFAULT_SQLITE_EXPORT_DIR). + input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR + The input JSON export directory. + output_file : str, optional, default=DEFAULT_SQLITE_EXPORT_DIR + The output SQLite export directory. overwrite : bool, optional If True, existing SQLite files will be overwritten without prompting. 
""" @@ -195,11 +195,11 @@ def wiktionary_translations_to_sqlite( identifier_case : str, optional Either "camel" or "snake" to determine column naming. Default is "snake". - input_file : str, optional - The input JSON export directory (default=DEFAULT_JSON_EXPORT_DIR). + input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR + The input JSON export directory. - output_file : str, optional - The output SQLite export directory (default=DEFAULT_SQLITE_EXPORT_DIR). + output_file : str, optional, default=DEFAULT_SQLITE_EXPORT_DIR + The output SQLite export directory. overwrite : bool, optional If True, existing SQLite files will be overwritten without prompting. @@ -296,14 +296,14 @@ def data_to_sqlite( specific_tables : Optional[Union[str, List[str]]] The specific tables to process. If None, process all tables. - identifier_case : str, optional (default='camel') + identifier_case : str, optional, default='camel' Format of the identifiers ("camel" or "snake"). Defaults to "camel". - input_file : str, optional - The input JSON export directory (default=DEFAULT_JSON_EXPORT_DIR). + input_file : str, optional, default=DEFAULT_JSON_EXPORT_DIR + The input JSON export directory. - output_file : str, optional - The output SQLite export directory (default=DEFAULT_SQLITE_EXPORT_DIR). + output_file : str, optional, default=DEFAULT_SQLITE_EXPORT_DIR + The output SQLite export directory. overwrite : bool, optional If set to True, existing SQLite files will be overwritten without prompting. diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index c00aaa367..2c3874cc1 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -34,10 +34,10 @@ def gen_emoji_lexicon( Parameters ---------- - language : str (default=None) + language : str The language keywords are being generated for. 
- emojis_per_keyword : int (default=None) + emojis_per_keyword : int The limit for number of emoji keywords that should be generated per keyword. Returns diff --git a/src/scribe_data/unicode/unicode_utils.py b/src/scribe_data/unicode/unicode_utils.py index efa765b8f..1dead44dd 100644 --- a/src/scribe_data/unicode/unicode_utils.py +++ b/src/scribe_data/unicode/unicode_utils.py @@ -3,6 +3,8 @@ Module for a function to get emojis we want to filter from suggestions. """ +from typing import List + # See: https://getemoji.com/ def get_emojis_to_ignore() -> str: @@ -25,7 +27,7 @@ def get_emojis_to_ignore() -> str: """ -def get_emoji_codes_to_ignore() -> list[bytes]: +def get_emoji_codes_to_ignore() -> List[bytes]: """ Return a list of emojis codes based on relationships that we want to remove from emoji suggestions. diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index eae040a15..5bb1f727e 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -10,7 +10,7 @@ from datetime import datetime from importlib import resources from pathlib import Path -from typing import Any, Optional +from typing import Any, List, Optional import questionary import requests @@ -466,7 +466,7 @@ def format_sublanguage_name(lang: str, language_metadata: dict = _languages) -> raise ValueError(f"{lang.capitalize()} is not a valid language or sub-language.") -def list_all_languages(language_metadata: dict = _languages) -> list[str]: +def list_all_languages(language_metadata: dict = _languages) -> List[str]: """ Return a sorted list of all languages and sub-languages from the provided metadata dictionary. @@ -499,7 +499,7 @@ def list_all_languages(language_metadata: dict = _languages) -> list[str]: def list_languages_with_metadata_for_data_type( language_metadata: dict = _languages, -) -> list[dict]: +) -> List[dict]: """ Return a sorted list of languages and their metadata (name, iso, qid) for a specific data type. 
@@ -665,7 +665,7 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool: index_path : pathlib.Path The path to check. - overwrite_all : cool (default=False) + overwrite_all : bool, default=False If True, automatically overwrite without prompting. Returns diff --git a/src/scribe_data/wikidata/check_query/check.py b/src/scribe_data/wikidata/check_query/check.py index 3f22ecfde..938fe80a2 100644 --- a/src/scribe_data/wikidata/check_query/check.py +++ b/src/scribe_data/wikidata/check_query/check.py @@ -11,7 +11,7 @@ import urllib.request from http import HTTPStatus from pathlib import Path -from typing import Optional +from typing import List, Optional from urllib.error import HTTPError from tqdm.auto import tqdm @@ -50,20 +50,20 @@ def ping(url: str, timeout: int) -> bool: return False -def all_queries() -> list[QueryFile]: +def all_queries() -> List[QueryFile]: """ All the SPARQL queries in, and below, 'Scribe-Data/'. Returns ------- - list[QueryFile] + List[QueryFile] List of SPARQL query files. """ parts = Path(__file__).resolve().parts prj_root_idx = parts.index(PROJECT_ROOT) prj_root = str(Path(*parts[: prj_root_idx + 1])) - queries: list[QueryFile] = [] + queries: List[QueryFile] = [] for root, _, files in os.walk(prj_root): for f in files: @@ -74,7 +74,7 @@ def all_queries() -> list[QueryFile]: return queries -def changed_queries() -> Optional[list[QueryFile]]: +def changed_queries() -> Optional[List[QueryFile]]: """ Find all the SPARQL queries that have changed. @@ -82,7 +82,7 @@ def changed_queries() -> Optional[list[QueryFile]]: Returns ------- - Optional[list[QueryFile]] + Optional[List[QueryFile]] List of changed/new SPARQL queries, or None if there's an error. """ result = subprocess.run( @@ -208,13 +208,13 @@ def check_timeout(timeout: str) -> int: ) -def main(argv: Optional[list[str]] = None) -> int: +def main(argv: Optional[List[str]] = None) -> int: """ The main function.
Parameters ---------- - argv : None (default=None) + argv : Optional[List[str]], default=None If set to None then argparse will use sys.argv as the arguments. Returns @@ -339,13 +339,13 @@ def main(argv: Optional[list[str]] = None) -> int: return EXIT_FAILURE if failures else EXIT_SUCCESS -def error_report(failures: list[QueryExecutionException]) -> None: +def error_report(failures: List[QueryExecutionException]) -> None: """ Report failed queries. Parameters ---------- - failures : list[QueryExecutionException] + failures : List[QueryExecutionException] Failed queries. """ if not failures: @@ -357,13 +357,13 @@ def error_report(failures: list[QueryExecutionException]) -> None: print(failed_query, file=sys.stderr) -def success_report(successes: list[tuple[QueryFile, dict]], display: bool) -> None: +def success_report(successes: List[tuple[QueryFile, dict]], display: bool) -> None: """ Report successful queries. Parameters ---------- - successes : list[tuple[QueryFile, dict]] + successes : List[tuple[QueryFile, dict]] Successful queries. display : bool diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index cfd7f68d9..52cb7ddd5 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -33,15 +33,15 @@ class LexemeProcessor: Parameters ---------- - target_lang : str, list[str] + target_lang : str, List[str] The language or languages to process lexemes for. - parse_type : list[str] + parse_type : List[str] Can be any combination of: - 'form' - 'total' - data_types : list[str] + data_types : List[str] A list of categories (e.g., ["nouns", "adverbs"]) for forms. """ @@ -56,15 +56,15 @@ def __init__( Parameters ---------- - target_lang : str, list[str] + target_lang : str, List[str] The language or languages to process lexemes for. 
- parse_type : list[str] + parse_type : List[str] Can be any combination of: - 'form' - 'total' - data_types : list[str] + data_types : List[str] A list of categories (e.g., ["nouns", "adverbs"]) for forms. """ # Pre-compute sets for faster lookups. diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index ab0e5b199..84f6afa56 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -92,16 +92,16 @@ def query_data( Parameters ---------- - languages : list[str] + languages : List[str] The language(s) to get. - data_types : list[str] + data_types : List[str] The data type(s) to get. output_dir : Path The output directory path for results. - overwrite : bool (default: False) + overwrite : bool, default=False Whether to overwrite existing files. interactive : bool, default=False diff --git a/src/scribe_data/wiktionary/parse_translations.py b/src/scribe_data/wiktionary/parse_translations.py index c2bab181a..3769233cf 100644 --- a/src/scribe_data/wiktionary/parse_translations.py +++ b/src/scribe_data/wiktionary/parse_translations.py @@ -1178,7 +1178,7 @@ def parse_wiktionary_translations( wiktionary_dump_path : str or Path, optional Path to a ``*wiktionary-*-pages-articles.xml.bz2`` dump file. - output_dir : Path, optional (default=DEFAULT_WIKTIONARY_JSON_EXPORT_DIR) + output_dir : Path, optional, default=DEFAULT_WIKTIONARY_JSON_EXPORT_DIR Directory where JSON files are saved.
overwrite : bool, default ``False`` diff --git a/tests/cli/contracts/test_contracts_export.py b/tests/cli/contracts/test_contracts_export.py index cd617f5ef..0ff31a7a4 100644 --- a/tests/cli/contracts/test_contracts_export.py +++ b/tests/cli/contracts/test_contracts_export.py @@ -4,6 +4,7 @@ """ from pathlib import Path +from typing import List from unittest.mock import MagicMock, call, mock_open, patch from scribe_data.cli.contracts.filter import ( @@ -306,7 +307,7 @@ def exists_side_effect() -> bool: mock_filtered_verbs, ] * 2 # for both languages - def mock_path_glob(self: Path, pattern: str) -> list[Path]: + def mock_path_glob(self: Path, pattern: str) -> List[Path]: """ Mock glob method that returns files based on the path. """ diff --git a/tests/wikidata/test_check_query.py b/tests/wikidata/test_check_query.py index 882f844d8..de1105295 100755 --- a/tests/wikidata/test_check_query.py +++ b/tests/wikidata/test_check_query.py @@ -7,7 +7,7 @@ import re from http import HTTPStatus from pathlib import Path -from typing import Any +from typing import Any, List from unittest.mock import MagicMock, mock_open, patch from urllib.error import HTTPError @@ -163,7 +163,7 @@ def test_check_sparql_file_not_sparql_extension(_: MagicMock) -> None: ) @patch("subprocess.run") def test_changed_queries( - mock_run: MagicMock, git_status: str, expected: list[Any] + mock_run: MagicMock, git_status: str, expected: List[Any] ) -> None: mock_result = MagicMock() mock_result.configure_mock(**{"returncode": 0, "stdout": git_status}) @@ -208,7 +208,7 @@ def test_changed_queries_failure( ), ], ) -def test_all_queries(tree: list[Any], expected: list[Any]) -> None: +def test_all_queries(tree: List[Any], expected: List[Any]) -> None: with patch("os.walk") as mock_walk: mock_walk.return_value = tree @@ -304,7 +304,7 @@ def test_main_help(arg: str) -> None: ["-c", "-f", "-a"], ], ) -def test_main_mutex_opts(args: list[str]) -> None: +def test_main_mutex_opts(args: List[str]) -> None: """ Some 
options cannot be used together. """ @@ -366,7 +366,7 @@ def test_success_report_no_success_display_set(capsys: pytest.CaptureFixture) -> [[], [(a_query, {"a": 23})], [(a_query, {"a": 23}), (a_query, {"b": 53})]], ) def test_success_report_display_not_set( - successes: list[Any], capsys: pytest.CaptureFixture + successes: List[Any], capsys: pytest.CaptureFixture ) -> None: success_report(successes, display=False) out = capsys.readouterr().out