-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlang_utils.py
More file actions
104 lines (79 loc) · 3.06 KB
/
lang_utils.py
File metadata and controls
104 lines (79 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Language detection and mapping utilities.
"""
import langdetect
from langdetect import DetectorFactory
from typing import Optional
import logging
import config
# Pin langdetect's seed once at import time so that repeated detection calls
# on the same text give the same answer.
DetectorFactory.seed = 0 # Make detection deterministic
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def detect_language(
    text: str,
    *,
    min_length: int = 15,
    fallback: Optional[str] = "en",
) -> Optional[str]:
    """
    Detect the language of the input text.

    Text whose stripped length is below ``min_length`` is not passed to
    langdetect at all; ``fallback`` is returned for it instead.

    Args:
        text: Input text to detect language for
        min_length: Minimum number of characters (after stripping
            surrounding whitespace) required before detection is attempted.
            Defaults to 15.
        fallback: Value returned for text shorter than ``min_length``.
            Defaults to 'en'; pass None to report short text as undetected.

    Returns:
        The language code reported by langdetect (e.g., 'hi', 'en', 'ta'),
        ``fallback`` if the text is too short, or None if detection fails.
    """
    # Too short to detect reliably: skip the detector entirely.
    if len(text.strip()) < min_length:
        return fallback

    try:
        # The detector's answer is returned unchanged.
        return langdetect.detect(text)
    except Exception as exc:
        # Best effort by design: any detector failure is logged and reported
        # to the caller as None rather than propagated.
        logger.warning("Language detection failed: %s", exc)
        return None
def get_language_name(lang_code: str) -> str:
    """
    Look up the display name for a language code.

    Args:
        lang_code: ISO 639-1 language code (e.g., 'hi', 'en')

    Returns:
        The entry stored for ``lang_code`` in ``config.LANGUAGE_NAMES``
        (e.g., 'हिंदी', 'English'). If the code has no entry, the code
        itself is returned unchanged.
    """
    name_by_code = config.LANGUAGE_NAMES
    # Unknown codes fall back to the code itself.
    return name_by_code.get(lang_code, lang_code)
def is_indic_language(lang_code: str) -> bool:
    """
    Tell whether a language code is listed as an Indian language.

    Args:
        lang_code: ISO 639-1 language code

    Returns:
        True if ``lang_code`` is a member of ``config.INDIC_LANGUAGES``,
        False otherwise
    """
    indic_codes = config.INDIC_LANGUAGES
    return lang_code in indic_codes
def normalize_text(text: str) -> str:
    """
    Collapse whitespace in the given text.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    and leading/trailing whitespace is removed.

    Args:
        text: Input text to normalize

    Returns:
        Normalized text; an empty string if the input has no
        non-whitespace characters
    """
    # split() with no separator discards leading, trailing and repeated
    # whitespace, so re-joining the pieces yields the fully trimmed result.
    words = text.split()
    return " ".join(words)
if __name__ == "__main__":
    # Manual smoke test: run each sample through the helpers above and print
    # the expected code next to what was detected.
    samples = {
        "hi": "क्या यह दवा डायबिटीज़ के इलाज में मदद करती है?",
        "ta": "இந்த மருந்து நீரிழிவு நோய்க்கு உதவுமா?",
        "en": "How does this drug help with diabetes treatment?",
        "mr": "हे औषध मधुमेहाच्या उपचारात कसे मदत करते?",
        "te": "ఈ మందు మధుమేహ చికిత్సలో ఎలా సహాయపడుతుంది?",
    }
    separator = "-" * 60

    print("Language Detection Tests:")
    print(separator)

    for expected_code, sample in samples.items():
        found = detect_language(sample)
        if found:
            display_name = get_language_name(found)
            indic = is_indic_language(found)
        else:
            # Nothing detected: skip the lookups and report placeholders.
            display_name = "Unknown"
            indic = False

        print(f"Text: {sample[:50]}...")
        print(f"Expected: {expected_code}, Detected: {found}")
        print(f"Language Name: {display_name}")
        print(f"Is Indic: {indic}")
        print(separator)