Coverage for potnia/script.py: 100.00% (58 statements)
import re
from functools import reduce
from pathlib import Path
from dataclasses import dataclass

from .data import read_data
@dataclass
class Script():
    """
    The abstract base class for handling text transliteration and Unicode conversion.

    Attributes:
        config (str | Path | dict): Path to the configuration file, or configuration data in YAML format.
    """
    config: str

    def __post_init__(self):
        """Initializes configuration and sets up mappings, patterns, and regularization rules."""
        if isinstance(self.config, (Path, str)):
            self.config = read_data(self.config)
        assert self.config, "Configuration not found"

        self.transliteration_to_unicode_dict = self.config.get('mappings', {})

        # Build the reverse mapping; if several transliterations map to the
        # same glyph, the first one encountered wins
        self.unicode_to_transliteration_dict = {}
        for k, v in self.transliteration_to_unicode_dict.items():
            if v not in self.unicode_to_transliteration_dict:
                self.unicode_to_transliteration_dict[v] = k

        # Load patterns to ignore
        patterns_to_ignore = self.config.get('patterns_to_ignore', [])
        self.regex_to_ignore = [re.compile(pattern) for pattern in patterns_to_ignore]

        # Load regularization rules, collapsing double-escaped backslashes
        # (e.g. a literal '\\d' read from YAML becomes '\d') before compiling
        self.regularization_regex = [
            (re.compile(re.sub(r'\\\\', r'\\', pattern)), replacement)
            for pattern, replacement in self.config.get('regularization', [])
        ]

        # Load transliteration rules
        self.transliteration_patterns = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.config.get('tokenization', [])
        ]
        self.complex_symbols = self.config.get('complex_symbols', {})
        self.special_chars_pattern = re.compile(self.config.get('special_chars_pattern', ''))
        self.restore_patterns = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.config.get('restore_patterns', [])
        ]

        # Reverse the complex_symbols dictionary
        self.reversed_symbols = {v: k for k, v in self.complex_symbols.items()}
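
    # Illustrative sketch only (not part of the original module): a minimal,
    # hypothetical config dict is enough to construct a Script directly, which
    # skips read_data entirely. Real configs ship with the package as YAML files.
    #
    #     >>> script = Script(config={"mappings": {"ko": "K", "no": "N"},
    #     ...                         "special_chars_pattern": "-"})
    #     >>> script.unicode_to_transliteration_dict
    #     {'K': 'ko', 'N': 'no'}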

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes unicode text according to specific patterns.

        By default, it tokenizes each character as a separate token.
        This method can be overridden in subclasses to provide more complex tokenization.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of tokens.
        """
        return list(text)
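
    # Sketch of the default behaviour (using the hypothetical `script` above):
    # every character becomes its own token.
    #
    #     >>> script.tokenize_unicode("KN")
    #     ['K', 'N']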

    def tokenize_transliteration(self, text: str) -> list[str]:
        """
        Tokenizes transliterated text according to specific patterns.

        Args:
            text (str): Input text in transliterated format.

        Returns:
            list[str]: List of tokens.
        """
        # Replace complex symbols with placeholders
        for symbol, placeholder in self.complex_symbols.items():
            text = text.replace(symbol, placeholder)

        # Apply each pattern replacement in order
        for pattern, replacement in self.transliteration_patterns:
            text = pattern.sub(replacement, text)

        # Handle space replacement with a placeholder (a Private Use Area codepoint)
        space_placeholder = "\uE000"
        text = text.replace(" ", space_placeholder)

        # Tokenize using the special characters pattern
        tokens = self.special_chars_pattern.split(text)

        # Apply the restore patterns to each token and filter out empty tokens
        tokenized = [
            " " if tok == space_placeholder else
            reduce(lambda t, p: p[0].sub(p[1], t), self.restore_patterns, tok)
            for tok in tokens if tok and tok != "-"
        ]

        # Restore complex symbols using the reversed dictionary
        for placeholder, symbol in self.reversed_symbols.items():
            tokenized = [tok.replace(placeholder, symbol) for tok in tokenized]

        return tokenized if tokenized else [""]
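
    # Sketch (hypothetical `script` from above): the special_chars_pattern "-"
    # splits the text on hyphens, which are discarded.
    #
    #     >>> script.tokenize_transliteration("ko-no")
    #     ['ko', 'no']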

    def to_transliteration(self, text: str) -> str:
        """
        Converts unicode text to transliteration format.

        NB. This function may not work as expected for all scripts/languages
        because there may not be a one-to-one mapping between unicode and transliteration.

        Args:
            text (str): Input text in unicode format.

        Returns:
            str: Transliterated text.
        """
        tokens = self.tokenize_unicode(text)
        return "".join(
            [
                self.unicode_to_transliteration_dict.get(token, token)
                for token in tokens
            ]
        )
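
    # Sketch: tokens are joined back directly, so separators such as hyphens
    # are not reinserted (the caveat in the docstring above).
    #
    #     >>> script.to_transliteration("KN")
    #     'kono'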

    def to_unicode(self, text: str, regularize: bool = False) -> str:
        """
        Converts transliterated text to unicode format.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Whether to apply regularization. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        tokens = self.tokenize_transliteration(text)
        result = "".join([self.transliteration_to_unicode_dict.get(token, token) for token in tokens])
        if regularize:
            result = self.regularize(result)
        return result
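
    # Sketch: each token is looked up in the mappings; unknown tokens pass
    # through unchanged.
    #
    #     >>> script.to_unicode("ko-no")
    #     'KN'
    #     >>> script.to_unicode("ko-xx")
    #     'Kxx'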

    def __call__(self, text: str, regularize: bool = False) -> str:
        """
        Allows the class instance to be called as a function for unicode conversion.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Whether to apply regularization. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        return self.to_unicode(text, regularize=regularize)
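
    # Sketch: calling the instance is shorthand for to_unicode.
    #
    #     >>> script("ko-no")
    #     'KN'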

    def regularize(self, string: str) -> str:
        """
        Applies regularization rules to a given string.

        Args:
            string (str): Text string to be regularized.

        Returns:
            str: Regularized text string.
        """
        # Apply the regularization rules from the config
        for pattern, replacement in self.regularization_regex:
            string = pattern.sub(replacement, string)

        # Strip out any text matching the patterns to ignore
        for regex in self.regex_to_ignore:
            string = regex.sub("", string)

        # Collapse runs of whitespace, delete any literal 'mut' markers,
        # and trim the result
        string = re.sub(r'\s+', ' ', string)
        string = re.sub('mut', '', string)
        return string.strip()
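

# A minimal usage sketch (not part of the original module). The config below is
# hypothetical; real configs ship with the package as YAML files. Because of the
# relative import above, run this as a module: `python -m potnia.script`.
if __name__ == "__main__":
    demo = Script(config={
        "mappings": {"ko": "K", "no": "N"},     # hypothetical glyph mappings
        "special_chars_pattern": "-",           # split tokens on hyphens
        "regularization": [["\\[.*?\\]", ""]],  # drop bracketed editorial text
    })
    print(demo("ko-no"))                      # -> KN
    print(demo.to_transliteration("KN"))      # -> kono
    print(demo.regularize("K [restored] N"))  # -> K N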