Coverage for potnia/script.py: 100.00%

58 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2025-04-03 23:35 +0000

1import re 

2from functools import reduce 

3from pathlib import Path 

4from dataclasses import dataclass 

5 

6from .data import read_data 

7 

@dataclass
class Script:
    """
    The abstract base class for handling text transliteration and unicode conversion.

    Attributes:
        config (str): Path to the configuration file or configuration data in YAML format.
            An already-parsed configuration mapping (e.g. a dict) is also accepted as-is.
    """
    config: str

    def __post_init__(self):
        """Initializes configuration and sets up mappings, patterns, and regularization rules."""

        # A str/Path is a location to load configuration from; anything else
        # (e.g. an already-parsed mapping) is used directly.
        if isinstance(self.config, (Path, str)):
            self.config = read_data(self.config)
        # Fixed: message was a placeholder-less f-string (lint F541).
        # NOTE(review): `assert` is stripped under `python -O`; kept to preserve
        # the AssertionError contract for existing callers.
        assert self.config, "Configuration not found"

        # Forward map: transliteration token -> unicode character.
        self.transliteration_to_unicode_dict = self.config.get('mappings', {})

        # Reverse map. When several transliterations share one unicode
        # character, the first one listed in `mappings` wins.
        self.unicode_to_transliteration_dict = {}
        for transliteration, unicode_char in self.transliteration_to_unicode_dict.items():
            self.unicode_to_transliteration_dict.setdefault(unicode_char, transliteration)

        # Load patterns to ignore (their matches are stripped during regularization).
        patterns_to_ignore = self.config.get('patterns_to_ignore', [])
        self.regex_to_ignore = [re.compile(pattern) for pattern in patterns_to_ignore]

        # Load regularization rules. Doubled backslashes (YAML escaping) are
        # collapsed to single ones before compiling each pattern.
        self.regularization_regex = [
            (re.compile(re.sub(r'\\\\', r'\\', pattern)), replacement)
            for pattern, replacement in self.config.get('regularization', [])
        ]

        # Load transliteration (tokenization) rules, applied in listed order.
        self.transliteration_patterns = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.config.get('tokenization', [])
        ]
        self.complex_symbols = self.config.get('complex_symbols', {})
        self.special_chars_pattern = re.compile(self.config.get('special_chars_pattern', ''))
        self.restore_patterns = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.config.get('restore_patterns', [])
        ]

        # Reverse the complex_symbols dictionary (placeholder -> symbol).
        self.reversed_symbols = {v: k for k, v in self.complex_symbols.items()}

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes unicode text according to specific patterns.

        By default, it tokenizes each character as a separate token.
        This method can be overridden in subclasses to provide more complex tokenization.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of tokens.
        """
        return list(text)

    def tokenize_transliteration(self, text: str) -> list[str]:
        """
        Tokenizes transliterated text according to specific patterns.

        Args:
            text (str): Input text in transliterated format.

        Returns:
            list[str]: List of tokens (returns [""] when no tokens survive filtering).
        """
        # Replace complex symbols with placeholders so the regex passes below
        # cannot split them apart.
        for symbol, placeholder in self.complex_symbols.items():
            text = text.replace(symbol, placeholder)

        # Apply each tokenization pattern replacement in order.
        for pattern, replacement in self.transliteration_patterns:
            text = pattern.sub(replacement, text)

        # Protect spaces with a private-use codepoint placeholder
        # (assumed absent from real input) so they survive splitting.
        space_placeholder = "\uE000"
        text = text.replace(" ", space_placeholder)

        # Tokenize using the special characters pattern.
        tokens = self.special_chars_pattern.split(text)

        # Run restore patterns over each token; drop empty and "-" separator tokens.
        tokenized = [
            " " if tok == space_placeholder else
            reduce(lambda t, p: p[0].sub(p[1], t), self.restore_patterns, tok)
            for tok in tokens if tok and tok != "-"
        ]

        # Restore complex symbols using the reversed dictionary.
        for placeholder, symbol in self.reversed_symbols.items():
            tokenized = [tok.replace(placeholder, symbol) for tok in tokenized]

        return tokenized if tokenized else [""]

    def to_transliteration(self, text: str) -> str:
        """
        Converts unicode text to transliteration format.

        NB. This function may not work as expected for all scripts/languages
        because there may not be a one-to-one mapping between unicode and transliteration.

        Args:
            text (str): Input text in unicode format.

        Returns:
            str: Transliterated text.
        """
        tokens = self.tokenize_unicode(text)
        # Tokens without a reverse mapping pass through unchanged.
        return "".join(
            self.unicode_to_transliteration_dict.get(token, token)
            for token in tokens
        )

    def to_unicode(self, text: str, regularize: bool = False) -> str:
        """
        Converts transliterated text to unicode format.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Whether to apply regularization. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        tokens = self.tokenize_transliteration(text)
        # Tokens without a mapping pass through unchanged.
        result = "".join(self.transliteration_to_unicode_dict.get(token, token) for token in tokens)
        if regularize:
            result = self.regularize(result)
        return result

    def __call__(self, text: str, regularize: bool = False) -> str:
        """
        Allows the class instance to be called as a function for unicode conversion.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Whether to apply regularization. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        return self.to_unicode(text, regularize=regularize)

    def regularize(self, string: str) -> str:
        """
        Applies regularization rules to a given string.

        Args:
            string (str): Text string to be regularized.

        Returns:
            str: Regularized text string.
        """
        for pattern, replacement in self.regularization_regex:
            string = pattern.sub(replacement, string)

        for regex in self.regex_to_ignore:
            string = regex.sub("", string)
        string = re.sub(r'\s+', ' ', string)
        # NOTE(review): 'mut' is removed AFTER whitespace collapsing, so removing
        # a standalone 'mut' can leave a doubled internal space — confirm intended.
        string = re.sub('mut', '', string)
        return string.strip()

177 return string.strip()