Coverage for potnia/script.py: 100.00%

58 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2025-04-03 23:35 +0000

1import re 

2from functools import reduce 

3from pathlib import Path 

4from dataclasses import dataclass 

5 

6from .data import read_data 

7 

@dataclass
class Script:
    """
    The abstract base class for handling text transliteration and unicode conversion.

    Attributes:
        config (str): Path to the configuration file or configuration data in YAML format.
            An already-parsed configuration mapping (e.g. a dict) is also accepted as-is.
    """
    config: str

    def __post_init__(self):
        """Initializes configuration and sets up mappings, patterns, and regularization rules."""

        # A str/Path is a location to load configuration from; anything else
        # (e.g. an already-parsed mapping) is used directly.
        if isinstance(self.config, (Path, str)):
            self.config = read_data(self.config)
        # Fixed: message was a placeholder-less f-string (lint F541).
        # NOTE(review): `assert` is stripped under `python -O`; kept to preserve
        # the AssertionError contract for existing callers.
        assert self.config, "Configuration not found"

        # Forward map: transliteration token -> unicode character.
        self.transliteration_to_unicode_dict = self.config.get('mappings', {})

        # Reverse map. When several transliterations share one unicode
        # character, the first one listed in `mappings` wins.
        self.unicode_to_transliteration_dict = {}
        for transliteration, unicode_char in self.transliteration_to_unicode_dict.items():
            self.unicode_to_transliteration_dict.setdefault(unicode_char, transliteration)

        # Load patterns to ignore (their matches are stripped during regularization).
        patterns_to_ignore = self.config.get('patterns_to_ignore', [])
        self.regex_to_ignore = [re.compile(pattern) for pattern in patterns_to_ignore]

        # Load regularization rules. Doubled backslashes (YAML escaping) are
        # collapsed to single ones before compiling each pattern.
        self.regularization_regex = [
            (re.compile(re.sub(r'\\\\', r'\\', pattern)), replacement)
            for pattern, replacement in self.config.get('regularization', [])
        ]

        # Load transliteration (tokenization) rules, applied in listed order.
        self.transliteration_patterns = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.config.get('tokenization', [])
        ]
        self.complex_symbols = self.config.get('complex_symbols', {})
        self.special_chars_pattern = re.compile(self.config.get('special_chars_pattern', ''))
        self.restore_patterns = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.config.get('restore_patterns', [])
        ]

        # Reverse the complex_symbols dictionary (placeholder -> symbol).
        self.reversed_symbols = {v: k for k, v in self.complex_symbols.items()}

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes unicode text according to specific patterns.

        By default, it tokenizes each character as a separate token.
        This method can be overridden in subclasses to provide more complex tokenization.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of tokens.
        """
        return list(text)

    def tokenize_transliteration(self, text: str) -> list[str]:
        """
        Tokenizes transliterated text according to specific patterns.

        Args:
            text (str): Input text in transliterated format.

        Returns:
            list[str]: List of tokens (returns [""] when no tokens survive filtering).
        """
        # Replace complex symbols with placeholders so the regex passes below
        # cannot split them apart.
        for symbol, placeholder in self.complex_symbols.items():
            text = text.replace(symbol, placeholder)

        # Apply each tokenization pattern replacement in order.
        for pattern, replacement in self.transliteration_patterns:
            text = pattern.sub(replacement, text)

        # Protect spaces with a private-use codepoint placeholder
        # (assumed absent from real input) so they survive splitting.
        space_placeholder = "\uE000"
        text = text.replace(" ", space_placeholder)

        # Tokenize using the special characters pattern.
        tokens = self.special_chars_pattern.split(text)

        # Run restore patterns over each token; drop empty and "-" separator tokens.
        tokenized = [
            " " if tok == space_placeholder else
            reduce(lambda t, p: p[0].sub(p[1], t), self.restore_patterns, tok)
            for tok in tokens if tok and tok != "-"
        ]

        # Restore complex symbols using the reversed dictionary.
        for placeholder, symbol in self.reversed_symbols.items():
            tokenized = [tok.replace(placeholder, symbol) for tok in tokenized]

        return tokenized if tokenized else [""]

    def to_transliteration(self, text: str) -> str:
        """
        Converts unicode text to transliteration format.

        NB. This function may not work as expected for all scripts/languages
        because there may not be a one-to-one mapping between unicode and transliteration.

        Args:
            text (str): Input text in unicode format.

        Returns:
            str: Transliterated text.
        """
        tokens = self.tokenize_unicode(text)
        # Tokens without a reverse mapping pass through unchanged.
        return "".join(
            self.unicode_to_transliteration_dict.get(token, token)
            for token in tokens
        )

    def to_unicode(self, text: str, regularize: bool = False) -> str:
        """
        Converts transliterated text to unicode format.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Whether to apply regularization. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        tokens = self.tokenize_transliteration(text)
        # Tokens without a mapping pass through unchanged.
        result = "".join(self.transliteration_to_unicode_dict.get(token, token) for token in tokens)
        if regularize:
            result = self.regularize(result)
        return result

    def __call__(self, text: str, regularize: bool = False) -> str:
        """
        Allows the class instance to be called as a function for unicode conversion.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Whether to apply regularization. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        return self.to_unicode(text, regularize=regularize)

    def regularize(self, string: str) -> str:
        """
        Applies regularization rules to a given string.

        Args:
            string (str): Text string to be regularized.

        Returns:
            str: Regularized text string.
        """
        for pattern, replacement in self.regularization_regex:
            string = pattern.sub(replacement, string)

        for regex in self.regex_to_ignore:
            string = regex.sub("", string)
        string = re.sub(r'\s+', ' ', string)
        # NOTE(review): 'mut' is removed AFTER whitespace collapsing, so removing
        # a standalone 'mut' can leave a doubled internal space — confirm intended.
        string = re.sub('mut', '', string)
        return string.strip()

177 return string.strip()