Coverage for potnia/scripts/linear_a.py: 100.00%
56 statements
« prev ^ index » next coverage.py v7.6.3, created at 2025-04-03 23:35 +0000
« prev ^ index » next coverage.py v7.6.3, created at 2025-04-03 23:35 +0000
1import re
2from dataclasses import dataclass
3from ..script import Script
@dataclass
class LinearA(Script):
    """
    Class for handling text transliteration and unicode conversion for Linear A.

    To use the singleton instance, import like so:
    ``from potnia import linear_a``

    Attributes:
        config (str): Path to the configuration file or configuration data in string format.
            By default, it uses the 'linear_a.yaml' file in the 'data' directory.
    """
    config: str = "linear_a.yaml"

    def tokenize_transliteration(self, input_string: str) -> list[str]:
        """
        Tokenizes transliterated text according to specific patterns.

        The bracketed sequences "[?]" and "[unclassified]" are kept as single
        tokens; '[', ']' and ' ' become stand-alone tokens; '-' acts purely as
        a token separator and is dropped from the output.

        Args:
            input_string (str): Input text in transliterated format.

        Returns:
            list[str]: List of tokens.
        """
        tokens = []
        token = ""
        i = 0

        while i < len(input_string):
            char = input_string[i]

            # Check for special sequences like "[?]" and "[unclassified]"
            if char == '[':
                if input_string.startswith('[?]', i):
                    if token:
                        tokens.append(token)
                    tokens.append("[?]")
                    token = ""
                    i += 3  # Skip past "[?]"
                    continue
                elif input_string.startswith('[unclassified]', i):
                    if token:
                        tokens.append(token)
                    tokens.append("[unclassified]")
                    token = ""
                    i += 14  # Skip past "[unclassified]"
                    continue

            # '[', ']' and ' ' are emitted as tokens in their own right
            if char in '[] ':
                if token:
                    tokens.append(token)
                    token = ""
                tokens.append(char)
            # '-' separates tokens but is not itself emitted
            elif char == '-':
                if token:
                    tokens.append(token)
                    token = ""
            # Any other character accumulates into the current token
            else:
                token += char
            i += 1

        # Add the last token if it exists
        if token:
            tokens.append(token)

        return tokens

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes a unicode string by splitting and joining words with dashes.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of single-character tokens, with '-' inserted
            between consecutive Aegean-script characters.
        """
        def is_aegean(char):
            # U+10000-U+1007F (Linear B syllabary) and U+10600-U+1077F (Linear A block).
            return "\U00010000" <= char <= "\U0001007F" or "\U00010600" <= char <= "\U0001077F"

        # Insert hyphens between consecutive Aegean-script characters
        modified_text = ""
        prev_was_aegean = False

        for char in text:
            if is_aegean(char):
                if prev_was_aegean:
                    modified_text += "-"  # Separate consecutive Aegean signs with a hyphen
                modified_text += char
                prev_was_aegean = True
            else:
                modified_text += char
                prev_was_aegean = False  # Reset flag on encountering a non-Aegean character

        return list(modified_text)
# Module-level singleton, importable as ``from potnia import linear_a``
# (see the class docstring above).
linear_a = LinearA()