Coverage for potnia/scripts/linear_a.py: 100.00%

56 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2025-04-03 23:35 +0000

1import re 

2from dataclasses import dataclass 

3from ..script import Script 

4 

@dataclass
class LinearA(Script):
    """
    Class for handling text transliteration and unicode conversion for Linear A.

    To use the singleton instance, import like so:
    ``from potnia import linear_a``

    Attributes:
        config (str): Path to the configuration file or configuration data in string format.
            By default, it uses the 'linear_a.yaml' file in the 'data' directory.
    """
    config: str = "linear_a.yaml"

    def tokenize_transliteration(self, input_string: str) -> list[str]:
        """
        Tokenizes transliterated text according to specific patterns.

        Sign names separated by '-' become individual tokens (the dash itself
        is dropped), while '[', ']' and spaces are emitted as their own tokens.
        The bracketed markers "[?]" and "[unclassified]" are kept intact as
        single tokens instead of being split into brackets plus contents.

        Args:
            input_string (str): Input text in transliterated format.

        Returns:
            list[str]: List of tokens.
        """
        tokens = []
        token = ""
        i = 0

        while i < len(input_string):
            char = input_string[i]

            # Check for special sequences like "[?]" and "[unclassified]",
            # which must survive as single tokens.
            if char == '[':
                if input_string[i:i + 3] == '[?]':
                    if token:
                        tokens.append(token)
                    tokens.append("[?]")
                    token = ""
                    i += 3  # Skip past "[?]"
                    continue
                elif input_string[i:i + 14] == '[unclassified]':
                    if token:
                        tokens.append(token)
                    tokens.append("[unclassified]")
                    token = ""
                    i += 14  # Skip past "[unclassified]"
                    continue

            # ']', '[' and ' ' each terminate the current token and are
            # emitted as standalone tokens.
            if char in '[] ':
                if token:
                    tokens.append(token)
                    token = ""
                tokens.append(char)
            # '-' terminates the current token but is not itself emitted.
            elif char == '-':
                if token:
                    tokens.append(token)
                    token = ""
            # Any other character accumulates into the current token.
            else:
                token += char
            i += 1

        # Add the last token if it exists
        if token:
            tokens.append(token)

        return tokens

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes a unicode string by splitting and joining words with dashes.

        Inserts a '-' between each pair of consecutive Aegean-script signs,
        then returns the result split into individual characters.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of tokenized strings (single characters,
            including the inserted hyphens).
        """
        def is_aegean(char):
            # U+10000-U+1007F is the Linear B Syllabary block and
            # U+10600-U+1077F is the Linear A block.
            # NOTE(review): the Linear B Syllabary range is presumably
            # included because some codepoints are shared in practice —
            # confirm against the corpus data.
            return "\U00010000" <= char <= "\U0001007F" or "\U00010600" <= char <= "\U0001077F"

        # Insert hyphens between consecutive Aegean-script characters
        modified_text = ""
        prev_was_aegean = False

        for char in text:
            if is_aegean(char):
                if prev_was_aegean:
                    modified_text += "-"  # Add hyphen if previous character was also an Aegean sign
                modified_text += char
                prev_was_aegean = True
            else:
                modified_text += char
                prev_was_aegean = False  # Reset flag on encountering a non-Aegean character

        return list(modified_text)

102 

103 

# Module-level singleton instance, importable as ``from potnia import linear_a``.
linear_a = LinearA()