Coverage for potnia/scripts/linear_b.py: 100.00%

17 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2025-04-03 23:35 +0000

1import re 

2from dataclasses import dataclass 

3from ..script import Script 

4 

@dataclass
class LinearB(Script):
    """
    Class for handling text transliteration and unicode conversion for Linear B.

    To use the singleton instance, import like so:
        ``from potnia import linear_b``

    Designed especially for texts from DĀMOS (Database of Mycenaean at Oslo): https://damos.hf.uio.no/
    and LiBER (Linear B Electronic Resources): https://liber.cnr.it/

    Attributes:
        config (str): Path to the configuration file or configuration data in string format.
            By default, it uses the 'linear_b.yaml' file in the 'data' directory.
    """
    config: str = "linear_b"

    def regularize(self, text: str) -> str:
        """
        Applies regularization rules to a given string.

        The parent class's regularization is applied first. If the result
        contains nothing other than '%' characters and whitespace, an empty
        string is returned instead.

        Args:
            text (str): Text string to be regularized.

        Returns:
            str: Regularized text string, or ``""`` when no informative
                characters remain.
        """
        text = super().regularize(text)

        # Ensure there are informative characters left in the text:
        # strip every '%' and whitespace character and see if anything remains.
        if not re.sub(r'[%\s]', "", text):
            return ""

        return text

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes a unicode string into a list of single characters.

        The text is split on whitespace into words, a dash is inserted
        between the characters of each word, and the words are rejoined with
        single spaces. The resulting string is then split into its individual
        characters, so the inserted dashes and spaces appear as tokens too.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of single-character tokens.
        """
        words = ['-'.join(word) for word in text.split()]
        return list(' '.join(words))

54 

55 

# Module-level singleton instance; import it with ``from potnia import linear_b``.
linear_b = LinearB()