Coverage for potnia/scripts/linear_b.py: 100.00%
17 statements
« prev ^ index » next coverage.py v7.6.3, created at 2025-04-03 23:35 +0000
« prev ^ index » next coverage.py v7.6.3, created at 2025-04-03 23:35 +0000
1import re
2from dataclasses import dataclass
3from ..script import Script
@dataclass
class LinearB(Script):
    """
    Class for handling text transliteration and unicode conversion for Linear B.

    To use the singleton instance, import like so:
        ``from potnia import linear_b``

    Designed especially for texts from DĀMOS (Database of Mycenaean at Oslo): https://damos.hf.uio.no/
    and LiBER (Linear B Electronic Resources): https://liber.cnr.it/

    Attributes:
        config (str): Path to the configuration file or configuration data in string format.
            By default, it uses the 'linear_b.yaml' file in the 'data' directory.
    """
    config: str = "linear_b"

    def regularize(self, text: str) -> str:
        """
        Applies regularization rules to a given string.

        The base ``Script.regularize`` rules are applied first. If the result
        contains nothing other than ``%`` characters and whitespace, it is
        treated as uninformative and an empty string is returned instead.

        Args:
            text (str): Text string to be regularized.

        Returns:
            str: Regularized text string, or ``""`` when no informative
                characters remain.
        """
        text = super().regularize(text)

        # Ensure there are informative characters left in the text: strip
        # every '%' and whitespace character and see whether anything survives.
        if not re.sub(r'[%\s]', "", text):
            return ""

        return text

    def tokenize_unicode(self, text: str) -> list[str]:
        """
        Tokenizes a unicode string by splitting and joining words with dashes.

        The text is split on whitespace, the characters of every word are
        joined with ``-``, the words are re-joined with single spaces, and the
        resulting string is returned character by character. For example,
        ``"ab cd"`` becomes ``["a", "-", "b", " ", "c", "-", "d"]``.

        Args:
            text (str): Input text in unicode format.

        Returns:
            list[str]: List of tokenized strings.
        """
        words = ['-'.join(word) for word in text.split()]
        # list() over the joined string yields one token per character,
        # including the '-' and ' ' separators inserted above.
        return list(' '.join(words))
# Module-level singleton instance, created with the default configuration.
linear_b = LinearB()