Coverage for potnia/scripts/arabic.py: 100.00%

25 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2025-04-03 23:35 +0000

1import re 

2from dataclasses import dataclass 

3from ..script import Script 

4 

@dataclass
class Arabic(Script):
    """
    Script handler that turns transliterated (romanized) text into Arabic script.

    A ready-made singleton is exported by the package and can be imported with
    ``from potnia import arabic``.

    Transliteration follows the DIN 31635 standard. For the Tim Buckwalter
    transliteration system, use the PyArabic library instead.

    Attributes:
        config (str): Path to the configuration file, or configuration data in
            string format. Defaults to ``"arabic"``, i.e. the 'arabic.yaml'
            file in the 'data' directory.
    """
    config: str = "arabic"

    def to_unicode(self, text: str, regularize: bool = False) -> str:
        """
        Convert transliterated text to Arabic unicode.

        The conversion runs in three stages: regex rewrites on the romanized
        input (case endings, hamza carriers, the definite article), the generic
        conversion inherited from ``Script.to_unicode``, and finally clean-up
        of the Arabic output (spelling of 'اسم' and doubled consonants).

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Passed through unchanged to
                ``Script.to_unicode``. Defaults to False.

        Returns:
            str: The text converted to unicode format.
        """
        # Rewrites applied to the romanized text, in this exact order: later
        # rules rely on what earlier ones have already consumed or produced.
        # The ending rules require two word characters before the ending.
        romanized_rules = (
            # word-final 'atun' -> taa marbuta (U+0629) with dammatan (U+064C)
            (r'(\w\w)atun\b', r'\1' + 'َ\u0629\u064C'),
            # 'uʾ' -> waw carrying a hamza (U+0624)
            (r'uʾ', '\u0624'),
            # word-final 'un' -> dammatan (U+064C)
            (r'(\w\w)un\b', r'\1' + '\u064C'),
            # word-final 'in' -> kasratan (U+064D)
            (r'(\w\w)in\b', r'\1' + '\u064D'),
            # word-final 'an' -> fathatan (U+064B) followed by alif
            (r'(\w\w)an\b', r'\1' + '\u064Bا'),
            # 'i' at a word boundary or after a hyphen -> alif with hamza below
            (r'\b[i]', 'إ'),
            (r'-[i]', "-إ"),
            # 'a' at a word boundary or after a hyphen -> alif with hamza above
            (r'\b[a]', 'أ'),
            (r'-[a]', "-أ"),
            # word-initial 'ʾa' -> alif with hamza above
            (r'\bʾa', 'أ'),
            # definite article: the alif produced above plus 'l-' -> 'ال'
            (r'أl-', "ال"),
        )
        for pattern, replacement in romanized_rules:
            text = re.sub(pattern, replacement, text)

        # Generic conversion implemented by the parent Script class.
        text = super().to_unicode(text, regularize)

        # The word 'اسم' is written with a plain alif, not 'إسم'.
        text = re.sub(r"إسم", "اسم", text)

        # A consonant written twice in a row becomes that consonant followed
        # by a shadda (U+0651).
        geminable_letters = (
            'ب ت ث ج ح خ د ذ ر ز س ش '
            'ص ض ط ظ ع غ ف ق ك ل م ن '
            'ه و ي'
        ).split()
        for letter in geminable_letters:
            doubled = f'{letter}{letter}'
            text = re.sub(doubled, f'{letter}\u0651', text)

        return text

69 

70 

# Module-level singleton instance of the Arabic script handler; intended to be
# imported with ``from potnia import arabic``.
arabic = Arabic()