Coverage for potnia/scripts/arabic.py: 100.00%

1import re

2from dataclasses import dataclass

3from ..script import Script

@dataclass
class Arabic(Script):
    """
    Transliteration and unicode conversion for Arabic text.

    The module-level singleton instance can be imported like so:
    ``from potnia import arabic``

    Transliterated input follows the DIN 31635 standard.
    If you need the Tim Buckwalter transliteration system, use the PyArabic
    library instead.

    Attributes:
        config (str): Path to the configuration file or configuration data in string format.
            By default, it uses the 'arabic.yaml' file in the 'data' directory.
    """
    config: str = "arabic"

    def to_unicode(self, text: str, regularize: bool = False) -> str:
        """
        Convert transliterated text to Arabic unicode.

        The input is first rewritten by an ordered list of regular-expression
        rules (case endings, hamza seats, definite article), then passed to the
        parent ``to_unicode``. The result is post-processed so that 'إسم' is
        spelled 'اسم' and an immediately repeated consonant is written as the
        consonant followed by a shadda.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Forwarded to the parent conversion. Defaults to False.

        Returns:
            str: Text converted to unicode format, optionally regularized.
        """
        # The rules are applied strictly in this order: the longer ending
        # 'atun' has to be consumed before 'un', and word-initial 'a' has to
        # become 'أ' before the definite-article rule can match 'أl-'.
        rewrite_rules = (
            # word ending in 'atun' -> taa marbuta with dammatan
            (r'(\w\w)atun\b', r'\1'+'َ\u0629\u064C'),
            # 'uʾ' -> hamza on top of waw
            (r'uʾ', '\u0624'),
            # word ending in 'un' -> dammatan
            (r'(\w\w)un\b', r'\1'+'\u064C'),
            # word ending in 'in' -> kasratan
            (r'(\w\w)in\b', r'\1'+'\u064D'),
            # word ending in 'an' -> fathatan
            (r'(\w\w)an\b', r'\1'+'\u064Bا'),
            # 'i' or 'a' at the start of a word (or right after a hyphen)
            # -> alif with hamza
            (r'\b[i]', 'إ'),
            (r'-[i]', "-إ"),
            (r'\b[a]', 'أ'),
            (r'-[a]', "-أ"),
            (r'\bʾa', 'أ'),
            # definite article
            (r'أl-', "ال"),
        )
        for pattern, replacement in rewrite_rules:
            text = re.sub(pattern, replacement, text)

        converted = super().to_unicode(text, regularize)

        # the word 'اسم' must not be written as 'إسم'
        converted = re.sub(r"إسم", "اسم", converted)

        # Letters that take a shadda when they occur twice in a row.
        shadda_letters = ''.join((
            'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش',
            'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن',
            'ه', 'و', 'ي',
        ))
        # One pass with a back-reference: a listed letter directly followed by
        # itself is replaced by that letter plus the shadda mark (U+0651).
        return re.sub(f'([{shadda_letters}])\\1', '\\1\u0651', converted)


# Module-level singleton instance of the Arabic script handler.
arabic = Arabic()