Coverage for potnia/scripts/arabic.py: 100.00%
25 statements
« prev ^ index » next coverage.py v7.6.3, created at 2025-04-03 23:35 +0000
« prev ^ index » next coverage.py v7.6.3, created at 2025-04-03 23:35 +0000
1import re
2from dataclasses import dataclass
3from ..script import Script
@dataclass
class Arabic(Script):
    """
    Transliteration-to-Unicode converter for the Arabic script.

    A ready-made singleton instance can be imported like so:
    ``from potnia import arabic``

    Transliteration follows the DIN 31635 standard.

    For the Tim Buckwalter transliteration system, use the PyArabic library
    instead.

    Attributes:
        config (str): Path to the configuration file or configuration data in string format.
            By default, it uses the 'arabic.yaml' file in the 'data' directory.
    """
    config: str = "arabic"

    def to_unicode(self, text: str, regularize: bool = False) -> str:
        """
        Convert transliterated text into Arabic unicode text.

        The conversion runs in three stages: regex rewrites on the
        transliterated input, the parent class's ``to_unicode``, and finally
        clean-up rewrites on the converted output.

        Args:
            text (str): Input text in transliterated format.
            regularize (bool, optional): Forwarded unchanged to the parent
                class's ``to_unicode``. Defaults to False.

        Returns:
            str: The converted text.
        """
        # Rewrites applied to the transliteration, strictly in this order:
        # later rules rely on the output of earlier ones (e.g. the definite
        # article rule matches the 'أ' produced by the word-initial 'a' rule).
        pre_conversion_rules = (
            # word-final 'atun' -> fatha + taa marbuta + dammatan
            (r'(\w\w)atun\b', r'\1'+'َ\u0629\u064C'),
            # 'uʾ' -> waw with hamza above
            (r'uʾ', '\u0624'),
            # word-final 'un' -> dammatan
            (r'(\w\w)un\b', r'\1'+'\u064C'),
            # word-final 'in' -> kasratan
            (r'(\w\w)in\b', r'\1'+'\u064D'),
            # word-final 'an' -> fathatan followed by alif
            (r'(\w\w)an\b', r'\1'+'\u064Bا'),
            # 'i' at a word boundary or after a hyphen -> alif with hamza below
            (r'\b[i]', 'إ'),
            (r'-[i]', "-إ"),
            # 'a' at a word boundary or after a hyphen -> alif with hamza above
            (r'\b[a]', 'أ'),
            (r'-[a]', "-أ"),
            # explicit word-initial hamza followed by 'a'
            (r'\bʾa', 'أ'),
            # definite article
            (r'أl-', "ال"),
        )
        for pattern, replacement in pre_conversion_rules:
            text = re.sub(pattern, replacement, text)

        # Hand the remaining conversion over to the parent Script class.
        text = super().to_unicode(text, regularize)

        # The word 'اسم' must not carry a hamza: undo 'إسم'.
        text = re.sub(r"إسم", "اسم", text)

        # Letters for which a doubled occurrence is collapsed into the letter
        # followed by a shadda (U+0651).
        shadda_letters = (
            'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش',
            'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن',
            'ه', 'و', 'ي',
        )
        for letter in shadda_letters:
            doubled = letter * 2
            text = re.sub(doubled, letter + '\u0651', text)

        return text
# Module-level singleton instance of Arabic, built with the default config;
# per the class docstring it is meant to be imported via ``from potnia import arabic``.
71arabic = Arabic()