Module:ar-translit

Documentation for this module may be created at Module:ar-translit/doc
-- Table returned by this module; public functions are attached to it further down.
local export = {}

-- Character-to-Latin lookup used for the final, character-by-character pass.
-- Keys are single UTF-8 characters; an empty string means the character
-- produces no output.
local tt = {
	-- consonants
	["ب"] = "b",
	["ت"] = "t",
	["ث"] = "ṯ",
	["ج"] = "j",
	["ح"] = "ḥ",
	["خ"] = "ḵ",
	["د"] = "d",
	["ذ"] = "ḏ",
	["ر"] = "r",
	["ز"] = "z",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "ṣ",
	["ض"] = "ḍ",
	["ط"] = "ṭ",
	["ظ"] = "ẓ",
	["ع"] = "ʿ",
	["غ"] = "ḡ",
	["ف"] = "f",
	["ق"] = "q",
	["ك"] = "k",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	["ه"] = "h",

	-- control characters
	["\226\128\140"] = "-", -- U+200C ZWNJ (zero-width non-joiner)
	-- ["\226\128\141"] = "", -- U+200D ZWJ (zero-width joiner), deliberately not mapped

	-- rare letters
	["پ"] = "p",
	["چ"] = "č",
	["ڤ"] = "v",
	["گ"] = "g",
	["ڨ"] = "g",
	["ڧ"] = "q",

	-- alif
	["\216\167"] = "ā", -- U+0627 bare alif
	["\216\162"] = "ʾā", -- U+0622 alif with madda
	["ٱ"] = "", -- hámzat-al-wáṣl
	["\217\176"] = "ā", -- U+0670 ʾálif xanjaríyya (superscript alif)
	["ى"] = "ā",

	-- hamzated letters and bare hamza
	["\216\163"] = "ʾ", -- U+0623 alif with hamza above
	["إ"] = "ʾ",
	["ؤ"] = "ʾ",
	["ئ"] = "ʾ",
	["ء"] = "ʾ",

	-- semivowels / long vowels
	["و"] = "ū", -- "w" before and after vowels except for ḍámma (u)
	["ي"] = "ī", -- "y" before and after vowels except for kásra (i)

	-- short vowels and sukūn
	["\217\142"] = "a", -- U+064E fátḥa
	["\217\144"] = "i", -- U+0650 kásra
	["\217\143"] = "u", -- U+064F ḍámma
	["\217\146"] = "", -- U+0652 sukūn: no vowel

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "llāh",

	-- taṭwīl (elongation stroke), no sound
	["ـ"] = "",

	-- numerals
	["٠"] = "0",
	["١"] = "1",
	["٢"] = "2",
	["٣"] = "3",
	["٤"] = "4",
	["٥"] = "5",
	["٦"] = "6",
	["٧"] = "7",
	["٨"] = "8",
	["٩"] = "9",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["\216\140"] = ",", -- U+060C comma
	["؛"] = ";", -- semicolon
}
-- tāʾ marbūṭa (special) - always after a fátḥa (a), silent at the end of an utterance, "t" in ʾiḍāfa or with pronounced tanwīn
-- tanwin nasb: often used with ʾálif (before ʾálif in formal writing, misspelled after ʾálif)

-- translit any words or phrases
--- Transliterate vocalised Arabic text into Latin script.
-- The text goes through a fixed sequence of contextual rewrites (diacritic
-- order, hamza, semivowels, shadda, definite article, word endings) and is then
-- mapped character by character through `tt`; characters that have no entry
-- in `tt` are passed through unchanged.
-- @param text        string: the Arabic text to transliterate
-- @param lang        not used by this function
-- @param sc          not used by this function
-- @param showI3raab  when truthy, show ʾiʿrāb (desinential inflection): tāʾ
--                    marbūṭa is written "t" and tanwīn "an"/"in"/"un", and
--                    final short vowels are kept. Otherwise tāʾ marbūṭa
--                    becomes "(t)", tanwīn is dropped, and a short vowel
--                    directly before whitespace or at the very end of the
--                    text is dropped.
-- @return string: the transliteration
function export.tr(text, lang, sc, showI3raab)
	local gsub = mw.ustring.gsub

	-- NFC normalisation, which MediaWiki does for all Unicode strings, sorts
	-- shadda (combining class 33) after every short vowel and tanwīn mark
	-- (classes 27-32), so "consonant + shadda + vowel" arrives as
	-- "consonant + vowel + shadda". That makes the rules below inconvenient,
	-- so undo it for all six marks, not only for fatḥa.
	text = gsub(text, "([\217\139\217\140\217\141\217\142\217\143\217\144])\217\145", "\217\145%1")

	-- A bare alif that carries a short vowel is really a hamza seat: add the hamza.
	text = gsub(text, "\216\167([\217\142\217\143])", "\216\163%1") -- fatḥa / ḍamma: hamza above
	text = gsub(text, "\216\167\217\144", "\216\165\217\144") -- kasra: hamza below
	text = gsub(text, "\217\143\217\136\216\167", "ū") -- ignore alif jamīla

	-- wāw / yāʾ followed by any diacritic mark (including shadda) is the
	-- consonant w / y; otherwise it stays and becomes ū / ī via `tt`.
	-- This has to run BEFORE shadda is expanded: once "wāw + shadda" has been
	-- turned into two wāws, the first one is no longer followed by a mark and
	-- would wrongly come out as "ū" (e.g. "ūwa" instead of "wwa").
	text = gsub(text, "\217\136([\217\139\217\140\217\141\217\142\217\143\217\144\217\145\217\146])", "w%1")
	text = gsub(text, "\217\138([\217\139\217\140\217\141\217\142\217\143\217\144\217\145\217\146])", "y%1")

	-- shadda: double whatever character precedes it
	text = gsub(text, "(.)\217\145", "%1%1")

	-- Definite article at the start of the text or after whitespace. Because
	-- the hamza step above has already rewritten every "alif + fatḥa", the
	-- optional fatḥa here can only follow alif waṣla.
	text = gsub(text, "^[\216\167\217\177]\217\142?\217\132", "al-")
	text = gsub(text, "%s[\216\167\217\177]\217\142?\217\132", " al-")

	if showI3raab then -- show ʾiʿrāb (desinential inflection) in transliteration
		text = gsub(text, ".", {
			["\216\169"] = "t", -- tāʾ marbūṭa
			["\217\139"] = "an", ["\217\141"] = "in", ["\217\140"] = "un", -- tanwīn
			["\217\142"] = "a", ["\217\144"] = "i", ["\217\143"] = "u" -- short vowels
		})
	else
		text = gsub(text, "\216\169", "(t)") -- tāʾ marbūṭa
		text = gsub(text, "[\217\139\217\140\217\141]", "") -- drop tanwīn
		-- drop a short vowel that ends a word (before whitespace) or the text
		text = gsub(text, "[\217\142\217\143\217\144]%s", " ")
		text = gsub(text, "[\217\142\217\143\217\144]$", "")
	end

	-- Map every remaining character through the lookup table.
	text = gsub(text, ".", tt)

	-- A short vowel followed by its matching long vowel is just the long vowel.
	text = gsub(text, "aā", "ā")
	text = gsub(text, "iī", "ī")
	text = gsub(text, "uū", "ū")

	return text
end

-- Hand the module table (with `tr` attached) back to the code that loads this module.
return export