-- Module:Tibt-common
--
-- From Wiktionary, the free dictionary
-- Jump to navigation Jump to search


local export = {}

local m_str_utils = require("Module:string utilities")

local find = m_str_utils.find
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local match = m_str_utils.match
local sub = m_str_utils.sub
local toNFC = mw.ustring.toNFC
local u = m_str_utils.char

-- Finds the main stack of a given syllable, which allows all other components to be determined (and is the basis for sorting and transliteration). Currently defaults to Classical Tibetan, but if a ruleset for a specific language exists, it will use that instead. Once the main stack has been located, the process of sorting and transliteration is the same. Because of this, [[Module:Tibt-sortkey]] and [[Module:Tibt-translit]] only need to be pointed at this common function.
-- Uses a (somewhat expanded) implementation of the algorithm found in "Algorithmic description of the decomposition and checking of a Classical Tibetan syllable" by Roux, Hildt & Drupchen: https://escholarship.org/uc/item/70z8069f
-- Parameters:
--   text: the syllable to analyse.
--   langCode: language code used to look up "Module:<langCode>-common"; if that module cannot be loaded, [[Module:bo-common]] is used instead.
-- Returns the main stack as produced by the ruleset's mainStackChecks (with any halanta pseudo-stack converted back). If no check succeeds but the syllable matches an entry in the ruleset's ambiguousSyllables, returns the post-converted stack followed by `true`.
-- Throws "Invalid syllable ..." if neither applies.
function export.findMainStack(text, langCode)
	-- If a language-specific module exists, use the ruleset in that. If not, fall back on the Tibetan module [[Module:bo-common]].
	-- The module is taken from pcall's return value: inside the closure the local `langModule` is not yet in scope, so assigning to it there would write a global instead.
	local langModuleCheck, langModule = pcall(function() return require("Module:" .. langCode .. "-common") end)
	if not langModuleCheck then
		langModule = require("Module:bo-common")
	end
	local sc = require("Module:scripts").getByCode("Tibt")
	text = sc:fixDiscouragedSequences(text)
	text = sc:toFixedNFC(text)
	local origText, mainStack = text
	
	-- If halantas are present, the input must be modified so as to treat the parent consonant + any that follow as a pseudo-stack before being processed by the rules. The locations are then stored, so that the pseudo-stack can be converted back again if it is found to be the main stack.
	local halantaSubs, halantas = {}, {}
	if match(text, "྄") and match(text, "[^྄]$") then
		-- Each pair is {halanta + base letter, equivalent subjoined letter}.
		halantaSubs = {
			{"྄ཀ", "ྐ"}, {"྄ཁ", "ྑ"}, {"྄ག", "ྒ"}, {"྄ང", "ྔ"}, {"྄ཅ", "ྕ"}, {"྄ཆ", "ྖ"}, {"྄ཇ", "ྗ"}, {"྄ཉ", "ྙ"}, {"྄ཊ", "ྚ"}, {"྄ཋ", "ྛ"}, {"྄ཌ", "ྜ"}, {"྄ཎ", "ྞ"}, {"྄ཏ", "ྟ"}, {"྄ཐ", "ྠ"}, {"྄ད", "ྡ"}, {"྄ན", "ྣ"}, {"྄པ", "ྤ"}, {"྄ཕ", "ྥ"}, {"྄བ", "ྦ"}, {"྄མ", "ྨ"}, {"྄ཙ", "ྩ"}, {"྄ཚ", "ྪ"}, {"྄ཛ", "ྫ"}, {"྄ཝ", "ྭ"}, {"྄ཞ", "ྮ"}, {"྄ཟ", "ྯ"}, {"྄འ", "ྰ"}, {"྄ཡ", "ྱ"}, {"྄ར", "ྲ"}, {"྄ལ", "ླ"}, {"྄ཤ", "ྴ"}, {"྄ཥ", "ྵ"}, {"྄ས", "ྶ"}, {"྄ཧ", "ྷ"}, {"྄ཨ", "ྸ"}, {"྄ཪ", "ྼ"}
		}
		
		-- Forward lookup: halanta sequence -> subjoined letter.
		local convHalantas = {}
		for _, halantaSub in pairs(halantaSubs) do
			convHalantas[halantaSub[1]] = halantaSub[2]
		end
		
		-- U+F000 is inserted in front of each substitution so its position can be looked up, then removed again.
		local marker = u(0xF000)
		for halanta in gmatch(text, "྄.") do
			-- Declared local: the loop variable of the same name above is out of scope here, so a bare assignment would create a global.
			-- The parentheses keep only gsub's first return value.
			local halantaSub = marker .. (gsub(halanta, ".*", convHalantas))
			text = gsub(text, halanta, halantaSub, 1)
			-- NOTE(review): if `find` returns (start, end) like string.find, this is the three-argument form of table.insert and stores the position at its own index, leaving a sparse table; that would explain the compressSparseArray call below. Left as-is; confirm.
			table.insert(halantas, find(text, marker))
			text = gsub(text, marker, "")
		end
		halantas = require("Module:table").compressSparseArray(halantas)
	end
	
	local function err()
		return error("Invalid syllable " .. toNFC(origText) .. ".")
	end
	
	text = langModule.preconvert(text)
	
	-- The first truthy value yielded by the ruleset's checks is taken as the main stack.
	for _, check in pairs(langModule.mainStackChecks(text)) do
		if check then
			mainStack = check
			if match(origText, "྄") then
				-- Reverse lookup: subjoined letter -> halanta sequence.
				local convHalantas = {}
				for _, halantaSub in pairs(halantaSubs) do
					convHalantas[halantaSub[2]] = halantaSub[1]
				end
				local offset = find(text, mainStack)
				for i, halanta in ipairs(halantas) do
					-- Each restoration turns one subjoined letter into halanta + base letter, lengthening mainStack by one character; the "+ i" term compensates for the restorations already made.
					mainStack = gsub(mainStack, sub(mainStack, (halanta-offset)+i, (halanta-offset)+i), convHalantas, 1)
				end
			end
			return mainStack
		end
	end
	
	-- If ambiguous, return the most likely stack, along with a second value (true) so that this can be taken into account.
	for syllable, mainStack in pairs(langModule.ambiguousSyllables) do
		if match(text, "^" .. syllable .. "$") then
			return langModule.postconvert(mainStack), true
		end
	end
	return err()
end

-- Runs gmatch over `text` for maximal runs of characters in the set U+0F00, U+0F0B-U+0F12, U+0F35, U+0F37 and U+0F39-U+0FBC, and returns whatever gmatch returns.
function export.getWords(text)
	local marks = u(0xF35) .. u(0xF37) .. u(0xF39)
	local wordPattern = "[ༀ་-༒" .. marks .. "-ྼ]+"
	return gmatch(text, wordPattern)
end

-- Runs gmatch over `text` for maximal runs of characters in the set U+0F00, U+0F35, U+0F37 and U+0F39-U+0FBC, and returns whatever gmatch returns. Unlike getWords, the range U+0F0B-U+0F12 is not part of the set, so those characters end a run.
function export.getSyllables(text)
	local marks = u(0xF35) .. u(0xF37) .. u(0xF39)
	local syllablePattern = "[ༀ" .. marks .. "-ྼ]+"
	return gmatch(text, syllablePattern)
end

return export