Jump to content

Module:data consistency check

ពីWiktionary

Documentation for this module may be created at Module:data consistency check/doc

local export = {}

-- Discrepancy messages collected so far, keyed by data module name (without
-- the "Module:" prefix). Each value is a list of formatted message strings.
local messages = {}

-- Records one discrepancy against the data module `modname`.
--
-- The arguments after `modname` are a string.format pattern followed by the
-- values to substitute into it. Values that are neither strings nor numbers
-- (most importantly nil, for a field that is missing from a data entry) are
-- converted with tostring first: under Lua 5.1 the %s specifier rejects such
-- values, which would abort the whole consistency report instead of logging
-- the problem.
local function discrepancy(modname, ...)
	-- select("#", ...) counts trailing nils, which #{...} may not.
	local nargs = select("#", ...)
	local args = { ... }

	-- args[1] is the format string itself; only sanitise the values after it.
	for i = 2, nargs do
		local kind = type(args[i])
		if kind ~= "string" and kind ~= "number" then
			args[i] = tostring(args[i])
		end
	end

	local list = messages[modname]
	if not list then
		list = {}
		messages[modname] = list
	end

	-- `unpack` is the Lua 5.1 name, `table.unpack` the 5.2+ name.
	list[#list + 1] = string.format((unpack or table.unpack)(args, 1, nargs))
end

-- Maps each language, etymology-language and family code seen so far to the
-- name of the data module that first defined it; used to report codes that
-- are defined more than once.
local all_codes = {}

-- Each of these maps a canonical name (the first entry of `names`) to the
-- code that first used it.
local language_names = {}
local family_names = {}
local script_names = {}

-- Sets (key -> true) of family codes and script codes that are referenced by
-- at least one checked entry. The language checks fill them in; the family
-- and script checks read them to find codes that nothing uses.
local nonempty_fams = {}
local nonempty_scrs = {}

-- Validates every entry of the regular language data modules
-- (languages/data2, languages/data3/a .. languages/data3/z, languages/datax).
--
-- For each entry it checks that the code has the shape expected for its
-- module, that code and canonical name are unique, that the type is one of
-- the known values, that at least one valid script is listed and that the
-- family code exists. Problems are recorded through discrepancy(). As a side
-- effect it fills all_codes, language_names, nonempty_fams and nonempty_scrs.
local function check_languages()
	local m_family_data = mw.loadData('Module:families/data')
	local m_script_data = mw.loadData('Module:scripts/data')
	
	-- Returns a wikilink to the category of the language called `name`, or
	-- "???" when no name is available.
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguage$") then
			return "[[:Category:" .. name .. "|" .. name .. "]]"
		else
			return "[[:Category:" .. name .. " language|" .. name .. " language]]"
		end
	end
	
	-- Runs the checks shared by all language data modules on one entry.
	local function check_language(modname, code, data)
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end
		
		local canonical = data.names and data.names[1]
		
		if not canonical then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical] then
			discrepancy(modname, "%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.", link(canonical), code, language_names[canonical])
		else
			language_names[canonical] = code
		end
		
		local lang_type = data.type
		
		if (lang_type ~= "regular") and (lang_type ~= "reconstructed") and (lang_type ~= "appendix-constructed") then
			-- tostring: the type may be missing altogether, and Lua 5.1's
			-- %s does not accept nil.
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(canonical), code, tostring(lang_type))
		end
		
		if not (data.scripts and data.scripts[1]) then
			discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(canonical), code)
		else
			for _, sccode in ipairs(data.scripts) do
				if not m_script_data[sccode] then
					discrepancy(modname, "%s (<code>%s</code>) lists an invalid script code <code>%s</code>.", link(canonical), code, tostring(sccode))
				end
				
				-- Recorded even when invalid, as before.
				nonempty_scrs[sccode] = true
			end
		end
		
		local family = data.family
		
		if family == nil or not m_family_data[family] then
			discrepancy(modname, "%s (<code>%s</code>) has an invalid family code <code>%s</code>.", link(canonical), code, tostring(family))
		end
		
		-- A nil key cannot be stored in a table; writing it would raise
		-- "table index is nil" and abort the whole report.
		if family ~= nil then
			nonempty_fams[family] = true
		end
	end
	
	-- Check two-letter codes
	do
		local modname = "languages/data2"
		
		for code, data in pairs(mw.loadData("Module:" .. modname)) do
			if not code:find("^[a-z][a-z]$") then
				discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data.names and data.names[1]), code)
			end
			
			check_language(modname, code, data)
		end
	end
	
	-- Check three-letter codes, one data module per initial letter
	for i = string.byte('a'), string.byte('z') do
		local letter = string.char(i)
		local modname = "languages/data3/" .. letter
		local expected = "^" .. letter .. "[a-z][a-z]$"
		
		for code, data in pairs(mw.loadData("Module:" .. modname)) do
			if not code:find(expected) then
				discrepancy(modname, '%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".', link(data.names and data.names[1]), code, letter)
			end
			
			check_language(modname, code, data)
		end
	end
	
	-- Check exceptional codes: anything that is NOT a plain two- or
	-- three-letter code belongs here
	do
		local modname = "languages/datax"
		
		for code, data in pairs(mw.loadData("Module:" .. modname)) do
			if code:find("^[a-z][a-z][a-z]?$") then
				discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data.names and data.names[1]), code)
			end
			
			check_language(modname, code, data)
		end
	end
end

-- Validates the etymology-only language data: code uniqueness, presence of a
-- canonical name, validity of the parent code, and absence of cycles in the
-- parent chain. Problems are recorded through discrepancy(); all_codes,
-- language_names and nonempty_fams are updated along the way.
local function check_etym_languages()
	local modname = "etymology language/data"
	-- Loaded with require, not mw.loadData: the cycle check below uses the
	-- entry tables themselves as table keys.
	local etym_data = require("Module:" .. modname)
	local lang_data = mw.loadData("Module:languages/alldata")
	local fam_data = mw.loadData('Module:families/data')
	
	-- Display label for an etymology language; "???" when the name is unknown.
	local function link(name)
		if not name then
			return "???"
		end
		if name:find("[Ll]anguage$") then
			return name
		end
		return name .. " language"
	end
	
	-- Pass 1: per-entry checks.
	for code, data in pairs(etym_data) do
		local owner = all_codes[code]
		if owner then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, owner)
		else
			all_codes[code] = modname
		end
		
		local canonical = data.names and data.names[1]
		if not canonical then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not language_names[canonical] then
			language_names[canonical] = code
		end
		-- A canonical name that is already taken is not reported here; the
		-- corresponding call is disabled:
		--discrepancy(modname, "%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.", link(data.names[1]), code, language_names[data.names[1]])
		
		local parent = data.parent
		if not parent then
			discrepancy(modname, "Etymology-only %s (<code>%s</code>) has no parent language or family code.", link(canonical), code)
		else
			-- The parent may be a regular language, a family, or another
			-- etymology-only language.
			local known = lang_data[parent] or fam_data[parent] or etym_data[parent]
			if not known then
				discrepancy(modname, "Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.", link(canonical), code, parent)
			end
			
			nonempty_fams[parent] = true
		end
	end

	-- Pass 2: follow each entry's parent chain within the etymology data and
	-- report a cycle when an entry is reached twice during the same walk.
	local checked = {}
	for start_code, start_data in pairs(etym_data) do
		local stack = {}
		local code, data = start_code, start_data

		while data do
			if checked[data] then
				-- Already covered by an earlier walk.
				break
			end
			if stack[data] then
				local parent_entry = etym_data[data.parent]
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.names and data.names[1]), code,
					link(parent_entry.names and parent_entry.names[1]), data.parent
				)
				break
			end
			stack[data] = true

			local parent = data.parent
			code, data = parent, parent and etym_data[parent]
		end
		
		for seen in pairs(stack) do
			checked[seen] = true
		end
	end
end

-- Validates the family data: code uniqueness, presence and uniqueness of the
-- canonical name, validity of the parent family, that every family has at
-- least one child (according to nonempty_fams, which the language checks
-- must have filled in beforehand), and absence of cycles in the parent chain.
-- Problems are recorded through discrepancy().
local function check_families()
	local modname = "families/data"
	local m_family_data = mw.loadData("Module:" .. modname)

	-- Returns a wikilink to the category of the family called `name`, or
	-- "???" when no name is available.
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguages$") then
			return "[[:Category:" .. name .. "|" .. name .. " family]]"
		else
			return "[[:Category:" .. name .. " languages|" .. name .. " family]]"
		end
	end
	
	for code, data in pairs(m_family_data) do
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end
		
		local canonical = data.names and data.names[1]
		
		if not canonical then
			discrepancy(modname, "<code>%s</code> has no canonical name specified.", code)
		elseif family_names[canonical] then
			discrepancy(modname, "%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.", link(canonical), code, family_names[canonical])
		else
			family_names[canonical] = code
		end
		
		local parent = data.family
		
		if not parent then
			-- Nothing to record in nonempty_fams: using a nil parent as a
			-- table key would raise "table index is nil" and abort the
			-- whole report right after this discrepancy was noted.
			discrepancy(modname, "%s (<code>%s</code>) has no parent family specified.", link(canonical), code)
		else
			if not m_family_data[parent] then
				discrepancy(modname, "%s (<code>%s</code>) has an invalid parent family code <code>%s</code>.", link(canonical), code, tostring(parent))
			end
			
			-- Recorded even when the parent code is invalid, as before.
			nonempty_fams[parent] = true
		end
	end
	
	for code, data in pairs(m_family_data) do
		if not nonempty_fams[code] then
			discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data.names and data.names[1]), code)
		end
	end

	-- Follow each family's parent chain and report a cycle when a code is
	-- reached twice during the same walk. 'qfa-not' is pre-marked as checked,
	-- so walks stop there (presumably a root pseudo-family that is its own
	-- parent -- TODO confirm against Module:families/data).
	local checked = { ['qfa-not'] = true }
	for start_code, start_data in pairs(m_family_data) do
		local stack = {}
		local code, data = start_code, start_data

		while data do
			if checked[code] then
				break
			end
			if stack[code] then
				local parent_entry = m_family_data[data.family]
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.names and data.names[1]), code,
					link(parent_entry.names and parent_entry.names[1]), data.family
				)
				break
			end
			stack[code] = true

			local parent = data.family
			code, data = parent, parent and m_family_data[parent]
		end
		
		for seen in pairs(stack) do
			checked[seen] = true
		end
	end
end

-- Validates the script data: presence of a canonical name, whether any
-- language uses the script (according to nonempty_scrs, which the language
-- checks must have filled in beforehand), and whether the `characters`
-- field, when present, is accepted by mw.ustring.find as a pattern.
-- Problems are recorded through discrepancy().
local function check_scripts()
	local modname = "scripts/data"
	local script_data = mw.loadData("Module:" .. modname)
	
	-- Returns a wikilink to the category of the script called `name`, or
	-- "???" when no name is available.
	local function link(name)
		if not name then
			return "???"
		end
		
		local title = name
		if not name:find("[Ss]cript$") then
			title = name .. " script"
		end
		return "[[:Category:" .. title .. "|" .. title .. "]]"
	end
	
	for code, data in pairs(script_data) do
		local canonical = data.names and data.names[1]
		
		if not canonical then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not script_names[canonical] then
			script_names[canonical] = code
		end
		-- A canonical name that is already taken is not reported here; the
		-- corresponding call is disabled:
		--discrepancy(modname, "%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.", link(data.names[1]), code, script_names[data.names[1]])
		
		local characters = data.characters
		
		if not nonempty_scrs[code] then
			local suffix
			if characters then
				suffix = ""
			else
				suffix = " and has no characters listed for auto-detection"
			end
			discrepancy(modname, "%s (<code>%s</code>) is not used for any language%s.", link(canonical), code, suffix)
		end

		if characters then
			-- pcall traps the error that mw.ustring.find raises for a
			-- malformed pattern.
			local ok = pcall(mw.ustring.find, "", characters)
			if not ok then
				discrepancy(modname, "%s (<code>%s</code>) specifies an invalid pattern for character detection: <code>%s</code>", link(canonical), code, characters)
			end
		end
	end
end

-- Entry point: runs all consistency checks and returns the report as
-- wikitext.
--
-- @param frame  the Scribunto frame object; not used.
-- @return a success banner when no discrepancies were recorded, otherwise a
--         warning banner followed by one section per data module (sorted by
--         module name), each listing that module's discrepancies as bullets.
function export.perform(frame)
	check_languages()
	check_etym_languages()

	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_fams and nonempty_scrs tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()
	
	-- Sort the module names so that the report order is stable.
	local modnames = {}
	
	for modname in pairs(messages) do
		modnames[#modnames + 1] = modname
	end
	
	table.sort(modnames)
	
	-- Are there any messages?
	if #modnames == 0 then
		return '<b class="success">Glory to Arstotzka.</b>'
	end
	
	-- Build the report in a separate buffer. The per-module lists in
	-- `messages` are left intact: replacing them with strings would make any
	-- later discrepancy() call for the same module fail in its list append.
	local parts = { '<b class="warning">Discrepancies detected:</b>' }
	
	for _, modname in ipairs(modnames) do
		parts[#parts + 1] = '\n===[[Module:' .. modname .. ']]===\n*' .. table.concat(messages[modname], '\n* ') .. '\n'
	end
	
	return table.concat(parts)
end

return export