Module:Language/scripts: Difference between revisions
< Language
*>Erutuon (copying table unnecessary) |
m (1 revision imported) |
(No difference)
|
Latest revision as of 09:11, 15 May 2018
Documentation for this module may be created at Module:Language/scripts/doc
-- Table of functions exported by this module (returned at the end of the file).
local p = {}
-- Local aliases for the library functions used below.
local gsub = mw.ustring.gsub
local length = mw.ustring.len
local floor = math.floor
-- Lua pattern for one UTF-8 encoded character: a lead byte followed by any
-- number of continuation bytes.
-- NOTE(review): not referenced anywhere in the visible code.
local UTF8Char = "[%z\1-\127\194-\244][\128-\191]*"
-- Codepoint data; codepointToScript reads its `individual` and `ranges` fields.
-- NOTE(review): the title is spelled with a lowercase "language" here, unlike
-- the require below — presumably the wiki normalizes it; confirm.
local codepoint_data = mw.loadData("Module:language/scripts/codepoints")
-- Per-script data indexed by script code; entries may carry a `characters`
-- field (used by p.print and countCharacters).
local data = require("Module:Language/scripts/data")
--[[
Returns the `characters` field of the script data for the script code given as
the first argument of the frame.

Returns "Please supply a valid script code." when the argument is missing or
has no entry in the data module, and a "No characters found" message when the
entry exists but has no `characters` field.
]]
function p.print(frame)
	local scriptCode = frame.args[1]
	local scriptData = scriptCode and data[scriptCode]
	-- Bail out before indexing: the fallback message must be returned as is,
	-- not treated as if it were a script data table.
	if not scriptData then
		return "Please supply a valid script code."
	end
	return scriptData.characters or "No characters found for " .. scriptCode .. "."
end
local script = {}
-- Based on the Script:countCharacters() function of Module:scripts on Wiktionary
--
-- Counts the characters of `text` that match the `characters` set of the
-- script `scriptCode` in the data module. Returns 0 when the script code is
-- unknown or its entry has no `characters` field.
local function countCharacters(text, scriptCode)
	local scriptData = data[scriptCode]
	local characters = scriptData and scriptData.characters
	if not characters then
		return 0
	end
	-- gsub's second return value is the number of substitutions made, i.e.
	-- the number of characters that matched the set.
	local _, count = gsub(text, "[" .. characters .. "]", "")
	return count
end
--[[
Reports whether at least a quarter of the characters in `text` belong to the
"Latn" character set of the data module.

Returns true or false for string input (numbers are converted to strings
first), and nil for any other type of argument.
]]
function p.isLatn(text)
	local textType = type(text)
	if textType == "number" then
		text = tostring(text)
	elseif textType ~= "string" then
		-- tostring() always yields a string, so the type of the argument
		-- itself has to be checked to reject nil, tables, etc.
		return nil
	end
	local count = countCharacters(text, "Latn")
	-- False when fewer than 25% of the characters in the string are Latin.
	return count >= length(text) / 4
end
-- Frame entry point: applies p.isLatn to the first argument of the frame.
function p.Latin(frame)
	return p.isLatn(frame.args[1])
end
-- Set of script codes that p.getScript disregards when deciding which script
-- a text is written in (looked up there as ignore_script[code]).
-- NOTE(review): these look like the ISO 15924 special-purpose codes
-- (inherited, symbols, undetermined, ...) — confirm against the standard.
local ignore_script = require("Module:table").listToSet{
"Zinh", "Zmth", "Zsym", "Zsye", "Zxxx", "Zyyy", "Zzzz"
}
-- Applies func(value, key, t) to every entry of t and collects the results in
-- a new array. A table with a truthy t[1] is walked in order with ipairs and
-- keeps its indices; any other table is walked with pairs and the results are
-- numbered in traversal order.
local function map(func, t)
	local results = {}
	if not t[1] then
		local n = 0
		for key, value in pairs(t) do
			n = n + 1
			results[n] = func(value, key, t)
		end
		return results
	end
	for index, value in ipairs(t) do
		results[index] = func(value, index, t)
	end
	return results
end
-- Returns a new table holding the entries of t for which
-- func(value, key, t) is truthy. A table with a truthy t[1] is treated as an
-- array and the kept values are renumbered from 1; any other table keeps its
-- original keys.
local function filter(t, func)
	local kept = {}
	if not t[1] then
		for key, value in pairs(t) do
			if func(value, key, t) then
				kept[key] = value
			end
		end
		return kept
	end
	local n = 0
	for index, value in ipairs(t) do
		if func(value, index, t) then
			n = n + 1
			kept[n] = value
		end
	end
	return kept
end
-- Comparator for table.sort: orders range arrays by their first element
-- (the start of the range).
local function sortRange(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end
--[[
Binary search over an array of range arrays such as { 0x41, 0x7A, "Latn" },
ordered by range start. The number of ranges is read from ranges.length.
Returns the whole range array containing value (so the caller can cache it),
or nil when ranges is missing, empty, or contains no matching range.
]]
local function binarySearch(ranges, value)
	if not ranges then
		return nil
	end
	local low, high = 1, ranges.length
	if high == 0 then
		return nil
	end
	while low <= high do
		-- Probe the middle of the remaining window.
		local mid = floor((low + high) / 2)
		local candidate = ranges[mid]
		local first, last = candidate[1], candidate[2]
		if value < first then
			-- Value lies below this range: continue in the lower half.
			high = mid - 1
		elseif value <= last then
			return candidate
		else
			-- Value lies above this range: continue in the upper half.
			low = mid + 1
		end
	end
	return nil
end
--[[
-- For debugging
local function toHex(number)
return ("0x%X"):format(number)
end
local function logRange(range, number)
return mw.log(toHex(range[1]), toHex(number) .. " (" .. mw.ustring.char(number) .. ")", toHex(range[2]), range[3])
end
--]]
-- Linear scan over an array of range arrays ordered by range start. Returns
-- the third element of the first range containing number, or nil as soon as
-- a range starting above number is reached (or the array is exhausted).
local function lookUpInOrder(number, ranges)
	for _, candidate in ipairs(ranges) do
		local first, last = candidate[1], candidate[2]
		if number < first then
			-- Every later range starts even higher, so stop looking.
			return nil
		elseif number <= last then
			return candidate[3]
		end
	end
end
-- Cache of range arrays already found by binary search, in case another
-- character falls in the same range. p.codepointToScript appends each newly
-- found range and re-sorts the cache with sortRange, so that lookUpInOrder
-- can scan it in order and stop early.
local rangesCache = {}
--[=[
Maps a codepoint (a number) to a script code using the data module
[[Module:Language/scripts/codepoints]], which uses the official Unicode
script codes.

Lookup order: the codepoint-to-script map of individual codepoints, then the
cache of previously matched ranges, then a binary search of the full array of
ranges (a match is added to the cache). Returns "Zzzz" when nothing matches.
Raises an error if the argument is not a number.
]=]
function p.codepointToScript(codepoint)
	local argType = type(codepoint)
	if argType ~= "number" then
		error("Argument to codepointToScript should be a number, but its type is " .. argType .. ".")
	end

	local exact = codepoint_data.individual[codepoint]
	if exact then
		return exact
	end

	local cached = lookUpInOrder(codepoint, rangesCache)
	if cached then
		return cached
	end

	local found = binarySearch(codepoint_data.ranges, codepoint)
	if not found then
		return "Zzzz"
	end
	-- Remember the range and keep the cache ordered by range start, which
	-- lookUpInOrder relies on to stop early.
	table.insert(rangesCache, found)
	table.sort(rangesCache, sortRange)
	return found[3]
end
-- Returns the script code for the codepoint that mw.ustring.codepoint yields
-- for char.
local function charToScript(char)
	local codepoint = mw.ustring.codepoint(char)
	return p.codepointToScript(codepoint)
end
-- Tallies the script codes of all codepoints in text. Returns a table mapping
-- each script code found to the number of codepoints assigned to it.
-- Raises an error if text is not a string.
function p.countScripts(text)
	if type(text) ~= "string" then
		error("countScripts requires a string")
	end
	local toScript = p.codepointToScript
	local tally = {}
	for cp in mw.ustring.gcodepoint(text) do
		local code = toScript(cp)
		if code then
			tally[code] = (tally[code] or 0) + 1
		end
	end
	return tally
end
-- Returns the single script code found in text, not counting the codes listed
-- in ignore_script. Returns nil when no such code is found and raises an
-- error when more than one is found.
function p.getScript(text)
	local found = {}
	local n = 0
	for code in pairs(p.countScripts(text)) do
		if not ignore_script[code] then
			n = n + 1
			found[n] = code
		end
	end
	if found[2] then
		error("More than one script was found for " .. text)
	end
	return found[1]
end
-- Frame entry point: for each frame argument, produces a line of the form
-- "* <argument>: <script> (<count>), ..." listing the script counts of that
-- argument, and joins the lines with newlines.
function p.showScripts(frame)
	local function formatCount(count, script)
		return script .. " (" .. count .. ")"
	end

	local function formatArg(arg)
		local tallies = map(formatCount, p.countScripts(arg))
		return "* " .. arg .. ": " .. table.concat(tallies, ", ")
	end

	local lines = map(formatArg, frame.args)
	return table.concat(lines, "\n")
end
return p