-- Module:LanguageNormalization
-- Appearance
-- Documentation for this module may be created at Module:LanguageNormalization/doc
-- Module:LanguageNormalization
-- Maps language inputs (ISO codes, native names) to canonical English names.
--
-- Features:
-- * Maps ISO 639-1/2/3 codes to canonical names
-- * Recognizes native names (e.g., "Español" → "Spanish")
-- * Displays native forms with canonical names (toggleable)
-- * Strips diacritics for flexible matching
-- * Formats multiple languages for templates
--
-- Configuration:
-- * setShowNativeForms(true/false) - Toggle native forms display
-- * getShowNativeForms() - Check current setting
--
-- Dependencies:
-- * Module:CanonicalForms - Normalization pattern
-- * Module:DiacriticNormalization - Diacritic removal
local p = {}
local CanonicalForms = require('Module:CanonicalForms')
local DiacriticNormalization = require('Module:DiacriticNormalization')
-- Configuration
-- Module-level settings table. It is read by p.formatLanguages and
-- p.getShowNativeForms, and written by p.setShowNativeForms.
local config = {
showNativeForms = true -- Default: show native forms
}
-- Cache (persists during page render)
-- One memoization table per public lookup function, each keyed by the raw
-- argument that function was called with:
-- * normalizeCache      - used by p.normalize
-- * isNativeFormCache   - used by p.isNativeForm
-- * getNativeFormCache  - used by p.getNativeForm
local normalizeCache = {}
local isNativeFormCache = {}
local getNativeFormCache = {}
-- Language mapping table
-- Format: {canonical = "English Name", synonyms = {codes, variations}, native = {native names}}
--
-- Field usage elsewhere in this module:
-- * canonical - the English name returned by p.isNativeForm and matched by
--               p.getNativeForm.
-- * synonyms  - codes and alternative spellings; prepareMapping() copies them
--               unchanged into the mapping handed to CanonicalForms.normalize.
-- * native    - native-script names. native[1] is what p.getNativeForm returns;
--               prepareMapping() also registers each native name (lowercased
--               with string.lower, plus a diacritic-stripped variant when it
--               differs) as an additional synonym.
--
-- Entries are scanned in array order with ipairs, so order matters for any
-- lookup that stops at the first hit.
local languageMapping = {
-- ========================================================================
-- Indo-European Languages
-- ========================================================================
-- Germanic Branch
{canonical = "English",
synonyms = {"en", "eng", "english"},
native = {"English"}},
{canonical = "German",
synonyms = {"de", "deu", "ger", "german"},
native = {"Deutsch"}},
{canonical = "Dutch",
synonyms = {"nl", "nld", "dut", "dutch", "flemish"},
native = {"Nederlands"}},
{canonical = "Swedish",
synonyms = {"sv", "swe", "swedish"},
native = {"Svenska"}},
{canonical = "Danish",
synonyms = {"da", "dan", "danish"},
native = {"Dansk"}},
{canonical = "Norwegian",
synonyms = {"no", "nor", "norwegian"},
native = {"Norsk"}},
{canonical = "Icelandic",
synonyms = {"is", "isl", "ice", "icelandic"},
native = {"Íslenska"}},
{canonical = "Afrikaans",
synonyms = {"af", "afr", "afrikaans"},
native = {"Afrikaans"}},
{canonical = "Luxembourgish",
synonyms = {"lb", "ltz", "luxembourgish"},
native = {"Lëtzebuergesch"}},
-- Romance Branch
{canonical = "Spanish",
synonyms = {"es", "spa", "spanish", "castilian", "castelhano"},
native = {"Español"}},
{canonical = "Portuguese",
synonyms = {"pt", "por", "portuguese"},
native = {"Português"}},
{canonical = "French",
synonyms = {"fr", "fra", "fre", "french"},
native = {"Français"}},
{canonical = "Italian",
synonyms = {"it", "ita", "italian"},
native = {"Italiano"}},
{canonical = "Romanian",
synonyms = {"ro", "ron", "rum", "romanian"},
native = {"Română"}},
{canonical = "Catalan",
synonyms = {"ca", "cat", "catalan"},
native = {"Català"}},
{canonical = "Galician",
synonyms = {"gl", "glg", "galician"},
native = {"Galego"}},
{canonical = "Occitan",
synonyms = {"oc", "oci", "occitan"},
native = {"Occitan"}},
{canonical = "Sardinian",
synonyms = {"sc", "srd", "sardinian", "sardo"},
native = {"Sardu"}},
{canonical = "Corsican",
synonyms = {"co", "cos", "corsican", "corsu"},
native = {"Corsu"}},
-- Slavic Branch
{canonical = "Russian",
synonyms = {"ru", "rus", "russian", "русский", "руский"},
native = {"Русский"}},
{canonical = "Polish",
synonyms = {"pl", "pol", "polish"},
native = {"Polski"}},
{canonical = "Ukrainian",
synonyms = {"uk", "ukr", "ukrainian", "українська"},
native = {"Українська"}},
{canonical = "Czech",
synonyms = {"cs", "ces", "cze", "czech"},
native = {"Čeština"}},
{canonical = "Slovak",
synonyms = {"sk", "slk", "slo", "slovak"},
native = {"Slovenčina"}},
{canonical = "Bulgarian",
synonyms = {"bg", "bul", "bulgarian", "български"},
native = {"Български"}},
{canonical = "Croatian",
synonyms = {"hr", "hrv", "croatian"},
native = {"Hrvatski"}},
{canonical = "Serbian",
synonyms = {"sr", "srp", "serbian", "српски"},
native = {"Српски"}},
{canonical = "Slovenian",
synonyms = {"sl", "slv", "slovenian", "slovene"},
native = {"Slovenščina"}},
{canonical = "Belarusian",
synonyms = {"be", "bel", "belarusian", "belorussian", "беларуская"},
native = {"Беларуская"}},
{canonical = "Macedonian",
synonyms = {"mk", "mkd", "mac", "macedonian", "македонски"},
native = {"Македонски"}},
{canonical = "Bosnian",
synonyms = {"bs", "bos", "bosnian"},
native = {"Bosanski"}},
-- Indo-Aryan Branch
{canonical = "Hindi",
synonyms = {"hi", "hin", "hindi", "हिन्दी", "हिंदी"},
native = {"हिन्दी"}},
{canonical = "Bengali",
synonyms = {"bn", "ben", "bengali", "bangla", "বাংলা"},
native = {"বাংলা"}},
{canonical = "Punjabi",
synonyms = {"pa", "pan", "punjabi", "ਪੰਜਾਬੀ"},
native = {"ਪੰਜਾਬੀ"}},
{canonical = "Urdu",
synonyms = {"ur", "urd", "urdu", "اردو"},
native = {"اردو"}},
{canonical = "Gujarati",
synonyms = {"gu", "guj", "gujarati", "ગુજરાતી"},
native = {"ગુજરાતી"}},
{canonical = "Marathi",
synonyms = {"mr", "mar", "marathi", "मराठी"},
native = {"मराठी"}},
{canonical = "Nepali",
synonyms = {"ne", "nep", "nepali", "नेपाली"},
native = {"नेपाली"}},
{canonical = "Sinhala",
synonyms = {"si", "sin", "sinhala", "sinhalese", "සිංහල"},
native = {"සිංහල"}},
{canonical = "Odia",
synonyms = {"or", "ori", "odia", "oriya", "ଓଡ଼ିଆ"},
native = {"ଓଡ଼ିଆ"}},
{canonical = "Assamese",
synonyms = {"as", "asm", "assamese", "অসমীয়া"},
native = {"অসমীয়া"}},
{canonical = "Maithili",
synonyms = {"mai", "maithili", "मैथिली"},
native = {"मैथिली"}},
{canonical = "Rajasthani",
synonyms = {"raj", "rajasthani", "राजस्थानी"},
native = {"राजस्थानी"}},
-- Iranian Branch
{canonical = "Persian",
synonyms = {"fa", "fas", "per", "persian", "farsi", "فارسی"},
native = {"فارسی"}},
{canonical = "Kurdish",
synonyms = {"ku", "kur", "kurdish", "كوردی"},
native = {"كوردی"}},
{canonical = "Pashto",
synonyms = {"ps", "pus", "pashto", "پښتو"},
native = {"پښتو"}},
{canonical = "Tajik",
synonyms = {"tg", "tgk", "tajik", "тоҷики"},
native = {"тоҷики"}},
-- Baltic Branch
{canonical = "Lithuanian",
synonyms = {"lt", "lit", "lithuanian"},
native = {"Lietuvių"}},
{canonical = "Latvian",
synonyms = {"lv", "lav", "latvian"},
native = {"Latviešu"}},
-- Celtic Branch
{canonical = "Irish",
synonyms = {"ga", "gle", "irish", "irish gaelic"},
native = {"Gaeilge"}},
{canonical = "Welsh",
synonyms = {"cy", "cym", "wel", "welsh"},
native = {"Cymraeg"}},
{canonical = "Scottish Gaelic",
synonyms = {"gd", "gla", "scottish gaelic", "gaelic"},
native = {"Gàidhlig"}},
-- Other Indo-European
{canonical = "Greek",
synonyms = {"el", "ell", "gre", "greek", "ελληνικά"},
native = {"Ελληνικά"}},
{canonical = "Albanian",
synonyms = {"sq", "sqi", "alb", "albanian"},
native = {"Shqip"}},
{canonical = "Armenian",
synonyms = {"hy", "hye", "arm", "armenian", "հայերեն"},
native = {"հայերեն"}},
-- ========================================================================
-- Uralic Languages
-- ========================================================================
{canonical = "Hungarian",
synonyms = {"hu", "hun", "hungarian"},
native = {"Magyar"}},
{canonical = "Finnish",
synonyms = {"fi", "fin", "finnish"},
native = {"Suomi"}},
{canonical = "Estonian",
synonyms = {"et", "est", "estonian"},
native = {"Eesti"}},
-- ========================================================================
-- Sino-Tibetan Languages
-- ========================================================================
-- Chinese Languages
{canonical = "Mandarin Chinese",
synonyms = {"zh", "zho", "cmn", "chinese", "mandarin", "中文", "汉语", "普通话"},
native = {"中文"}},
{canonical = "Cantonese",
synonyms = {"yue", "cantonese", "canton", "粵語", "广东话"},
native = {"粵語"}},
{canonical = "Wu Chinese",
synonyms = {"wuu", "wu", "wu chinese", "shanghainese", "吳語", "上海话"},
native = {"吳語"}},
{canonical = "Minnan Chinese",
synonyms = {"nan", "minnan", "hokkien", "taiwanese", "閩南語", "台湾话"},
native = {"閩南語"}},
{canonical = "Hakka Chinese",
synonyms = {"hak", "hakka", "客家話", "客家语"},
native = {"客家話"}},
-- Tibeto-Burman Languages
{canonical = "Burmese",
synonyms = {"my", "mya", "bur", "burmese", "မြန်မာဘာသာ"},
native = {"မြန်မာဘာသာ"}},
{canonical = "Tibetan",
synonyms = {"bo", "bod", "tib", "tibetan", "བོད་སྐད་"},
native = {"བོད་སྐད་"}},
-- ========================================================================
-- Japonic and Koreanic Languages
-- ========================================================================
{canonical = "Japanese",
synonyms = {"ja", "jpn", "japanese", "日本語", "にほんご", "にっぽんご"},
native = {"日本語"}},
{canonical = "Korean",
synonyms = {"ko", "kor", "korean", "한국어", "조선말"},
native = {"한국어"}},
-- ========================================================================
-- Turkic Languages
-- ========================================================================
{canonical = "Turkish",
synonyms = {"tr", "tur", "turkish"},
native = {"Türkçe"}},
{canonical = "Azerbaijani",
synonyms = {"az", "aze", "azerbaijani", "azeri", "azərbaycan dili"},
native = {"Azərbaycan dili"}},
{canonical = "Uzbek",
synonyms = {"uz", "uzb", "uzbek", "o'zbek"},
native = {"Oʻzbek"}},
{canonical = "Kazakh",
synonyms = {"kk", "kaz", "kazakh", "қазақ тілі"},
native = {"қазақ тілі"}},
{canonical = "Kyrgyz",
synonyms = {"ky", "kir", "kyrgyz", "кыргызча"},
native = {"кыргызча"}},
-- ========================================================================
-- Austronesian Languages
-- ========================================================================
{canonical = "Indonesian",
synonyms = {"id", "ind", "indonesian"},
native = {"Bahasa Indonesia"}},
-- NOTE(review): "msa" is listed here and again under "Modern Standard Arabic"
-- below. Which entry wins depends on CanonicalForms.normalize, which is not
-- visible in this file - confirm the intended mapping.
{canonical = "Malay",
synonyms = {"ms", "msa", "may", "malay"},
native = {"Bahasa Melayu"}},
{canonical = "Tagalog",
synonyms = {"fil", "filipino", "pilipino", "tagalog", "tl", "tgl"},
native = {"Tagalog"}},
{canonical = "Javanese",
synonyms = {"jv", "jav", "javanese", "basa jawa"},
native = {"Basa Jawa"}},
{canonical = "Sundanese",
synonyms = {"su", "sun", "sundanese", "basa sunda"},
native = {"Basa Sunda"}},
{canonical = "Hawaiian",
synonyms = {"haw", "hawaiian"},
native = {"ʻŌlelo Hawaiʻi"}},
{canonical = "Māori",
synonyms = {"mi", "mao", "mri", "maori", "reo māori"},
native = {"Te Reo Māori"}},
{canonical = "Samoan",
synonyms = {"sm", "smo", "samoan"},
native = {"Gagana Samoa"}},
{canonical = "Fijian",
synonyms = {"fj", "fij", "fijian"},
native = {"Vosa Vakaviti"}},
-- ========================================================================
-- Dravidian Languages
-- ========================================================================
{canonical = "Tamil",
synonyms = {"ta", "tam", "tamil", "தமிழ்"},
native = {"தமிழ்"}},
{canonical = "Telugu",
synonyms = {"te", "tel", "telugu", "తెలుగు"},
native = {"తెలుగు"}},
{canonical = "Kannada",
synonyms = {"kn", "kan", "kannada", "ಕನ್ನಡ"},
native = {"ಕನ್ನಡ"}},
{canonical = "Malayalam",
synonyms = {"ml", "mal", "malayalam", "മലയാളം"},
native = {"മലയാളം"}},
-- ========================================================================
-- Tai-Kadai Languages
-- ========================================================================
{canonical = "Thai",
synonyms = {"th", "tha", "thai", "ไทย"},
native = {"ไทย"}},
{canonical = "Lao",
synonyms = {"lo", "lao", "laotian", "ລາວ"},
native = {"ລາວ"}},
-- ========================================================================
-- Austro-Asiatic Languages
-- ========================================================================
{canonical = "Vietnamese",
synonyms = {"vi", "vie", "vietnamese"},
native = {"Tiếng Việt"}},
{canonical = "Khmer",
synonyms = {"km", "khm", "khmer", "cambodian", "ខ្មែរ"},
native = {"ខ្មែរ"}},
-- ========================================================================
-- Afro-Asiatic Languages
-- ========================================================================
-- Semitic Branch
{canonical = "Arabic",
synonyms = {"ar", "ara", "arabic", "عربى", "عربي", "عربية"},
native = {"العربية"}},
-- NOTE(review): "msa" below duplicates the synonym already listed under
-- "Malay" above - confirm which language the code should resolve to.
{canonical = "Modern Standard Arabic",
synonyms = {"msa", "modern standard arabic", "literary arabic", "standard arabic", "العربية الفصحى"},
native = {"العربية الفصحى"}},
{canonical = "Hebrew",
synonyms = {"he", "heb", "hebrew", "עברית"},
native = {"עברית"}},
{canonical = "Amharic",
synonyms = {"am", "amh", "amharic", "አማርኛ"},
native = {"አማርኛ"}},
{canonical = "Tigrinya",
synonyms = {"ti", "tir", "tigrinya", "ትግርኛ"},
native = {"ትግርኛ"}},
{canonical = "Maltese",
synonyms = {"mt", "mlt", "maltese"},
native = {"Malti"}},
-- Cushitic Branch
{canonical = "Somali",
synonyms = {"so", "som", "somali"},
native = {"Soomaali"}},
-- Chadic Branch
{canonical = "Hausa",
synonyms = {"ha", "hau", "hausa", "هَوُسَ"},
native = {"هَوُسَ"}},
-- ========================================================================
-- Niger-Congo Languages
-- ========================================================================
-- Bantu Branch
{canonical = "Swahili",
synonyms = {"sw", "swa", "swahili"},
native = {"Kiswahili"}},
{canonical = "Zulu",
synonyms = {"zu", "zul", "zulu"},
native = {"isiZulu"}},
{canonical = "Xhosa",
synonyms = {"xh", "xho", "xhosa"},
native = {"isiXhosa"}},
{canonical = "Shona",
synonyms = {"sn", "sna", "shona"},
native = {"chiShona"}},
{canonical = "Lingala",
synonyms = {"ln", "lin", "lingala"},
native = {"Lingála"}},
{canonical = "Kinyarwanda",
synonyms = {"rw", "kin", "kinyarwanda"},
native = {"Ikinyarwanda"}},
-- West African Branch
{canonical = "Yoruba",
synonyms = {"yo", "yor", "yoruba"},
native = {"Èdè Yorùbá"}},
{canonical = "Igbo",
synonyms = {"ig", "ibo", "igbo"},
native = {"Asụsụ Igbo"}},
{canonical = "Fula",
synonyms = {"ff", "ful", "fula", "fulfulde", "peul"},
native = {"Fulfulde"}},
{canonical = "Wolof",
synonyms = {"wo", "wol", "wolof"},
native = {"Wolof"}},
{canonical = "Kongo",
synonyms = {"kg", "kon", "kongo", "kikongo"},
native = {"Kikongo"}},
-- ========================================================================
-- Americas Indigenous Languages
-- ========================================================================
-- Quechuan Languages
{canonical = "Quechua",
synonyms = {"qu", "que", "quechua"},
native = {"Runa Simi"}},
-- Tupi-Guarani Languages
{canonical = "Guarani",
synonyms = {"gn", "grn", "guarani"},
native = {"Avañe'ẽ"}},
-- Aymaran Languages
{canonical = "Aymara",
synonyms = {"ay", "aym", "aymara"},
native = {"Aymar aru"}},
-- Uto-Aztecan Languages
{canonical = "Nahuatl",
synonyms = {"nah", "nahuatl", "aztec"},
native = {"Nāhuatl"}},
-- Na-Dene Languages
{canonical = "Navajo",
synonyms = {"nv", "nav", "navajo"},
native = {"Diné bizaad"}},
-- Eskimo-Aleut Languages
{canonical = "Inuktitut",
synonyms = {"iu", "iku", "inuktitut", "ᐃᓄᒃᑎᑐᑦ"},
native = {"ᐃᓄᒃᑎᑐᑦ"}},
-- ========================================================================
-- Creole Languages
-- ========================================================================
{canonical = "Haitian Creole",
synonyms = {"ht", "hat", "haitian", "haitian creole", "kreyòl"},
native = {"Kreyòl Ayisyen"}},
-- ========================================================================
-- Kartvelian Languages
-- ========================================================================
{canonical = "Georgian",
synonyms = {"ka", "kat", "geo", "georgian", "ქართული"},
native = {"ქართული"}},
-- ========================================================================
-- Mongolic Languages
-- ========================================================================
{canonical = "Mongolian",
synonyms = {"mn", "mon", "mongolian", "монгол хэл"},
native = {"Монгол хэл"}},
-- ========================================================================
-- Language Isolates
-- ========================================================================
{canonical = "Basque",
synonyms = {"eu", "eus", "baq", "basque"},
native = {"Euskara"}},
-- ========================================================================
-- Constructed Languages
-- ========================================================================
{canonical = "Esperanto",
synonyms = {"eo", "epo", "esperanto"},
native = {"Esperanto"}},
}
--------------------------------------------------------------------------------
-- Helper Functions
--------------------------------------------------------------------------------
-- Get native form for a canonical language name.
--
-- @param canonicalName  Canonical English name to look up (compared with ==
--                       against each entry's `canonical` field).
-- @return               The first native name of the matching entry, or nil
--                       when canonicalName is nil/false, no entry matches, or
--                       the matching entry has no native names.
--
-- Results are memoized in getNativeFormCache. Misses are stored as `false`
-- (assigning nil to a table key stores nothing), so a repeated miss no longer
-- rescans the whole mapping table.
function p.getNativeForm(canonicalName)
    if not canonicalName then
        return nil
    end

    -- Check cache first: a string is a remembered hit, `false` a remembered miss
    local cached = getNativeFormCache[canonicalName]
    if cached ~= nil then
        return cached or nil
    end

    for _, lang in ipairs(languageMapping) do
        if lang.canonical == canonicalName and lang.native and #lang.native > 0 then
            -- Cache and return the first native form (typically the most common one)
            local nativeName = lang.native[1]
            getNativeFormCache[canonicalName] = nativeName
            return nativeName
        end
    end

    -- Cache the negative result with a sentinel; callers still receive nil
    getNativeFormCache[canonicalName] = false
    return nil
end
-- Remove diacritics (delegated to DiacriticNormalization).
-- Thin public wrapper: passes `text` through unchanged to
-- DiacriticNormalization.removeDiacritics and returns whatever that call
-- returns. No validation or caching happens here.
function p.removeDiacritics(text)
return DiacriticNormalization.removeDiacritics(text)
end
-- Append `term` to `list` unless it is nil/false or already present.
-- `seen` is the set of terms already appended to `list`.
local function addSynonym(list, seen, term)
    if term and not seen[term] then
        seen[term] = true
        list[#list + 1] = term
    end
end

-- Prepare mapping with normal and diacritic-free versions.
--
-- Builds a fresh array of {canonical = ..., synonyms = {...}} entries from
-- languageMapping. For every language the synonym list contains, in order and
-- without duplicates:
--   1. the entry's own synonyms, unchanged;
--   2. each native name lowercased with string.lower, followed by its
--      diacritic-stripped form when that differs;
--   3. when Scribunto's mw.ustring library is present, the Unicode-aware
--      lowercase of each native name (and its diacritic-stripped form).
--      string.lower works on bytes, so without this step a name beginning
--      with a non-ASCII capital (e.g. "Íslenska") never gets a truly
--      lowercase synonym. The byte-lowered forms are kept, so every synonym
--      the mapping used to contain is still there.
--
-- @return  array of prepared entries, in languageMapping order
local function prepareMapping()
    -- Resolve the optional Unicode-aware lowercasing function once
    local unicodeLower = nil
    if type(mw) == "table" and type(mw.ustring) == "table" then
        unicodeLower = mw.ustring.lower
    end

    local prepared = {}
    for _, lang in ipairs(languageMapping) do
        local synonyms, seen = {}, {}

        -- Add synonyms
        for _, syn in ipairs(lang.synonyms or {}) do
            addSynonym(synonyms, seen, syn)
        end

        -- Add native names
        for _, native in ipairs(lang.native or {}) do
            local lowered = native:lower()
            addSynonym(synonyms, seen, lowered)

            -- Add diacritic-free version
            local stripped = DiacriticNormalization.removeDiacritics(lowered)
            if stripped ~= lowered then
                addSynonym(synonyms, seen, stripped)
            end

            -- Add the Unicode-lowercased variant when it differs
            if unicodeLower then
                local unicodeLowered = unicodeLower(native)
                if unicodeLowered ~= lowered then
                    addSynonym(synonyms, seen, unicodeLowered)
                    addSynonym(synonyms, seen,
                        DiacriticNormalization.removeDiacritics(unicodeLowered))
                end
            end
        end

        prepared[#prepared + 1] = {
            canonical = lang.canonical,
            synonyms = synonyms
        }
    end
    return prepared
end
-- Generate enhanced mapping on module load
local enhancedMapping = prepareMapping()
-- Normalize language to canonical English form.
--
-- @param inputLanguage  Language code or name to resolve.
-- @return               The value produced by CanonicalForms.normalize for
--                       the input (or for its lowercased, diacritic-stripped
--                       form); otherwise the input itself. nil, false and ""
--                       are handed back untouched and never cached.
function p.normalize(inputLanguage)
    if not inputLanguage or inputLanguage == "" then
        return inputLanguage
    end

    -- Serve repeated lookups from the cache
    local remembered = normalizeCache[inputLanguage]
    if remembered then
        return remembered
    end

    -- First attempt: the input exactly as given
    local resolved = CanonicalForms.normalize(inputLanguage, enhancedMapping)

    -- Second attempt: lowercased input with diacritics removed, tried only
    -- when stripping actually changed the text
    if not resolved then
        local lowered = inputLanguage:lower()
        local bare = DiacriticNormalization.removeDiacritics(lowered)
        if bare and bare ~= lowered then
            resolved = CanonicalForms.normalize(bare, enhancedMapping)
        end
    end

    -- No match at all: fall back to the original input
    resolved = resolved or inputLanguage

    normalizeCache[inputLanguage] = resolved
    return resolved
end
-- Check if input is a native form and get canonical form.
--
-- @param input  Text to test. It is compared, after string.lower on both
--               sides, against every `native` name in languageMapping.
-- @return       The canonical English name of the first entry with a matching
--               native name, or nil when input is nil/false or nothing matches.
--
-- Results are memoized in isNativeFormCache. Misses are stored as `false`
-- (assigning nil to a table key stores nothing), so a repeated miss no longer
-- rescans the whole mapping table.
-- NOTE(review): string.lower is byte-wise, so non-ASCII capitals are not
-- folded here; e.g. a lowercase Cyrillic input will not equal a capitalised
-- native name - confirm whether that is intended.
function p.isNativeForm(input)
    if not input then
        return nil
    end

    -- Check cache first: a string is a remembered hit, `false` a remembered miss
    local cached = isNativeFormCache[input]
    if cached ~= nil then
        return cached or nil
    end

    -- Convert to lowercase
    local lowerInput = input:lower()
    for _, lang in ipairs(languageMapping) do
        if lang.native then
            for _, native in ipairs(lang.native) do
                if lowerInput == native:lower() then
                    isNativeFormCache[input] = lang.canonical
                    return lang.canonical
                end
            end
        end
    end

    -- Cache the negative result with a sentinel; callers still receive nil
    isNativeFormCache[input] = false
    return nil
end
-- Format multiple languages with normalization.
--
-- @param inputLanguages  Semicolon-separated list of language codes/names.
-- @return                "" for nil/empty input or when no non-blank item is
--                        found; the single formatted language when there is
--                        exactly one; otherwise a <ul> with one <li> per
--                        language.
--
-- Each item is trimmed, resolved to a canonical name, and - when native forms
-- are enabled and the language is not English - followed by its native form
-- in smaller text on a second line.
function p.formatLanguages(inputLanguages)
    if not inputLanguages or inputLanguages == "" then
        return ""
    end

    local rendered = {}
    -- Walk the semicolon-separated items
    for piece in inputLanguages:gmatch("[^;]+") do
        local name = piece:match("^%s*(.-)%s*$")
        if name and name ~= "" then
            -- A recognised native name is displayed exactly as the user typed
            -- it; anything else is normalized and gets the table's native form.
            local canonical = p.isNativeForm(name)
            local nativeLabel
            if canonical then
                nativeLabel = name
            else
                canonical = p.normalize(name)
                nativeLabel = p.getNativeForm(canonical)
            end

            -- Exception: English never shows native form
            local withNative = nativeLabel
                and config.showNativeForms
                and canonical ~= "English"

            if withNative then
                rendered[#rendered + 1] = string.format("%s<br/><span style=\"display:inline-block; width:0.1em; visibility:hidden;\">*</span><span style=\"font-size:75%%;\">%s</span>", canonical, nativeLabel)
            else
                rendered[#rendered + 1] = canonical
            end
        end
    end

    -- Output shape depends on how many languages were collected
    local count = #rendered
    if count == 0 then
        return ""
    end
    if count == 1 then
        return rendered[1]
    end

    local items = {}
    for index = 1, count do
        items[index] = string.format("<li>%s</li>", rendered[index])
    end
    return string.format("<ul class=\"template-list template-list-language\" style=\"margin:0; padding-left:1em;\">%s</ul>", table.concat(items, ""))
end
-- Toggle native forms display.
--
-- @param value  Only the boolean `true` enables native forms; every other
--               value (false, nil, strings, numbers, ...) disables them.
function p.setShowNativeForms(value)
    if value == true then
        config.showNativeForms = true
    else
        config.showNativeForms = false
    end
end
-- Get native forms display setting.
-- @return  the current value of config.showNativeForms (initialised to true
--          and afterwards only written by p.setShowNativeForms)
function p.getShowNativeForms()
return config.showNativeForms
end
return p