Jump to content

Module:CountryData

From ICANNWiki

Documentation for this module may be created at Module:CountryData/doc

-- Module:CountryData
-- Unified module for country data management, providing a single source of truth
-- for country names, codes, regions, and variations. Also includes country display
-- functionality (formerly in MultiCountryDisplay.lua).
--
-- Features:
--   * Loads country data from JSON stored in MediaWiki
--   * Normalizes country names to canonical forms
--   * Maps countries to ICANN regions
--   * Provides extensible property access
--   * Integrates with Semantic MediaWiki
--   * Formats country lists with region-specific styling
--   * Processes countries for category assignment
--
-- Dependencies:
--   * Module:NormalizationDiacritic - For diacritic removal

local DiacriticNormalization = require('Module:NormalizationDiacritic')

-- Module-level cache tables for improved performance.
-- All five are reset together by resetCaches().
-- dataCache: the data table returned by loadData(), once loaded.
local dataCache = nil
-- nameLookupCache: normalized country name/variation -> country code,
-- built lazily by getNameLookup().
local nameLookupCache = nil
-- regionLookupCache: normalized region name/variation -> region code,
-- built lazily by getRegionLookup().
local regionLookupCache = nil
-- propertyCache: memoized results of the country property getters.
local propertyCache = {}
-- functionCache: memoized results of the remaining public lookup functions.
local functionCache = {}

-- Default data structure to use if JSON loading fails.
-- Both collections are empty, so every lookup against it is a miss.
-- last_updated is evaluated once, when this chunk runs; the leading '!' in the
-- os.date format requests UTC.
local DEFAULT_DATA = {
    schema_version = 1,
    last_updated = os.date('!%Y-%m-%dT%H:%M:%SZ'),
    countries = {},
    icann_regions = {}
}

--------------------------------------------------------------------------------
-- Helper Functions
--------------------------------------------------------------------------------

-- Normalize free-form text into a lookup key.
--
-- Steps, applied in order:
--   1. Trim leading/trailing whitespace
--   2. Lowercase with string.lower (byte-based; multi-byte characters are
--      left as they are)
--   3. Remove apostrophes and backticks
--   4. Remove periods
--   5. Replace en dashes, em dashes, hyphens, underscores and slashes with spaces
--   6. Collapse whitespace runs into a single space
--
-- Returns the normalized string. nil and "" are returned unchanged.
local function normalizeText(text)
    if not text or text == "" then
        return text
    end

    -- Lua patterns work on bytes, so a multi-byte character must never be put
    -- inside a [...] class: every one of its bytes would become a separate
    -- class member and unrelated UTF-8 characters sharing a byte (for example
    -- "À" = C3 80) would be torn apart. The en dash (E2 80 93) and em dash
    -- (E2 80 94) are therefore replaced as whole byte sequences.
    local normalized = text:gsub("^%s*(.-)%s*$", "%1")   -- Trim whitespace
                          :lower()                        -- Convert to lowercase
                          :gsub("['`]", "")               -- Remove apostrophes and backticks
                          :gsub("%.", "")                 -- Remove periods
                          :gsub("\226\128\147", " ")      -- En dash -> space
                          :gsub("\226\128\148", " ")      -- Em dash -> space
                          :gsub("[-_/]", " ")             -- Hyphens, underscores, slashes -> space
                          :gsub("%s+", " ")               -- Collapse multiple spaces
    -- Assigning to a single local drops gsub's second result (the match count).
    return normalized
end

-- Create a cache key from a function name and arguments.
--
-- The key is the function name followed by tostring() of every argument,
-- joined with ":". The argument count comes from select("#", ...) rather than
-- the length of a packed table, so nil arguments (including trailing ones)
-- always appear in the key as "nil" instead of being dropped unpredictably.
local function createCacheKey(funcName, ...)
    local argCount = select("#", ...)
    local keyParts = {funcName}
    for i = 1, argCount do
        -- Parentheses keep only the i-th argument out of select's result list
        keyParts[i + 1] = tostring((select(i, ...)))
    end
    return table.concat(keyParts, ":")
end

-- Safely test whether `tbl` carries a non-nil value under `property`.
-- A falsy `tbl` (nil or false) is handed back unchanged, any other non-table
-- value yields false, and a table yields true/false depending on the field.
local function hasProperty(tbl, property)
    if not tbl then
        return tbl
    end
    if type(tbl) ~= "table" then
        return false
    end
    return tbl[property] ~= nil
end

--------------------------------------------------------------------------------
-- Data Loading Layer
--------------------------------------------------------------------------------

-- Get name lookup cache - builds if not already cached.
--
-- The lookup maps normalizeText(label) -> country code, where the labels are
-- each country's `name` (falling back to `canonical_name`), its
-- `canonical_name` when that differs from `name`, and every entry of its
-- `variations` list.
--
-- `data` is the table returned by loadData(). When it is missing or has no
-- `countries` field an empty lookup is cached and returned.
-- Labels that are not non-empty strings are skipped rather than passed to
-- normalizeText, which can only handle strings.
local function getNameLookup(data)
    if nameLookupCache then
        return nameLookupCache
    end

    local lookup = {}

    -- Register one label for a country code, ignoring unusable labels
    local function register(label, code)
        if type(label) == "string" and label ~= "" then
            lookup[normalizeText(label)] = code
        end
    end

    if data and data.countries then
        for code, country in pairs(data.countries) do
            if type(country) == "table" then
                -- Primary display name
                register(country.name or country.canonical_name, code)

                -- Canonical name, when it is a separate label
                if country.canonical_name ~= country.name then
                    register(country.canonical_name, code)
                end

                -- Alternative spellings
                if type(country.variations) == "table" then
                    for _, variation in ipairs(country.variations) do
                        register(variation, code)
                    end
                end
            end
        end
    end

    nameLookupCache = lookup
    return lookup
end

-- Get region lookup cache - builds if not already cached.
--
-- The lookup maps normalizeText(label) -> region code, where the labels are
-- each region's `name` and every entry of its `variations` list.
--
-- `data` is the table returned by loadData(). When it is missing or has no
-- `icann_regions` field an empty lookup is cached and returned.
-- Labels that are not non-empty strings are skipped rather than passed to
-- normalizeText, which can only handle strings.
local function getRegionLookup(data)
    if regionLookupCache then
        return regionLookupCache
    end

    local lookup = {}

    -- Register one label for a region code, ignoring unusable labels
    local function register(label, code)
        if type(label) == "string" and label ~= "" then
            lookup[normalizeText(label)] = code
        end
    end

    if data and data.icann_regions then
        for code, region in pairs(data.icann_regions) do
            if type(region) == "table" then
                -- Canonical region name
                register(region.name, code)

                -- Alternative spellings
                if type(region.variations) == "table" then
                    for _, variation in ipairs(region.variations) do
                        register(variation, code)
                    end
                end
            end
        end
    end

    regionLookupCache = lookup
    return lookup
end

-- Decode JSON text into a table.
-- Tries a plain mw.text.jsonDecode first and, if that fails, retries with the
-- JSON_TRY_FIXING flag. Returns nil when the text is unusable or the decoded
-- value is not a table.
local function decodeCountryJson(jsonText)
    if type(jsonText) ~= "string" or jsonText == "" then
        return nil
    end
    if not (mw.text and mw.text.jsonDecode) then
        return nil
    end

    local ok, decoded = pcall(mw.text.jsonDecode, jsonText)
    if ok and type(decoded) == "table" then
        return decoded
    end

    ok, decoded = pcall(mw.text.jsonDecode, jsonText, mw.text.JSON_TRY_FIXING)
    if ok and type(decoded) == "table" then
        return decoded
    end

    return nil
end

-- Fetch the country data from MediaWiki:CountryData.json.
-- Each source is tried in turn and the first one that yields a table wins:
--   1. frame:preprocess transclusion (only when a frame is supplied)
--   2. mw.loadJsonData (when the installed Scribunto provides it)
--   3. the raw page content via mw.title
-- Returns nil when no source produced a table.
local function fetchCountryData(frame)
    -- 1. Transclude through the frame
    if type(frame) == "table" and frame.preprocess then
        local ok, expanded = pcall(frame.preprocess, frame, '{{MediaWiki:CountryData.json}}')
        if ok then
            local decoded = decodeCountryJson(expanded)
            if decoded then
                return decoded
            end
        end
        -- An undecodable expansion must not stop the remaining sources from
        -- being tried, so fall through.
    end

    -- 2. Native JSON loader
    if mw.loadJsonData then
        local ok, jsonData = pcall(mw.loadJsonData, 'MediaWiki:CountryData.json')
        if ok and type(jsonData) == "table" then
            return jsonData
        end
    end

    -- 3. Raw page content
    local pageTitle = mw.title.new('MediaWiki:CountryData.json')
    if pageTitle and pageTitle.exists then
        local ok, content = pcall(pageTitle.getContent, pageTitle)
        if ok and type(content) == "string" and content ~= "" then
            -- Remove leading whitespace and a UTF-8 BOM (EF BB BF) that would
            -- otherwise upset the decoder. Parentheses drop gsub's count.
            content = (content:gsub("^%s+", ""))
            if content:byte(1) == 239 and content:byte(2) == 187 and content:byte(3) == 191 then
                content = content:sub(4)
            end
            return decodeCountryJson(content)
        end
    end

    return nil
end

-- Main data loading function with multiple fallback methods.
--
-- `frame` is optional; when given it enables the frame:preprocess source.
-- The result is cached at module level, so only the first call does any work.
-- The returned table always has table-valued `countries` and `icann_regions`
-- fields; DEFAULT_DATA is returned when nothing could be loaded.
local function loadData(frame)
    -- Use the module-level cache if we already loaded data once
    if dataCache then
        return dataCache
    end

    -- The outer pcall also covers errors raised by mw.title itself
    local success, data = pcall(fetchCountryData, frame)
    if not success or type(data) ~= "table" then
        data = DEFAULT_DATA
    end

    -- Ensure minimum data structure. The loaded table is never written to:
    -- the Scribunto manual documents mw.loadJsonData results as read-only, so
    -- missing pieces are filled in on a shallow copy instead.
    if type(data.countries) ~= "table" or type(data.icann_regions) ~= "table" then
        local patched = {}
        for key, value in pairs(data) do
            patched[key] = value
        end
        if type(patched.countries) ~= "table" then
            patched.countries = {}
        end
        if type(patched.icann_regions) ~= "table" then
            patched.icann_regions = {}
        end
        data = patched
    end

    dataCache = data
    return data
end

-- Drop everything the module has memoized (useful for testing).
-- The keyed caches get fresh empty tables; the lazily built ones are set back
-- to nil so they are rebuilt on next use.
local function resetCaches()
    propertyCache, functionCache = {}, {}
    dataCache, nameLookupCache, regionLookupCache = nil, nil, nil
end

--------------------------------------------------------------------------------
-- Core API Functions
--------------------------------------------------------------------------------

local CountryData = {}

-- Load data and initialize caches.
-- Public wrapper around the module-local loadData(): `frame` is passed through
-- unchanged and the data table it returns is handed back to the caller.
function CountryData.loadData(frame)
    return loadData(frame)
end

-- Reset all caches (primarily for testing).
-- Public wrapper around the module-local resetCaches(). Always returns true.
function CountryData.resetCaches()
    resetCaches()
    return true
end

-- Get country data by ISO code.
--
-- `code` is matched case-insensitively: it is upper-cased before both the
-- cache lookup and the data lookup, so "us" and "US" share one cache entry.
-- Returns the country's data table, or nil when the code is unknown, empty or
-- not a string.
function CountryData.getCountryByCode(code)
    if type(code) ~= "string" or code == "" then
        return nil
    end

    -- Standardize code to uppercase for consistency
    code = code:upper()

    -- Check function cache first. A stored `false` marks a known miss: a nil
    -- cannot be stored in a Lua table, so misses need their own marker.
    local cacheKey = createCacheKey("getCountryByCode", code)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached or nil
    end

    local data = loadData()

    local result = nil
    if data and data.countries and data.countries[code] then
        result = data.countries[code]
    end

    functionCache[cacheKey] = result or false
    return result
end

-- Get country data by name (including variations).
--
-- Name resolution is delegated to CountryData.getCountryCodeByName, which
-- normalizes the name and retries with diacritics removed; the resulting code
-- is then looked up in the loaded data.
-- Returns the country's data table, or nil when the name is unknown, empty or
-- not a string.
function CountryData.getCountryByName(name)
    if type(name) ~= "string" or name == "" then
        return nil
    end

    -- Check function cache first. A stored `false` marks a known miss: a nil
    -- cannot be stored in a Lua table, so misses need their own marker.
    local cacheKey = createCacheKey("getCountryByName", name)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached or nil
    end

    local result = nil
    local code = CountryData.getCountryCodeByName(name)
    if code ~= nil then
        local data = loadData()
        result = data.countries[code] or nil
    end

    functionCache[cacheKey] = result or false
    return result
end

-- Get country code by name.
--
-- The name is normalized with normalizeText and looked up in the name lookup;
-- if that misses, the lookup is retried with diacritics removed.
-- Returns the country code, or nil when the name is unknown, empty or not a
-- string.
function CountryData.getCountryCodeByName(name)
    if type(name) ~= "string" or name == "" then
        return nil
    end

    -- Check function cache first. A stored `false` marks a known miss: a nil
    -- cannot be stored in a Lua table, so misses need their own marker.
    local cacheKey = createCacheKey("getCountryCodeByName", name)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached or nil
    end

    local data = loadData()
    local nameLookup = getNameLookup(data)

    -- Normalize the input and look up the code
    local normalized = normalizeText(name)
    local code = nameLookup[normalized]

    if not code then
        -- Try with diacritics removed
        local stripped = DiacriticNormalization.removeDiacritics(normalized)
        if stripped ~= normalized then
            code = nameLookup[stripped]
        end
    end

    functionCache[cacheKey] = code or false
    return code
end

-- Map a country name onto its display form.
-- A nil or empty `name` is handed back as-is. A recognized country yields its
-- `name` field (or `canonical_name` when `name` is absent); anything else
-- yields the literal "(Unrecognized)". Results are memoized in functionCache.
function CountryData.normalizeCountryName(name)
    if not name or name == "" then
        return name
    end

    local key = createCacheKey("normalizeCountryName", name)
    local remembered = functionCache[key]
    if remembered ~= nil then
        return remembered
    end

    local resolved
    local entry = CountryData.getCountryByName(name)
    if not entry then
        -- No match for this name
        resolved = "(Unrecognized)"
    else
        resolved = entry.name or entry.canonical_name
    end

    functionCache[key] = resolved
    return resolved
end

-- Look up the ICANN region recorded for a country name.
-- Returns nil for a nil or empty `name`, the country's `icann_region` field
-- when the country is recognized and has one, and "(Unrecognized)" otherwise
-- (the same marker normalizeCountryName uses). Results are memoized in
-- functionCache.
function CountryData.getRegionByCountry(name)
    if not name or name == "" then
        return nil
    end

    local key = createCacheKey("getRegionByCountry", name)
    local remembered = functionCache[key]
    if remembered ~= nil then
        return remembered
    end

    local entry = CountryData.getCountryByName(name)
    local region = entry and entry.icann_region
    if not region then
        region = "(Unrecognized)"
    end

    functionCache[key] = region
    return region
end

-- Get all countries in a specific region.
--
-- `region` is normalized and resolved to a region code through the region
-- lookup (region names and their variations).
-- Returns an array of { code = ..., name = ... } tables, one per country whose
-- `icann_region` equals the resolved code. The array is empty when the region
-- is nil, empty or unknown. Order follows pairs() and is therefore
-- unspecified. The returned table is the cached instance; callers should not
-- modify it.
function CountryData.getCountriesByRegion(region)
    if not region or region == "" then
        return {}
    end

    -- Check function cache first
    local cacheKey = createCacheKey("getCountriesByRegion", region)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local regionLookup = getRegionLookup(data)

    -- Normalize the input and look up the region code
    local regionCode = regionLookup[normalizeText(region)]

    local result = {}
    if regionCode and data.countries then
        for code, country in pairs(data.countries) do
            if country.icann_region == regionCode then
                result[#result + 1] = {
                    code = code,
                    name = country.name or country.canonical_name
                }
            end
        end
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Get list of all country codes.
--
-- Returns an array holding every key of the loaded `countries` table. Order
-- follows pairs() and is therefore unspecified. The returned table is the
-- cached instance; callers should not modify it.
function CountryData.getAllCountryCodes()
    -- Check function cache first
    local cacheKey = "getAllCountryCodes"
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local result = {}

    if data and data.countries then
        for code in pairs(data.countries) do
            result[#result + 1] = code
        end
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Get list of all country display names.
--
-- Returns an array with each country's `name` (or `canonical_name` when
-- `name` is absent). Countries with neither field are skipped so the result
-- is always a gap-free sequence. Order follows pairs() and is therefore
-- unspecified. The returned table is the cached instance; callers should not
-- modify it.
function CountryData.getAllCountryNames()
    -- Check function cache first
    local cacheKey = "getAllCountryNames"
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local result = {}

    if data and data.countries then
        for _, country in pairs(data.countries) do
            local name = country.name or country.canonical_name
            -- Storing nil would leave a hole that breaks # and ipairs
            if name ~= nil then
                result[#result + 1] = name
            end
        end
    end

    -- Cache the result
    functionCache[cacheKey] = result
    return result
end

-- Read a single field from a country's record, addressed by country code.
-- Returns nil when either argument is nil or empty, when the code is unknown,
-- or when the record has no such field. Non-nil values are memoized in
-- propertyCache; a nil result leaves no cache entry, so it is looked up again
-- on the next call.
function CountryData.getCountryProperty(code, property)
    local missingCode = not code or code == ""
    local missingProperty = not property or property == ""
    if missingCode or missingProperty then
        return nil
    end

    local key = createCacheKey("getCountryProperty", code, property)
    local remembered = propertyCache[key]
    if remembered ~= nil then
        return remembered
    end

    local value
    local entry = CountryData.getCountryByCode(code)
    if entry then
        value = entry[property]
    end

    propertyCache[key] = value
    return value
end

-- Read a single field from a country's record, addressed by country name.
-- The name is resolved to a code with getCountryCodeByName and the field is
-- then fetched with getCountryProperty. Returns nil when either argument is
-- nil or empty, when the name is unknown, or when the field is absent.
-- Non-nil values are memoized in propertyCache; a nil result leaves no cache
-- entry.
function CountryData.getCountryPropertyByName(name, property)
    local missingName = not name or name == ""
    local missingProperty = not property or property == ""
    if missingName or missingProperty then
        return nil
    end

    local key = createCacheKey("getCountryPropertyByName", name, property)
    local remembered = propertyCache[key]
    if remembered ~= nil then
        return remembered
    end

    local value
    local code = CountryData.getCountryCodeByName(name)
    if code then
        value = CountryData.getCountryProperty(code, property)
    end

    propertyCache[key] = value
    return value
end

-- List all available properties for a country.
--
-- `code` is matched case-insensitively (upper-cased first, the same way
-- getCountryByCode treats it).
-- Returns an array of the field names present on that country's record, in
-- unspecified (pairs) order, or an empty table when the code is unknown,
-- empty or not a string. A non-empty result is the cached instance; callers
-- should not modify it.
function CountryData.getAvailableProperties(code)
    if type(code) ~= "string" or code == "" then
        return {}
    end

    code = code:upper()

    -- Check function cache first
    local cacheKey = createCacheKey("getAvailableProperties", code)
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    local country = data and data.countries and data.countries[code]
    if type(country) ~= "table" then
        return {}
    end

    local properties = {}
    for property in pairs(country) do
        properties[#properties + 1] = property
    end

    -- Cache the result
    functionCache[cacheKey] = properties
    return properties
end

-- Get all unique property names across all countries.
--
-- Returns an array holding every field name that occurs on at least one
-- country record, each listed once, in unspecified (pairs) order. Returns an
-- empty table when no country data is available. A computed result is the
-- cached instance; callers should not modify it.
function CountryData.getAllPropertyNames()
    -- Check function cache first
    local cacheKey = "getAllPropertyNames"
    local cached = functionCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    local data = loadData()
    if not data or not data.countries then
        return {}
    end

    local properties = {}
    local seen = {}

    -- Single pass: `seen` guards against listing a field name twice
    for _, country in pairs(data.countries) do
        if type(country) == "table" then
            for property in pairs(country) do
                if not seen[property] then
                    seen[property] = true
                    properties[#properties + 1] = property
                end
            end
        end
    end

    -- Cache the result
    functionCache[cacheKey] = properties
    return properties
end

-- Fetch a country field by country name, substituting a marker for misses.
-- Returns the field's value, or the literal "(Unrecognized)" when the name
-- does not resolve to a code or the field comes back nil.
function CountryData.getCountrySemanticProperty(name, property)
    local fallback = "(Unrecognized)"

    local value = nil
    local code = CountryData.getCountryCodeByName(name)
    if code then
        value = CountryData.getCountryProperty(code, property)
    end

    if value == nil then
        return fallback
    end
    return value
end

-- Check whether a semantic property key is declared in ConfigRepository.
-- Scans every entry of ConfigRepository.templates for a
-- semantics.additionalProperties table containing `propertyKey`.
-- Returns `propertyKey` itself on the first hit, nil when no template
-- declares it.
function CountryData.getSemanticPropertyName(propertyKey)
    local templates = require('Module:ConfigRepository').templates

    for _, templateConfig in pairs(templates) do
        local semantics = templateConfig.semantics
        local declared = semantics and semantics.additionalProperties
        if declared and declared[propertyKey] then
            return propertyKey
        end
    end

    return nil
end

-- Add country semantic properties to a page.
--
-- `countryValue` is a semicolon-separated list of country names. For every
-- recognized country, the "Has country" property is set to the normalized
-- name and, when a region is known, "Has ICANN region" is set to its region.
--
-- When mw.smw is available the properties are set through mw.smw.set and
-- `semanticOutput` is returned unchanged. Otherwise hidden {{#set:}} fragments
-- are appended to `semanticOutput` (a nil `semanticOutput` is treated as an
-- empty string in that case) and the combined string is returned.
--
-- `semanticOutput` is returned untouched when `countryValue` is nil or empty,
-- or when either property name is not declared in ConfigRepository.
function CountryData.addCountrySemanticProperties(countryValue, semanticOutput)
    if not countryValue or countryValue == "" then
        return semanticOutput
    end

    -- Get property names from ConfigRepository
    local countryPropertyName = CountryData.getSemanticPropertyName("Has country")
    local regionPropertyName = CountryData.getSemanticPropertyName("Has ICANN region")

    -- If property names are not found in ConfigRepository, we can't proceed
    if not countryPropertyName or not regionPropertyName then
        return semanticOutput
    end

    -- For non-SMW case, collect property HTML fragments for one final concat
    local propertyHtml = {}

    -- Record one property/value pair through whichever mechanism is available
    local function setProperty(propertyName, value)
        if mw.smw then
            mw.smw.set({ [propertyName] = value })
        else
            propertyHtml[#propertyHtml + 1] = '<div style="display:none;">'
            propertyHtml[#propertyHtml + 1] = '  {{#set: ' .. propertyName .. '=' .. value .. ' }}'
            propertyHtml[#propertyHtml + 1] = '</div>'
        end
    end

    -- Split the multi-value string and process each country in order
    for rawCountry in string.gmatch(countryValue, "[^;]+") do
        local country = rawCountry:match("^%s*(.-)%s*$")
        if country and country ~= "" then
            local normalizedCountry = CountryData.normalizeCountryName(country)

            -- Only process recognized countries; a nil name (record without
            -- any name field) cannot be emitted either
            if normalizedCountry and normalizedCountry ~= "(Unrecognized)" then
                setProperty(countryPropertyName, normalizedCountry)

                local region = CountryData.getRegionByCountry(country)
                if region and region ~= "(Unrecognized)" then
                    setProperty(regionPropertyName, region)
                end
            end
        end
    end

    -- For non-SMW case, concatenate all property HTML fragments at once
    if not mw.smw and #propertyHtml > 0 then
        semanticOutput = (semanticOutput or "") .. "\n" .. table.concat(propertyHtml, "\n")
    end

    return semanticOutput
end

-- Serialize the loaded country data to a JSON string (for JavaScript usage).
-- Returns the output of mw.text.jsonEncode on the whole data table, or the
-- literal '{}' when there is no country data, no encoder, or encoding fails.
function CountryData.exportAsJson()
    local data = loadData()
    local encode = mw.text and mw.text.jsonEncode

    if data and data.countries and encode then
        -- pcall shields the caller from encoder errors
        local ok, encoded = pcall(encode, data)
        if ok and encoded then
            return encoded
        end
    end

    return '{}'
end

--------------------------------------------------------------------------------
-- Country Display Functions (Migrated from MultiCountryDisplay)
--------------------------------------------------------------------------------

-- Pick the CSS class used to style a country according to its region code.
-- A missing region or the "(Unrecognized)" marker maps to "region-default";
-- "NA" and "LAC" map to the Americas class, "AP" to Asia-Pacific, and every
-- other value to the Europe/Africa class.
local function getRegionClass(region)
    if not region or region == "(Unrecognized)" then
        return "region-default"
    end

    local classByRegion = {
        NA = "region-americas",
        LAC = "region-americas",
        AP = "region-asia-pacific"
    }
    return classByRegion[region] or "region-europe-africa"
end

-- Format a list of countries from a semicolon-separated string.
--
-- Every recognized country becomes an <li> carrying a region-specific class,
-- and all items are wrapped in a single
-- <ul class="template-list template-list-country">. One country and many
-- countries use the same markup. Unrecognized entries are dropped.
-- Returns "" when `value` is nil or empty, or when no entry is recognized.
function CountryData.formatCountryList(value)
    if not value or value == "" then return "" end

    local listItems = {}

    for rawCountry in string.gmatch(value, "[^;]+") do
        local trimmed = rawCountry:match("^%s*(.-)%s*$")
        if trimmed and trimmed ~= "" then
            local country = CountryData.normalizeCountryName(trimmed)

            -- Only include recognized countries; a nil name (record without
            -- any name field) is skipped so the list never gets a hole
            if country and country ~= "(Unrecognized)" then
                -- Each country is styled by its own region
                local countryRegion = CountryData.getRegionByCountry(country)
                local regionClass = getRegionClass(countryRegion)

                listItems[#listItems + 1] =
                    string.format("<li class=\"%s\">%s</li>", regionClass, country)
            end
        end
    end

    if #listItems == 0 then
        return ""
    end

    return string.format("<ul class=\"template-list template-list-country\">%s</ul>",
                         table.concat(listItems, ""))
end

-- Alias for backward compatibility.
-- Passes `value` straight to CountryData.formatCountryList and returns its
-- result unchanged.
function CountryData.formatCountries(value)
    return CountryData.formatCountryList(value)
end

-- Turn a semicolon-separated country string into normalized names suitable
-- for category assignment.
-- Each entry is trimmed and run through normalizeCountryName; entries that
-- come back as "(Unrecognized)" are left out. Returns an empty table for a
-- nil or empty `value`.
function CountryData.getCountriesForCategories(value)
    if not value or value == "" then return {} end

    local categories = {}
    local kept = 0

    for piece in string.gmatch(value, "[^;]+") do
        local candidate = piece:match("^%s*(.-)%s*$")
        if candidate and candidate ~= "" then
            local displayName = CountryData.normalizeCountryName(candidate)
            if displayName ~= "(Unrecognized)" then
                kept = kept + 1
                categories[kept] = displayName
            end
        end
    end

    return categories
end

-- Return the module for use
return CountryData