-- Jump to content

-- Module:CountryNormalization

-- From ICANNWiki

-- Documentation for this module may be created at Module:CountryNormalization/doc

-- Module:CountryNormalization
-- Standardizes country names based on their ISO canonical form by mapping common variations, abbreviations, and alternative spellings to a single, consistent output, ensuring uniformity in data representation.

-- Module export table; the public function is attached to it below and the
-- table is returned at the end of the module.
local p = {}

-- Alias lookup table, defined once at module scope.
--
-- Maps a normalized lookup key to the canonical country name. Keys are
-- written in lowercase, without periods, with single spaces and with a
-- straight apostrophe ('), because formatCountry lowercases its input, strips
-- periods and collapses whitespace before indexing this table. A key that does
-- not follow that form can never be hit.
--
-- Each canonical name also appears as its own (lowercased) key so that input
-- differing only in case or spacing still resolves to the canonical spelling.
local mapping = {
    -- Brunei Darussalam
    ["brunei darussalam"] = "Brunei Darussalam",
    ["brunei"] = "Brunei Darussalam",

    -- Cocos (Keeling) Islands
    ["cocos (keeling) islands"] = "Cocos (Keeling) Islands",
    ["cocos islands"] = "Cocos (Keeling) Islands",
    ["keeling islands"] = "Cocos (Keeling) Islands",

    -- Congo
    ["congo"] = "Congo",
    ["republic of the congo"] = "Congo",
    ["congo-brazzaville"] = "Congo",

    -- Congo, Democratic Republic of the
    ["congo, democratic republic of the"] = "Congo, Democratic Republic of the",
    ["democratic republic of the congo"] = "Congo, Democratic Republic of the",
    ["drc"] = "Congo, Democratic Republic of the",
    ["dr congo"] = "Congo, Democratic Republic of the",
    ["congo-kinshasa"] = "Congo, Democratic Republic of the",

    -- Côte d'Ivoire
    ["côte d'ivoire"] = "Côte d'Ivoire",
    ["cote d'ivoire"] = "Côte d'Ivoire",
    ["ivory coast"] = "Côte d'Ivoire",

    -- Curaçao
    ["curaçao"] = "Curaçao",
    ["curacao"] = "Curaçao",

    -- Czechia
    ["czechia"] = "Czechia",
    ["czech republic"] = "Czechia",

    -- Eswatini
    ["eswatini"] = "Eswatini",
    ["swaziland"] = "Eswatini",

    -- The Gambia
    ["the gambia"] = "The Gambia",
    ["gambia"] = "The Gambia",

    -- Iran (Islamic Republic of)
    ["iran (islamic republic of)"] = "Iran (Islamic Republic of)",
    ["iran"] = "Iran (Islamic Republic of)",

    -- Lao People's Democratic Republic
    ["lao people's democratic republic"] = "Lao People's Democratic Republic",
    ["lao pdr"] = "Lao People's Democratic Republic",
    ["laos"] = "Lao People's Democratic Republic",

    -- Macao
    ["macao"] = "Macao",
    ["macau"] = "Macao",

    -- Micronesia (Federated States of)
    ["micronesia (federated states of)"] = "Micronesia (Federated States of)",
    ["federated states of micronesia"] = "Micronesia (Federated States of)",
    ["micronesia"] = "Micronesia (Federated States of)",

    -- Myanmar
    ["myanmar"] = "Myanmar",
    ["burma"] = "Myanmar",

    -- Netherlands
    ["netherlands"] = "Netherlands",
    ["holland"] = "Netherlands",

    -- Russian Federation
    ["russian federation"] = "Russian Federation",
    ["russia"] = "Russian Federation",

    -- Saint Barthélemy
    ["saint barthélemy"] = "Saint Barthélemy",
    ["saint barthelemy"] = "Saint Barthélemy",
    -- "St. Barthélemy" loses its period during normalization, so the
    -- abbreviated prefix needs both the accented and unaccented spelling.
    ["st barthélemy"] = "Saint Barthélemy",
    ["st barthelemy"] = "Saint Barthélemy",

    -- Saint Kitts and Nevis
    ["saint kitts and nevis"] = "Saint Kitts and Nevis",
    ["st kitts and nevis"] = "Saint Kitts and Nevis",

    -- Saint Pierre and Miquelon
    ["saint pierre and miquelon"] = "Saint Pierre and Miquelon",
    ["st pierre and miquelon"] = "Saint Pierre and Miquelon",

    -- Saint Vincent and the Grenadines
    ["saint vincent and the grenadines"] = "Saint Vincent and the Grenadines",
    ["st vincent and the grenadines"] = "Saint Vincent and the Grenadines",

    -- Syrian Arab Republic
    ["syrian arab republic"] = "Syrian Arab Republic",
    ["syria"] = "Syrian Arab Republic",

    -- United Arab Emirates
    ["united arab emirates"] = "United Arab Emirates",
    ["uae"] = "United Arab Emirates",
    ["u a e"] = "United Arab Emirates",

    -- United Kingdom of Great Britain and Northern Ireland
    ["united kingdom of great britain and northern ireland"] = "United Kingdom of Great Britain and Northern Ireland",
    ["united kingdom"] = "United Kingdom of Great Britain and Northern Ireland",
    ["uk"] = "United Kingdom of Great Britain and Northern Ireland",
    ["u k"] = "United Kingdom of Great Britain and Northern Ireland",

    -- United States of America
    ["united states of america"] = "United States of America",
    ["united states"] = "United States of America",
    ["usa"] = "United States of America",
    ["us"] = "United States of America",
    ["u s a"] = "United States of America",
    ["u s"] = "United States of America",

    -- Viet Nam
    ["viet nam"] = "Viet Nam",
    ["vietnam"] = "Viet Nam",

    -- Virgin Islands (U.S.)
    -- Periods are stripped before lookup, so "(U.S.)" arrives as "(us)" and
    -- "(U. S.)" arrives as "(u s)".
    ["virgin islands (us)"] = "Virgin Islands (U.S.)",
    ["virgin islands (u s)"] = "Virgin Islands (U.S.)",
    ["united states virgin islands"] = "Virgin Islands (U.S.)",
    ["us virgin islands"] = "Virgin Islands (U.S.)",
}

    --- Map a country name to its canonical spelling.
    --
    -- The input is trimmed, lowercased, has typographic apostrophes and
    -- backticks replaced by a straight apostrophe, has periods removed and
    -- whitespace runs collapsed, and is then looked up in `mapping`.
    --
    -- @param inputCountry string|nil  Country name as entered by a user.
    -- @return The canonical name when the normalized input is a known key;
    --         otherwise the input with leading/trailing whitespace removed.
    --         A nil, false or empty input is returned unchanged.
    function p.formatCountry(inputCountry)
        if not inputCountry or inputCountry == "" then
            return inputCountry
        end

        -- Trim leading/trailing whitespace. The outer parentheses discard
        -- gsub's second result (the substitution count).
        inputCountry = (inputCountry:gsub("^%s*(.-)%s*$", "%1"))

        -- NOTE: string.lower only folds letters known to the current C
        -- locale, so upper-case accented letters (e.g. "Ô") may stay as-is
        -- and such input may fall through to the unmatched branch.
        local key = inputCountry:lower()

        -- Normalize apostrophes. U+2019 (right single quotation mark) is the
        -- three-byte UTF-8 sequence E2 80 99. Lua patterns work on bytes, so
        -- it must be replaced as a literal sequence: placing it inside a
        -- [...] set would replace each of its bytes separately and yield
        -- three apostrophes instead of one.
        key = key:gsub("\226\128\153", "'"):gsub("`", "'")

        -- Remove periods ("U.S.A." -> "usa") and collapse whitespace runs.
        key = key:gsub("%.", ""):gsub("%s+", " ")
        -- Removing a leading/trailing period can expose new edge whitespace
        -- (". usa" -> " usa"), so trim once more.
        key = key:gsub("^%s*(.-)%s*$", "%1")

        -- Unknown names are passed through (trimmed) rather than rejected.
        return mapping[key] or inputCountry
    end

    return p