Jump to content

Module:CanonicalForms

Documentation for this module may be created at Module:CanonicalForms/doc

--[[
* Name: CanonicalForms
* Author: Mark W. Datysgeld
* Description: Text normalization utility that removes wiki markup and maps user input to canonical values using configurable lookup tables
* Notes: Example usage: local mapping = { { canonical = "gTLD", synonyms = {"generic", "g"} }, { canonical = "ccTLD", synonyms = {"country", "cc"} } }; local canonical, css, category = require('Module:CanonicalForms').normalize(inputString, mapping)
]]

local CanonicalForms = {}

-- Lookup caches, keyed by the mapping table they were built from.
-- The cache lives here rather than in a `_lookupCache` field on the caller's
-- table so that the caller's data is never modified and mapping tables that
-- reject writes (read-only tables) can still be used. Weak keys allow an
-- entry to be collected once its mapping table is no longer referenced.
local lookupCaches = setmetatable({}, { __mode = "k" })

-- Return the lowercase-name -> group lookup for a mapping table, building it
-- on the first request for that table.
-- NOTE: the lookup is built once per table; groups added to the mapping
-- afterwards are not picked up.
local function getLookup(mappingTable)
    local lookup = lookupCaches[mappingTable]
    if lookup then
        return lookup
    end

    lookup = {}
    for _, group in ipairs(mappingTable) do
        -- The canonical form itself is a valid spelling (matched in lowercase)
        lookup[group.canonical:lower()] = group

        -- Every synonym resolves to the same group
        for _, syn in ipairs(group.synonyms or {}) do
            lookup[syn:lower()] = group
        end
    end

    lookupCaches[mappingTable] = lookup
    return lookup
end

-- Normalize an input string
-- Removes wiki internal link markup, converts to lowercase, and maps the
-- result to its canonical form.
--
-- Parameters:
--   input        - string to normalize; nil or "" yields nil, nil, nil
--   mappingTable - array of groups, each with a `canonical` string and
--                  optional `synonyms` (array of strings), `css` and
--                  `category` fields. The table is not modified.
--
-- Returns three values:
--   on a match:  group.canonical, group.css, group.category
--   otherwise:   the cleaned (markup-free, lowercased) input, nil, nil
function CanonicalForms.normalize(input, mappingTable)
    if not input or input == "" then
        return nil, nil, nil
    end

    -- Remove wiki internal link markup (e.g., "[[Brand TLD]]" → "Brand TLD").
    -- For piped links ("[[Target|label]]") the part before the pipe is kept.
    -- Assigning to a single local keeps only gsub's first result (the string).
    local stripped = input:gsub("%[%[([^|%]]+)|?[^%]]*%]%]", "%1")
    local cleanInput = stripped:lower()

    -- Without a usable mapping there is nothing to match against
    if type(mappingTable) ~= "table" then
        return cleanInput, nil, nil
    end

    -- Direct lookup via the per-mapping cache
    local match = getLookup(mappingTable)[cleanInput]
    if match then
        return match.canonical, match.css, match.category
    end

    return cleanInput, nil, nil
end

return CanonicalForms