Jump to content

Module:NormalizationText

Documentation for this module may be created at Module:NormalizationText/doc

-- Module:NormalizationText
-- Comprehensive text normalization module for standardizing text formats.
-- Features:
--   * String normalization (case, whitespace, punctuation)
--   * Wiki link processing
--   * Multi-value string handling
--   * User input sanitization
--   * Extensive caching for performance

-- Module table: every public function and constant defined below is attached
-- to it, and it is returned at the end of the file.
local p = {}

-- Module-level caches
-- functionCache holds results stored through p.withCache, keyed by the string
-- the caller supplies. wikiLinkCache holds p.processWikiLink results, keyed by
-- "<value>:<mode>". Neither table is ever cleared in this file.
local functionCache = {}
local wikiLinkCache = {}

-- Sanitization rules for user input, grouped by category name.
-- Each category is an ordered list of { pattern, replacement } entries whose
-- fields are handed to string.gsub as-is; entries run in the order listed.
-- Note that "|" has no special meaning in Lua patterns, it is a literal pipe.
p.SANITIZE_PATTERNS = {
    -- Replace wiki links by the text processWikiLink returns in "strip" mode.
    WIKI_LINKS = {
        {
            -- [[Target]]
            pattern = "%[%[([^|%]]+)%]%]",
            replacement = function(target)
                local link = "[[" .. target .. "]]"
                return p.processWikiLink(link, "strip")
            end,
        },
        {
            -- [[Target|Label]]
            pattern = "%[%[([^|%]]+)|([^%]]+)%]%]",
            replacement = function(target, label)
                local link = "[[" .. target .. "|" .. label .. "]]"
                return p.processWikiLink(link, "strip")
            end,
        },
    },

    -- {text} -> text
    SINGLE_BRACES = {
        { pattern = "{([^{}]+)}", replacement = "%1" },
    },

    -- Drop simple inline HTML tags but keep their contents.
    HTML_BASIC = {
        -- <b>, </b>, <i>, </i>
        { pattern = "</?[bi]>", replacement = "" },
        -- <span ...>, </span>
        { pattern = "</?span[^>]*>", replacement = "" },
    },

    -- Drop a leading, case-insensitive "File:" prefix.
    LOGO = {
        { pattern = "^[Ff][Ii][Ll][Ee]%s*:", replacement = "" },
    },

    -- Reduce an image link to its bare file name.
    IMAGE_FILES = {
        -- [[Image.jpg]] -> Image.jpg
        { pattern = "%[%[([^|%]]+)%]%]", replacement = "%1" },
        -- [[Image.jpg|...]] -> Image.jpg
        { pattern = "%[%[([^|%]]+)|.+%]%]", replacement = "%1" },
        -- Leading "File:" prefix
        { pattern = "^[Ff][Ii][Ll][Ee]%s*:", replacement = "" },
        -- Leading "Image:" prefix
        { pattern = "^[Ii][Mm][Aa][Gg][Ee]%s*:", replacement = "" },
    },
}

-- Helper for generating cache keys.
--
-- Joins `prefix` and the string form of every extra argument with ":".
--
-- @param prefix  Leading key segment (normally the calling function's name).
-- @param ...     Values that distinguish one call from another. nil arguments
--                are kept and rendered as "nil"; tables are rendered as
--                "table:" followed by tostring(arg), which identifies the
--                table object and not its contents.
-- @return string The assembled key.
function p.generateCacheKey(prefix, ...)
    local args = {...}
    -- select("#", ...) counts every argument, including nils. Walking the
    -- arguments with ipairs would stop at the first nil and silently drop
    -- everything after it, so different calls could share one key.
    local argCount = select("#", ...)
    local parts = {prefix}

    for i = 1, argCount do
        local arg = args[i]
        if type(arg) == "table" then
            -- For tables, we can't reliably generate a content-based key,
            -- so we use a placeholder carrying the table's identity
            parts[i + 1] = "table:" .. tostring(arg)
        else
            -- tostring(nil) is "nil", so nil needs no special case
            parts[i + 1] = tostring(arg)
        end
    end

    return table.concat(parts, ":")
end

-- Generic memoisation wrapper.
--
-- Looks `cacheKey` up in the module-level function cache. On a miss,
-- `operation` is called with no arguments and its first return value is
-- stored under the key and returned. A nil result cannot be stored, so an
-- operation that returns nil runs again on the next call.
--
-- @param cacheKey   Key identifying the computation.
-- @param operation  Zero-argument function producing the value.
-- @return The cached or freshly computed value.
function p.withCache(cacheKey, operation)
    local cached = functionCache[cacheKey]

    if cached == nil then
        -- Miss: compute once and remember the outcome
        cached = operation()
        functionCache[cacheKey] = cached
    end

    return cached
end

-- Core text normalization function.
--
-- Produces a canonical form of `text` for comparisons: lowercased, with
-- apostrophes/backticks, commas and periods removed, "&" spelled out as
-- "and", hyphens/dashes/underscores/slashes turned into spaces, whitespace
-- runs collapsed to a single space and surrounding whitespace removed.
--
-- @param text  string|nil  Text to normalize.
-- @return nil or "" unchanged; otherwise the normalized string.
function p.normalizeText(text)
    if not text or text == "" then
        return text
    end

    -- Create a cache key
    local cacheKey = p.generateCacheKey("normalizeText", text)

    -- Use the caching wrapper
    return p.withCache(cacheKey, function()
        -- Lua patterns work on bytes. The en dash and em dash are three-byte
        -- UTF-8 sequences, so placing them inside a [...] class would match
        -- each of their bytes on its own and damage unrelated multi-byte
        -- characters that happen to contain one of those bytes. They are
        -- therefore replaced as whole sequences, written as byte escapes.
        local EN_DASH = "\226\128\147" -- U+2013
        local EM_DASH = "\226\128\148" -- U+2014

        local normalized = text:lower()          -- Convert to lowercase
                               :gsub("['`]", "")             -- Remove apostrophes and backticks
                               :gsub("[,%.]", "")            -- Remove commas & periods
                               :gsub("%s*&%s*", " and ")     -- Normalize "&" to "and"
                               :gsub("[-_/]", " ")           -- ASCII hyphens, underscores, slashes -> space
                               :gsub(EN_DASH, " ")           -- En dash -> space
                               :gsub(EM_DASH, " ")           -- Em dash -> space
                               :gsub("%s+", " ")             -- Collapse multiple spaces

        -- Trim last: the replacements above can themselves introduce a
        -- leading or trailing space (e.g. "-foo" or "& co").
        normalized = normalized:gsub("^%s+", "")
        normalized = normalized:gsub("%s+$", "")
        return normalized
    end)
end

-- Strips whitespace from both ends of a string.
--
-- @param s  string|nil  Text to trim.
-- @return string The trimmed text; "" when `s` is nil. Results are memoised
--         through p.withCache.
function p.trim(s)
    if s == nil then
        return ""
    end

    local function strip()
        -- gsub also returns a match count; plain assignment keeps only the text
        local withoutLeading = s:gsub("^%s+", "")
        local stripped = withoutLeading:gsub("%s+$", "")
        return stripped
    end

    local cacheKey = p.generateCacheKey("trim", s)
    return p.withCache(cacheKey, strip)
end

-- Process wiki links with different modes.
--
-- @param value  string|nil  Text that may be a single wiki link, either
--               [[PageName]] or [[PageName|DisplayText]].
-- @param mode   string|nil  One of:
--                 "extract" (default) - return the page name
--                 "strip"             - return the display text (the page
--                                       name when there is no display text)
--                 "check"             - return true/false: is it a wiki link?
--               Any other mode behaves like "extract".
-- @return In "check" mode always a boolean. In the other modes the extracted
--         text, or `value` unchanged when it is nil, empty, or not a link
--         that matches one of the two supported forms.
function p.processWikiLink(value, mode)
    -- Default mode is "extract"
    mode = mode or "extract"

    -- Early return for nil or empty values.
    -- Written as an explicit branch: `cond and false or value` can never
    -- produce false, so check mode used to hand back "" (truthy in Lua).
    if not value or value == "" then
        if mode == "check" then
            return false
        end
        return value
    end

    -- Create cache key combining value and mode
    local cacheKey = value .. ":" .. mode

    -- Check cache first (false is a valid cached answer for check mode)
    local cached = wikiLinkCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    -- Check if the value is a wiki link
    local isWikiLink = value:match("^%[%[.-%]%]$") ~= nil

    -- For check mode, just return whether it's a wiki link
    if mode == "check" then
        wikiLinkCache[cacheKey] = isWikiLink
        return isWikiLink
    end

    -- If it's not a wiki link, return the original value
    if not isWikiLink then
        wikiLinkCache[cacheKey] = value
        return value
    end

    -- Try to match [[PageName|DisplayText]] format first
    local pageName, displayText = value:match("^%[%[([^%|%]]+)%|([^%]]+)%]%]$")

    if not pageName then
        -- Fall back to [[PageName]] format
        pageName = value:match("^%[%[([^%|%]]+)%]%]$")
        displayText = pageName -- In this case, display text is the same as page name
    end

    -- Determine result based on mode; anything that is not "strip" is
    -- treated as "extract". If neither form matched, keep the input as-is.
    local result
    if mode == "strip" then
        result = displayText or value
    else
        result = pageName or value
    end

    -- Store result in cache
    wikiLinkCache[cacheKey] = result

    return result
end

-- Convenience wrapper: runs p.processWikiLink in "extract" mode.
--
-- @param value  string|nil  Text that may be a wiki link.
-- @return Whatever p.processWikiLink returns for "extract" mode.
function p.extractFromWikiLink(value)
    local pageName = p.processWikiLink(value, "extract")
    return pageName
end

-- Sanitizes user input by removing unwanted patterns.
--
-- @param value              string|nil  Text to sanitize.
-- @param patternCategories  string|table|nil  A key of p.SANITIZE_PATTERNS,
--                           or a list of such keys. When omitted, WIKI_LINKS
--                           and SINGLE_BRACES are used. Unknown names are
--                           ignored.
-- @param customPatterns     table|nil  Extra { pattern = ..., replacement = ... }
--                           entries, applied after the category patterns.
-- @param options            table|nil  options.preserveWikiLinks: when truthy
--                           the WIKI_LINKS category is skipped.
-- @return string The sanitized text; "" when `value` is nil or empty.
--
-- Results are memoised unless custom patterns are supplied: a custom
-- replacement may be a function or table, which cannot be described by a
-- stable string key, so those calls are always computed afresh.
function p.sanitizeUserInput(value, patternCategories, customPatterns, options)
    -- Fast path for nil/empty values
    if not value or value == "" then return "" end

    -- Initialize options
    options = options or {}
    local preserveWikiLinks = options.preserveWikiLinks and true or false

    -- Normalize the category argument into one list of names so a single
    -- loop handles the default, single-string and list forms alike.
    local categories
    if not patternCategories then
        categories = { "WIKI_LINKS", "SINGLE_BRACES" }
    elseif type(patternCategories) == "string" then
        categories = { patternCategories }
    elseif type(patternCategories) == "table" then
        categories = patternCategories
    else
        -- Any other type selects no category patterns
        categories = {}
    end

    local hasCustomPatterns = type(customPatterns) == "table" and customPatterns[1] ~= nil

    local function sanitize()
        -- Collect patterns to apply
        local patternsToApply = {}
        local patternCount = 0

        for _, category in ipairs(categories) do
            -- Skip WIKI_LINKS category if preserveWikiLinks is true
            if not (preserveWikiLinks and category == "WIKI_LINKS") then
                local group = p.SANITIZE_PATTERNS[category]
                if group then
                    for _, patternInfo in ipairs(group) do
                        patternCount = patternCount + 1
                        patternsToApply[patternCount] = patternInfo
                    end
                end
            end
        end

        -- Add any custom patterns
        if hasCustomPatterns then
            for _, patternInfo in ipairs(customPatterns) do
                patternCount = patternCount + 1
                patternsToApply[patternCount] = patternInfo
            end
        end

        -- Apply each pattern sequentially (a no-op when none were collected)
        local result = value
        for i = 1, patternCount do
            local patternInfo = patternsToApply[i]
            -- gsub also returns a match count; assignment keeps only the text
            result = result:gsub(patternInfo.pattern, patternInfo.replacement)
        end

        return result
    end

    if hasCustomPatterns then
        return sanitize()
    end

    -- Key the cache on the category names themselves. Keying on
    -- tostring(table) would identify the table object rather than its
    -- contents: fresh table literals would never hit the cache (and grow it
    -- forever), while a mutated or address-reused table could return a
    -- stale result.
    local names = {}
    for i, category in ipairs(categories) do
        names[i] = tostring(category)
    end

    local cacheKey = p.generateCacheKey(
        "sanitizeUserInput",
        value,
        table.concat(names, ","),
        tostring(preserveWikiLinks)
    )

    return p.withCache(cacheKey, sanitize)
end

-- Delimiter rules used by splitMultiValueString when the caller passes none.
-- Each rule's pattern/replacement pair is fed to string.gsub, in list order,
-- to rewrite a separator as ";".
p.DEFAULT_DELIMITERS = {
    -- The word "and" surrounded by whitespace
    {
        pattern = "%s+and%s+",
        replacement = ";",
    },
    -- A semicolon plus any whitespace that follows it
    {
        pattern = ";%s*",
        replacement = ";",
    },
}

-- Semicolon-only rule set, kept for backward compatibility
p.SEMICOLON_PATTERN = {
    {
        pattern = ";%s*",
        replacement = ";",
    },
}

-- Builds a cache-key fragment describing a custom delimiter list by content.
-- Each pattern and replacement is length-prefixed so different lists can
-- never produce the same text. Returns nil when an entry's pattern or
-- replacement is not a string, meaning the list cannot be described and the
-- call must not be cached.
local function delimitersCacheKey(delimiters)
    local parts = {}
    for i, delimiter in ipairs(delimiters) do
        local pattern, replacement = delimiter.pattern, delimiter.replacement
        if type(pattern) ~= "string" or type(replacement) ~= "string" then
            return nil
        end
        parts[i] = #pattern .. ":" .. pattern .. #replacement .. ":" .. replacement
    end
    return "custom" .. table.concat(parts)
end

-- Splits multi-value strings with various delimiters.
--
-- Every delimiter rule is applied with string.gsub to rewrite separators as
-- ";", then the text is split on ";" and each piece is trimmed. Empty pieces
-- are dropped.
--
-- @param value       string|nil  Text holding one or more values.
-- @param delimiters  table|nil   List of { pattern = ..., replacement = ... }
--                    rules; defaults to p.DEFAULT_DELIMITERS.
-- @return table A new array of trimmed, non-empty values ({} for nil/empty
--         input). The caller owns the returned table and may modify it.
function p.splitMultiValueString(value, delimiters)
    if not value or value == "" then return {} end

    -- Use provided delimiters or default ones
    local activeDelimiters = delimiters or p.DEFAULT_DELIMITERS

    -- Describe the delimiter set by content. A fixed "custom" marker would
    -- make every custom set share one cache slot per value, so a second call
    -- with different delimiters would get the first call's result.
    local delimitersKey = "default"
    if delimiters then
        delimitersKey = delimitersCacheKey(delimiters)
    end

    local function split()
        -- Standardize all delimiters to semicolons
        local standardizedInput = value
        for _, delimiter in ipairs(activeDelimiters) do
            standardizedInput = standardizedInput:gsub(delimiter.pattern, delimiter.replacement)
        end

        -- Split by semicolons, keeping only non-empty trimmed items
        local items = {}
        local index = 1
        for item in standardizedInput:gmatch("[^;]+") do
            local trimmed = item:match("^%s*(.-)%s*$")
            if trimmed and trimmed ~= "" then
                items[index] = trimmed
                index = index + 1
            end
        end

        return items
    end

    -- Delimiters that cannot be described are computed without caching
    if not delimitersKey then
        return split()
    end

    -- The delimiter description goes before the value: it is self-delimiting,
    -- so the free-form value can follow it without ambiguity.
    local cacheKey = p.generateCacheKey("splitMultiValueString", delimitersKey, value)
    local cachedItems = p.withCache(cacheKey, split)

    -- Hand out a copy so a caller that edits the result cannot alter what
    -- later calls receive from the cache.
    local copy = {}
    for i = 1, #cachedItems do
        copy[i] = cachedItems[i]
    end
    return copy
end

-- Joins a table of values with the specified delimiter.
--
-- @param values     table|nil   Array of strings/numbers to join.
-- @param delimiter  string|nil  Separator placed between values; defaults
--                   to "; ".
-- @return string The joined text, or "" when `values` is nil or empty.
function p.joinValues(values, delimiter)
    delimiter = delimiter or "; "
    if not values or #values == 0 then return "" end

    -- Deliberately not cached. A key built by joining the values is
    -- ambiguous ({"a||b"} and {"a", "b"} would share one key and so one
    -- result), and building it costs as much as the join itself.
    return table.concat(values, delimiter)
end

-- Export the module table with all functions and constants attached above.
return p