-- Module:NormalizationText
-- Appearance
-- Documentation for this module may be created at Module:NormalizationText/doc
-- Module:NormalizationText
-- Comprehensive text normalization module for standardizing text formats.
-- Features:
-- * String normalization (case, whitespace, punctuation)
-- * Wiki link processing
-- * Multi-value string handling
-- * User input sanitization
-- * Extensive caching for performance
-- Module table: every public function and constant below is attached to `p`,
-- which is returned at the end of the file.
local p = {}
-- Module-level caches
-- functionCache backs p.withCache (keyed by the strings built in each caller).
local functionCache = {}
-- wikiLinkCache backs p.processWikiLink (keyed by value .. ":" .. mode).
local wikiLinkCache = {}
-- Named groups of string.gsub rules used by p.sanitizeUserInput.
-- Each rule is a table { pattern = <Lua pattern>, replacement = <string|function> }.
-- Rules inside a group are listed in the order they are meant to be applied.
p.SANITIZE_PATTERNS = {
    -- Wiki links: each link is rebuilt and handed to p.processWikiLink in
    -- "strip" mode.
    WIKI_LINKS = {
        {
            -- [[Target]]
            pattern = "%[%[([^|%]]+)%]%]",
            replacement = function(target)
                local link = "[[" .. target .. "]]"
                return p.processWikiLink(link, "strip")
            end
        },
        {
            -- [[Target|Label]]
            pattern = "%[%[([^|%]]+)|([^%]]+)%]%]",
            replacement = function(target, label)
                local link = "[[" .. target .. "|" .. label .. "]]"
                return p.processWikiLink(link, "strip")
            end
        }
    },

    -- Single curly braces: {text} -> text
    SINGLE_BRACES = {
        { pattern = "{([^{}]+)}", replacement = "%1" }
    },

    -- Basic inline HTML
    HTML_BASIC = {
        -- <b>, </b>, <i>, </i>
        { pattern = "</?[bi]>", replacement = "" },
        -- <span ...>, </span>
        { pattern = "</?span[^>]*>", replacement = "" }
    },

    -- Logo values: drop a leading, case-insensitive "File:" prefix
    LOGO = {
        { pattern = "^[Ff][Ii][Ll][Ee]%s*:", replacement = "" }
    },

    -- Image file values: unwrap the link first, then drop the namespace prefix
    IMAGE_FILES = {
        -- [[Image.jpg]] -> Image.jpg
        { pattern = "%[%[([^|%]]+)%]%]", replacement = "%1" },
        -- [[Image.jpg|...]] -> Image.jpg
        { pattern = "%[%[([^|%]]+)|.+%]%]", replacement = "%1" },
        -- Leading "File:" prefix
        { pattern = "^[Ff][Ii][Ll][Ee]%s*:", replacement = "" },
        -- Leading "Image:" prefix
        { pattern = "^[Ii][Mm][Aa][Gg][Ee]%s*:", replacement = "" }
    }
}
-- Builds a cache key by joining `prefix` and every extra argument with ":".
--
-- prefix: label identifying the calling function (stringified with tostring).
-- ...:    values that distinguish one call from another. Every argument takes
--         part in the key, including nil ones (rendered as "nil") and any
--         argument that follows a nil.
-- Tables cannot be described by content here, so they are rendered as
-- "table:" .. tostring(arg); that identity-based text is only meaningful
-- while the very same table object is alive and unchanged.
--
-- Returns the key as a string.
function p.generateCacheKey(prefix, ...)
    -- select("#", ...) is the true argument count; iterating {...} with
    -- ipairs would stop at the first nil and drop the remaining arguments.
    local argCount = select("#", ...)
    local args = {...}
    local parts = {tostring(prefix)}
    for i = 1, argCount do
        local arg = args[i]
        if type(arg) == "table" then
            parts[i + 1] = "table:" .. tostring(arg)
        else
            -- tostring(nil) is "nil", so nil needs no special case
            parts[i + 1] = tostring(arg)
        end
    end
    -- Explicit bounds: every slot 1..argCount+1 was filled above
    return table.concat(parts, ":", 1, argCount + 1)
end
-- Memoizes `operation` under `cacheKey` in the module-level functionCache.
--
-- cacheKey:  key identifying the computation.
-- operation: zero-argument function; only its first return value is kept.
--
-- Returns the stored value when one exists (false counts as stored, nil does
-- not); otherwise runs `operation`, stores its result and returns it.
function p.withCache(cacheKey, operation)
    local value = functionCache[cacheKey]
    if value == nil then
        -- Cache miss: compute once and remember the outcome
        value = operation()
        functionCache[cacheKey] = value
    end
    return value
end
-- Core text normalization function.
--
-- Produces a canonical form of `text` for comparisons:
--   * lowercased (string.lower, so only bytes the C locale knows are changed)
--   * apostrophes and backticks removed
--   * commas and periods removed
--   * "&" (with any surrounding whitespace) rewritten as " and "
--   * hyphens, en/em dashes, underscores and slashes turned into spaces
--   * whitespace runs collapsed to one space, ends trimmed
--
-- text: string to normalize. nil and "" are returned unchanged.
-- Returns the normalized string (cached per input through p.withCache).
function p.normalizeText(text)
    if not text or text == "" then
        return text
    end

    local cacheKey = p.generateCacheKey("normalizeText", text)

    return p.withCache(cacheKey, function()
        -- Each gsub returns (string, count); the single-target assignment
        -- keeps only the string.
        local normalized = text:lower()
        normalized = normalized:gsub("['`]", "")          -- apostrophes, backticks
        normalized = normalized:gsub("[,%.]", "")         -- commas, periods
        normalized = normalized:gsub("%s*&%s*", " and ")  -- "&" -> "and"
        normalized = normalized:gsub("[-_/]", " ")        -- ASCII separators
        -- Lua patterns work on bytes, so the multi-byte dashes must be matched
        -- as whole sequences. Inside a [...] set their individual bytes
        -- (0xE2, 0x80, 0x93, 0x94) would each match on their own and damage
        -- any other UTF-8 character that contains one of them.
        normalized = normalized:gsub("\226\128\147", " ") -- en dash (U+2013)
        normalized = normalized:gsub("\226\128\148", " ") -- em dash (U+2014)
        normalized = normalized:gsub("%s+", " ")          -- collapse whitespace
        -- Trim last: the replacements above can leave a space at either end
        -- (e.g. "foo/" or "& bar").
        return (normalized:match("^%s*(.-)%s*$"))
    end)
end
-- Removes whitespace from both ends of `s`.
--
-- s: string to trim; nil yields "".
-- Returns the trimmed string. Results are memoized through p.withCache under
-- a key built from "trim" and `s`.
function p.trim(s)
    if s == nil then
        return ""
    end

    local function strip()
        -- gsub also returns a match count; each local keeps only the string
        local withoutLeading = s:gsub("^%s+", "")
        local withoutTrailing = withoutLeading:gsub("%s+$", "")
        return withoutTrailing
    end

    return p.withCache(p.generateCacheKey("trim", s), strip)
end
-- Processes a wiki link according to `mode`.
--
-- value: candidate text, e.g. "[[Page]]" or "[[Page|Label]]".
-- mode:  "extract" (default) -> page name
--        "strip"             -> display text (the page name when no label)
--        "check"             -> boolean: does `value` look like a wiki link?
--        any other mode behaves like "extract".
--
-- nil or "" input: "check" returns false; every other mode returns the input
-- unchanged. Text that is not a single well-formed link is returned as-is by
-- "extract" and "strip".
-- Results for non-empty input are memoized in wikiLinkCache per value and mode.
function p.processWikiLink(value, mode)
    mode = mode or "extract"

    if not value or value == "" then
        -- Explicit branch: `mode == "check" and false or value` can never
        -- yield false, because `x and false or y` always evaluates to y.
        if mode == "check" then
            return false
        end
        return value
    end

    local cacheKey = value .. ":" .. mode
    local cached = wikiLinkCache[cacheKey]
    if cached ~= nil then
        return cached
    end

    -- Whole string must be wrapped in [[ ... ]]
    local isWikiLink = value:match("^%[%[.-%]%]$") ~= nil

    if mode == "check" then
        wikiLinkCache[cacheKey] = isWikiLink
        return isWikiLink
    end

    if not isWikiLink then
        wikiLinkCache[cacheKey] = value
        return value
    end

    -- Try [[PageName|DisplayText]] first, then plain [[PageName]]
    local pageName, displayText = value:match("^%[%[([^%|%]]+)%|([^%]]+)%]%]$")
    if not pageName then
        pageName = value:match("^%[%[([^%|%]]+)%]%]$")
        displayText = pageName -- without a label the page name is displayed
    end

    -- Neither shape matched (e.g. several links in one string): keep the input
    local result
    if mode == "strip" then
        result = displayText or value
    else
        -- "extract" and unknown modes
        result = pageName or value
    end

    wikiLinkCache[cacheKey] = result
    return result
end
-- Convenience wrapper: runs p.processWikiLink on `value` in "extract" mode
-- and returns its result.
function p.extractFromWikiLink(value)
    local extracted = p.processWikiLink(value, "extract")
    return extracted
end
-- Appends every rule of the named p.SANITIZE_PATTERNS category to `target`.
-- Category names that do not exist are ignored.
local function appendCategoryPatterns(target, category)
    local rules = p.SANITIZE_PATTERNS[category]
    if rules then
        for _, rule in ipairs(rules) do
            target[#target + 1] = rule
        end
    end
end

-- Turns the patternCategories argument into the list of category names to use.
local function resolveCategories(patternCategories)
    if not patternCategories then
        -- Nothing requested: default categories
        return { "WIKI_LINKS", "SINGLE_BRACES" }
    end
    if type(patternCategories) == "string" then
        return { patternCategories }
    end
    if type(patternCategories) == "table" then
        return patternCategories
    end
    -- Any other truthy value selects no category
    return {}
end

-- Describes the patternCategories argument by content, so the cache key does
-- not depend on a table's memory address.
local function categoriesCacheKey(patternCategories)
    if type(patternCategories) ~= "table" then
        return type(patternCategories) .. "=" .. tostring(patternCategories)
    end
    local names = {}
    for i, category in ipairs(patternCategories) do
        names[i] = type(category) .. "=" .. tostring(category)
    end
    -- "\31" (unit separator) keeps {"A,B"} and {"A", "B"} apart
    return "table=" .. table.concat(names, "\31")
end

-- Sanitizes user input by removing unwanted patterns.
--
-- value:             text to sanitize; nil or "" returns "".
-- patternCategories: a p.SANITIZE_PATTERNS category name, or a list of names.
--                    When omitted, WIKI_LINKS and SINGLE_BRACES are applied.
-- customPatterns:    optional list of extra { pattern = ..., replacement = ... }
--                    rules, applied after the category rules.
-- options:           optional table; options.preserveWikiLinks = true skips
--                    the WIKI_LINKS category.
--
-- Returns the sanitized text. Results are memoized through p.withCache unless
-- custom patterns are supplied: their replacements may be functions, which
-- cannot be described in a key, and keying on the table's address would
-- return stale results if the table is mutated or its address is reused.
function p.sanitizeUserInput(value, patternCategories, customPatterns, options)
    if not value or value == "" then return "" end

    options = options or {}
    local preserveWikiLinks = options.preserveWikiLinks and true or false
    local hasCustomPatterns = type(customPatterns) == "table"

    local function sanitize()
        -- Collect the rules in application order
        local rules = {}
        for _, category in ipairs(resolveCategories(patternCategories)) do
            if not (preserveWikiLinks and category == "WIKI_LINKS") then
                appendCategoryPatterns(rules, category)
            end
        end
        if hasCustomPatterns then
            for _, rule in ipairs(customPatterns) do
                rules[#rules + 1] = rule
            end
        end

        -- Apply each rule to the output of the previous one; with no rules
        -- the value is returned untouched.
        local result = value
        for _, rule in ipairs(rules) do
            -- gsub also returns a count; the assignment keeps the string only
            result = result:gsub(rule.pattern, rule.replacement)
        end
        return result
    end

    if hasCustomPatterns then
        return sanitize()
    end

    local cacheKey = p.generateCacheKey(
        "sanitizeUserInput",
        value,
        categoriesCacheKey(patternCategories),
        tostring(preserveWikiLinks)
    )
    return p.withCache(cacheKey, sanitize)
end
-- Delimiter rules used by p.splitMultiValueString when the caller passes none.
-- Each rule rewrites one separator form to a bare ";".
p.DEFAULT_DELIMITERS = {
    -- the word "and" surrounded by whitespace
    {
        pattern = "%s+and%s+",
        replacement = ";"
    },
    -- a semicolon followed by optional whitespace
    {
        pattern = ";%s*",
        replacement = ";"
    }
}

-- Semicolon-only rule set, kept for backward compatibility.
p.SEMICOLON_PATTERN = {
    {
        pattern = ";%s*",
        replacement = ";"
    }
}
-- Describes a delimiter list by content for use in a cache key.
-- Returns nil when a rule's pattern or replacement is not a string (for
-- example a replacement function), in which case the result is not cached.
local function delimitersCacheKey(delimiters)
    local parts = {}
    for i, delimiter in ipairs(delimiters) do
        local pattern, replacement = delimiter.pattern, delimiter.replacement
        if type(pattern) ~= "string" or type(replacement) ~= "string" then
            return nil
        end
        -- "\31" separates the two fields of a rule, "\30" separates rules
        parts[i] = pattern .. "\31" .. replacement
    end
    return table.concat(parts, "\30")
end

-- Splits a multi-value string into a list of trimmed, non-empty items.
--
-- value:      text to split; nil or "" returns an empty table.
-- delimiters: optional list of { pattern = ..., replacement = ... } rules that
--             rewrite each separator to ";" before splitting. Defaults to
--             p.DEFAULT_DELIMITERS.
--
-- Returns a new table on every call, so callers may modify it freely.
-- The split is memoized per value and per delimiter content; two different
-- custom delimiter sets never share a cache entry.
function p.splitMultiValueString(value, delimiters)
    if not value or value == "" then return {} end

    local activeDelimiters = delimiters or p.DEFAULT_DELIMITERS

    local function split()
        -- Standardize all delimiters to semicolons
        local standardized = value
        for _, delimiter in ipairs(activeDelimiters) do
            -- gsub also returns a count; the assignment keeps the string only
            standardized = standardized:gsub(delimiter.pattern, delimiter.replacement)
        end

        -- Split on semicolons, dropping pieces that are empty after trimming
        local items = {}
        for piece in standardized:gmatch("[^;]+") do
            local trimmed = piece:match("^%s*(.-)%s*$")
            if trimmed and trimmed ~= "" then
                items[#items + 1] = trimmed
            end
        end
        return items
    end

    local delimiterKey = delimitersCacheKey(activeDelimiters)
    local items
    if delimiterKey then
        local cacheKey = p.generateCacheKey("splitMultiValueString", value, delimiterKey)
        items = p.withCache(cacheKey, split)
    else
        items = split()
    end

    -- Hand out a shallow copy: the cached table must never be exposed,
    -- otherwise a caller editing its result would change later results too.
    local copy = {}
    for i = 1, #items do
        copy[i] = items[i]
    end
    return copy
end
-- Joins a list of values with the specified delimiter.
--
-- values:    sequence of strings/numbers; nil or an empty sequence gives "".
-- delimiter: separator placed between items (default "; ").
--
-- Returns the joined string.
-- Deliberately not cached: building a cache key needs its own table.concat
-- over `values`, so a lookup costs as much as the join itself, and a key made
-- by joining with "||" cannot tell {"a||b"} from {"a", "b"}.
function p.joinValues(values, delimiter)
    delimiter = delimiter or "; "
    if not values or #values == 0 then return "" end
    return table.concat(values, delimiter)
end
-- Hand the module table, with every p.* function and constant defined above, to the caller.
return p