Jump to content

Module:DateNormalization

From ICANNWiki

Documentation for this module may be created at Module:DateNormalization/doc

-- Module:DateNormalization
-- Parses date inputs from various formats, normalizes month names (including
-- abbreviations with dots), and converts them into a standardized
-- "Month DD, YYYY" format, or returns only the year if no full date is
-- provided. Input that cannot be interpreted is returned with only its
-- whitespace normalized.

-- Export table: public functions are attached to it below and it is returned
-- at the end of the module.
local p = {}

-- Month lookup table: maps English month names and their abbreviations to the
-- month number (1-12). Keys are spelled with an upper-case first letter and a
-- lower-case remainder, so a token must be brought into that shape (and have
-- any trailing dot removed) before it is used as an index.
-- Besides the three-letter abbreviations, the common four-letter "Sept" is
-- accepted for September.
local months = {
  Jan = 1, January = 1,
  Feb = 2, February = 2,
  Mar = 3, March = 3,
  Apr = 4, April = 4,
  May = 5,
  Jun = 6, June = 6,
  Jul = 7, July = 7,
  Aug = 8, August = 8,
  Sep = 9, Sept = 9, September = 9,
  Oct = 10, October = 10,
  Nov = 11, November = 11,
  Dec = 12, December = 12
}

-- Cache for previously processed dates: maps a date string to the result that
-- p.formatDate computed for it, so a repeated date is not parsed again.
-- Entries are never removed. The original note states that the cache persists
-- during a single page render (presumably the lifetime of this module's state
-- under Scribunto — TODO confirm).
local dateCache = {}

-- Recognized date layouts, grouped by type. Every entry is a two-element
-- array: [1] a Lua pattern whose captures are the raw date parts, and [2] a
-- handler that receives those captures (in pattern order) and returns
-- year, month, day as numbers. A handler returns nil for the month when the
-- captured word is not a known month name. Entries are listed in the order
-- in which they should be tried.
local patterns
do
  -- Look up a month word ("feb", "Feb.", "FEBRUARY") in `months`.
  -- A single trailing dot is dropped and the capitalization is normalized to
  -- the "Xxxx" form used by the table keys. Returns nil for unknown words.
  local function monthNumber(name)
    local bare = name:gsub("%.$", "")  -- only gsub's first result is kept
    return months[bare:sub(1, 1):upper() .. bare:sub(2):lower()]
  end

  -- Handlers for text layouts; they differ only in the capture order.
  local function fromDayMonthYear(d, monthName, y)
    return tonumber(y), monthNumber(monthName), tonumber(d)
  end

  local function fromYearMonthDay(y, monthName, d)
    return tonumber(y), monthNumber(monthName), tonumber(d)
  end

  local function fromMonthDayYear(monthName, d, y)
    return tonumber(y), monthNumber(monthName), tonumber(d)
  end

  patterns = {
    -- Text-based formats (with month names)
    textFormats = {
      -- "DD Month YYYY" e.g. "12 March 1980"
      { "^(%d?%d)%s+(%a+%.?)%s+(%d%d%d%d)$", fromDayMonthYear },
      -- "YYYY Month DD" e.g. "2000 July 12"
      { "^(%d%d%d%d)%s+(%a+%.?)%s+(%d?%d)$", fromYearMonthDay },
      -- "DDth Month YYYY" e.g. "12th July 2000"
      { "^(%d?%d)[a-zA-Z]+%s+(%a+%.?)%s+(%d%d%d%d)$", fromDayMonthYear },
      -- "Month DDth, YYYY" e.g. "July 12th, 2000"
      { "^(%a+%.?)%s+(%d?%d)[a-zA-Z]*[,]?%s*(%d%d%d%d)$", fromMonthDayYear },
      -- "27 Feb. 2014", "7-Feb.-2014" (unanchored: may sit inside other text)
      { "(%d?%d)[%- ](%a+%.?)[%- ](%d%d%d%d)", fromDayMonthYear },
      -- "Feb. 7, 2014" (unanchored). The separators are matched as a run of
      -- commas/spaces: a single-character class cannot consume the ", " that
      -- follows the day in this layout.
      { "(%a+%.?)[, ]+(%d?%d)[, ]+(%d%d%d%d)", fromMonthDayYear }
    },

    -- Numeric formats (without month names)
    numericFormats = {
      -- Compact form: "20000712" (YYYYMMDD)
      { "^(%d%d%d%d)(%d%d)(%d%d)$", function(y, m, d)
          return tonumber(y), tonumber(m), tonumber(d)
        end },
      -- YYYY-MM-DD or YYYY/MM/DD
      { "(%d%d%d%d)[%-/](%d?%d)[%-/](%d?%d)", function(y, m, d)
          return tonumber(y), tonumber(m), tonumber(d)
        end },
      -- DD-MM-YYYY or DD/MM/YYYY (day first; month-first input is not detected)
      { "(%d?%d)[%-/](%d?%d)[%-/](%d%d%d%d)", function(d, m, y)
          return tonumber(y), tonumber(m), tonumber(d)
        end },
      -- DD-MM-YY or DD/MM/YY; the two-digit year is always read as 20YY
      { "(%d?%d)[%-/](%d?%d)[%-/](%d%d)", function(d, m, y)
          return tonumber("20" .. y), tonumber(m), tonumber(d)
        end }
    },

    -- Year-only format
    yearOnly = {
      -- YYYY only
      { "^(%d%d%d%d)$", function(y)
          return tonumber(y), nil, nil
        end }
    }
  }
end

-- Render year/month/day as "Month DD, YYYY".
-- Returns nil when a part is missing, when os.time cannot represent the date,
-- or when the parts do not name a real calendar day. os.time normalizes
-- out-of-range fields instead of rejecting them (e.g. February 31 becomes a
-- day in March), so the timestamp is converted back and compared with the
-- requested parts; any difference means the input date does not exist.
local function formatCalendarDate(y, m, d)
  if not (y and m and d) then
    return nil
  end
  local timestamp = os.time{ year = y, month = m, day = d }
  if not timestamp then
    return nil
  end
  local roundTrip = os.date("*t", timestamp)
  if roundTrip.year ~= y or roundTrip.month ~= m or roundTrip.day ~= d then
    return nil
  end
  return os.date("%B %d, %Y", timestamp)
end

-- Try every { pattern, handler } entry of `group` in order against `text`.
-- Returns the formatted date of the first entry that both matches and yields
-- a valid calendar date, or nil when no entry does.
local function formatFromPatterns(group, text)
  for _, entry in ipairs(group) do
    local c1, c2, c3 = string.match(text, entry[1])
    if c1 then
      local formatted = formatCalendarDate(entry[2](c1, c2, c3))
      if formatted then
        return formatted
      end
    end
  end
  return nil
end

-- Store `result` under both the caller's original string and its
-- whitespace-normalized form, then return it. The cache is consulted with the
-- original string, so storing only the normalized key would make inputs with
-- stray whitespace miss on every call.
local function remember(rawKey, normalizedKey, result)
  dateCache[rawKey] = result
  dateCache[normalizedKey] = result
  return result
end

--- Normalize a free-form date string.
-- @param inputDate string to interpret; nil and "" are returned unchanged.
-- @return "Month DD, YYYY" when a complete, valid date is recognized; the
--   four-digit year as a string when only a year can be determined; otherwise
--   the input with leading/trailing whitespace removed and inner whitespace
--   runs collapsed to single spaces.
function p.formatDate(inputDate)
  if not inputDate or inputDate == "" then
    return inputDate
  end

  -- Check cache first for previously processed dates
  local rawInput = inputDate
  local cached = dateCache[rawInput]
  if cached then
    return cached
  end

  -- Normalize input: trim, then collapse whitespace runs. The parentheses
  -- discard gsub's second return value (the substitution count).
  inputDate = (inputDate:gsub("^%s*(.-)%s*$", "%1"):gsub("%s+", " "))

  cached = dateCache[inputDate]
  if cached then
    dateCache[rawInput] = cached
    return cached
  end

  -- Year-only input is the simplest case and is checked first.
  for _, entry in ipairs(patterns.yearOnly) do
    local captured = string.match(inputDate, entry[1])
    if captured then
      local y = entry[2](captured)
      if y then
        return remember(rawInput, inputDate, tostring(y))
      end
    end
  end

  -- Layouts with month names first, then purely numeric layouts.
  local formatted = formatFromPatterns(patterns.textFormats, inputDate)
    or formatFromPatterns(patterns.numericFormats, inputDate)
  if formatted then
    return remember(rawInput, inputDate, formatted)
  end

  -- Fallback: walk the alphanumeric tokens and guess each one's role.
  local year, month, day
  for token in inputDate:gmatch("%w+") do
    if token:find("^%d+$") then
      -- Only all-digit tokens count as numbers; tonumber alone would also
      -- accept forms such as "0x10" or "1e3".
      local num = tonumber(token)
      if num >= 1000 and num <= 3000 then
        year = num
      elseif num <= 31 and not day then
        day = num
      elseif num <= 12 and not month then
        month = num
      end
    else
      local mVal = months[token:sub(1, 1):upper() .. token:sub(2):lower()]
      if mVal then
        month = mVal
      end
    end

    -- Stop as soon as the collected parts form a valid date.
    if year and month and day then
      formatted = formatCalendarDate(year, month, day)
      if formatted then
        return remember(rawInput, inputDate, formatted)
      end
    end
  end

  -- If we only found a year but not month and day
  if year and not (month and day) then
    return remember(rawInput, inputDate, tostring(year))
  end

  -- Nothing usable: cache and return the whitespace-normalized input.
  return remember(rawInput, inputDate, inputDate)
end

-- Hand the export table (with formatDate attached) back to whoever loads
-- this module.
return p