Jump to content

Module:Punycode

From ICANNWiki

Documentation for this module may be created at Module:Punycode/doc

-- Module:Punycode
-- Implements RFC3492 (Punycode) encoding and decoding.
-- Requires mw.ustring for proper Unicode support.

-- Module table; the public encode/decode functions are attached to it below
-- and it is returned at the end of the file.
local punycode = {}

--------------------------
-- Configuration Constants
--------------------------
-- Bootstring parameter values used by the encoder and decoder below.
-- These are the values RFC 3492 (section 5) specifies for Punycode.
local base         = 36    -- number of digit values: a-z map to 0..25, 0-9 map to 26..35
local tmin         = 1     -- lower bound of the per-digit threshold t
local tmax         = 26    -- upper bound of the per-digit threshold t
local skew         = 38    -- added to delta in the denominator of adapt()'s final term
local damp         = 700   -- divisor applied to delta on the first bias adaptation
local initial_bias = 72    -- bias value both encode and decode start from
local initial_n    = 128   -- 0x80; starting value of n (first non-ASCII code point)
local delimiter    = '-'   -- ASCII hyphen; separates the basic code points from the encoded digits

--------------------------
-- Helper functions for Unicode handling.
--------------------------
-- Converts a UTF-8 string to an array of Unicode code points.
-- Splits the string `s` into its characters (via mw.ustring, so multi-byte
-- UTF-8 sequences count as one character) and returns a sequence holding the
-- numeric code point of each character, in order.
local function toCodePoints(s)
    local points = {}
    local count = 0
    for ch in mw.ustring.gmatch(s, ".") do
        count = count + 1
        points[count] = mw.ustring.codepoint(ch)
    end
    return points
end

-- Converts an array of Unicode code points to a UTF-8 string.
-- Builds a string from the sequence `cps` of numeric code points: each code
-- point is turned into its character with mw.ustring.char and the pieces are
-- joined in order. An empty sequence yields the empty string.
local function fromCodePoints(cps)
    local pieces = {}
    for position, code_point in ipairs(cps) do
        pieces[position] = mw.ustring.char(code_point)
    end
    return table.concat(pieces)
end

--------------------------
-- Digit conversion functions
--------------------------
-- Maps a Punycode digit value to its one-character ASCII representation:
-- values below 26 become 'a'..'z', the remaining values continue from '0'
-- (so 26 -> '0', 35 -> '9').
local function digitToBasic(digit)
    -- Pick the byte that a digit value of zero would land on in each range.
    local anchor = (digit < 26) and string.byte('a') or (string.byte('0') - 26)
    return string.char(digit + anchor)
end

-- Maps the byte value `cp` of an ASCII character to its Punycode digit value.
-- '0'..'9' give 26..35; letters give 0..25 regardless of case. Any other
-- byte yields `base`, a value that is not a valid digit.
local function basicToDigit(cp)
    local zero, nine = string.byte('0'), string.byte('9')
    if zero <= cp and cp <= nine then
        return cp - zero + 26
    end

    local upper_a, upper_z = string.byte('A'), string.byte('Z')
    if upper_a <= cp and cp <= upper_z then
        return cp - upper_a
    end

    local lower_a, lower_z = string.byte('a'), string.byte('z')
    if lower_a <= cp and cp <= lower_z then
        return cp - lower_a
    end

    return base
end

--------------------------
-- Bias adaptation (RFC3492, Section 3.4)
--------------------------
-- Computes the next bias value after a delta has been encoded or decoded.
--   delta     : the delta that was just processed
--   numpoints : number of code points handled so far, including the new one
--   first     : truthy for the very first adaptation (delta is divided by
--               `damp` instead of 2)
-- Returns the new bias as an integer.
local function adapt(delta, numpoints, first)
    local divisor = first and damp or 2
    local scaled = math.floor(delta / divisor)
    scaled = scaled + math.floor(scaled / numpoints)

    local span = base - tmin
    local limit = (span * tmax) / 2
    local offset = 0
    -- Repeatedly shrink the delta until it fits under the limit, counting
    -- one `base` step per reduction.
    while scaled > limit do
        scaled = math.floor(scaled / span)
        offset = offset + base
    end

    return offset + math.floor(((span + 1) * scaled) / (scaled + skew))
end

--------------------------
-- Punycode Encoding Function
--------------------------
-- Encodes the string `input` as Punycode and returns the encoded ASCII string.
-- The ASCII code points of the input are emitted first, unchanged and in
-- order, followed by the delimiter if there was at least one of them. Every
-- remaining code point is then emitted, in increasing code point order, as a
-- variable-length integer ("delta") written with the base-36 digit alphabet.
function punycode.encode(input)
    local code_points = toCodePoints(input)
    local total = #code_points
    local out = {}

    -- Pass 1: basic code points (ASCII < 128) are copied verbatim.
    for idx = 1, total do
        local cp = code_points[idx]
        if cp < 128 then
            out[#out + 1] = mw.ustring.char(cp)
        end
    end

    local basic_count = #out
    if basic_count > 0 then
        out[#out + 1] = delimiter
    end

    local n = initial_n
    local bias = initial_bias
    local delta = 0
    local handled = basic_count   -- code points already represented in `out`

    while handled < total do
        -- Smallest code point that has not been handled yet (i.e. >= n).
        local next_cp = 0x7FFFFFFF
        for idx = 1, total do
            local cp = code_points[idx]
            if cp >= n and cp < next_cp then
                next_cp = cp
            end
        end

        -- Advance the decoder state <n, i> to <next_cp, 0>.
        delta = delta + (next_cp - n) * (handled + 1)
        n = next_cp

        for idx = 1, total do
            local cp = code_points[idx]
            if cp < n then
                delta = delta + 1
            elseif cp == n then
                -- Write delta as a generalized variable-length integer.
                local q = delta
                local k = base
                while true do
                    -- Threshold for this digit: k - bias, kept within [tmin, tmax].
                    local t = k - bias
                    if t < tmin then
                        t = tmin
                    elseif t > tmax then
                        t = tmax
                    end
                    if q < t then break end
                    local span = base - t
                    local rest = q - t
                    out[#out + 1] = digitToBasic(t + rest % span)
                    q = math.floor(rest / span)
                    k = k + base
                end
                out[#out + 1] = digitToBasic(q)
                bias = adapt(delta, handled + 1, handled == basic_count)
                delta = 0
                handled = handled + 1
            end
        end

        delta = delta + 1
        n = n + 1
    end

    return table.concat(out)
end

--------------------------
-- Punycode Decoding Function
--------------------------
-- Decodes the Punycode string `input` and returns the decoded string.
--
-- Everything before the LAST delimiter in `input` is taken as the literal
-- basic (ASCII) code points; everything after it is a sequence of
-- variable-length integers, each describing where to insert the next
-- non-basic code point. If `input` contains no delimiter, the whole string
-- is treated as encoded digits.
--
-- Raises an error (via error()) when:
--   * a byte before the delimiter is not a basic code point (>= 0x80),
--   * a character in the encoded part is not a valid Punycode digit,
--   * the input ends in the middle of a variable-length integer.
function punycode.decode(input)
    local cp_array = {}
    local input_len = #input

    -- Locate the last delimiter. The basic part may itself contain hyphens,
    -- so splitting on the first one would mis-decode such strings.
    local delim_byte = delimiter:byte()
    local d = 0
    for pos = input_len, 1, -1 do
        if input:byte(pos) == delim_byte then
            d = pos
            break
        end
    end

    -- Copy the basic code points that precede the delimiter.
    for pos = 1, d - 1 do
        local cp = input:byte(pos)
        if cp >= initial_n then
            error("Invalid input: non-basic code point before delimiter")
        end
        cp_array[#cp_array + 1] = cp
    end

    local n = initial_n
    local bias = initial_bias
    local i = 0
    local index = d + 1

    while index <= input_len do
        -- Decode one variable-length integer into a delta added to i.
        local oldi = i
        local w = 1
        local k = base
        while true do
            if index > input_len then
                error("Invalid input: punycode decode incomplete")
            end
            local digit = basicToDigit(input:byte(index))
            -- basicToDigit signals "not a digit" by returning base.
            if digit >= base then
                error("Invalid input: not a punycode digit")
            end
            index = index + 1
            i = i + digit * w
            local t
            if k <= bias then
                t = tmin
            elseif k >= bias + tmax then
                t = tmax
            else
                t = k - bias
            end
            if digit < t then break end
            w = w * (base - t)
            k = k + base
        end

        local out_len = #cp_array + 1
        bias = adapt(i - oldi, out_len, oldi == 0)
        -- i wraps around the output length; each wrap advances n by one.
        n = n + math.floor(i / out_len)
        i = i % out_len
        -- i is a zero-based position; table.insert is one-based.
        table.insert(cp_array, i + 1, n)
        i = i + 1
    end

    return fromCodePoints(cp_array)
end

-- Export the module table carrying punycode.encode and punycode.decode.
return punycode