-- Modul:URLutil
-- Module table: every URLutil.* function defined below is stored in it.
-- `suite` carries the module name; `serial` looks like a release date stamp
-- (NOTE(review): confirm how `serial` is consumed, nothing in this file reads it).
local URLutil = { suite = "URLutil",
serial = "2015-12-05" };
--[=[ Utilities for URL etc. on www.
- getAuthority()
- getFragment()
- getHost()
- getLocation()
- getPath()
- getPort()
- getQuery()
- getQueryTable()
- getRelativePath()
- getScheme()
- getTLD()
- getTop2domain()
- getTop3domain()
- getURIScheme()
- isAuthority()
- isDomain()
- isDomainExample()
- isDomainInt()
- isHost()
- isIP()
- isIPlocal()
- isIPv4()
- isIPv6()
- isMailAddress()
- isMailLink()
- isProtocolDialog()
- isProtocolMW()
- isProtocolWiki()
- isResourceURL()
- isSuspiciousURL()
- isUnescapedURL()
- isWebURL()
- wikiEscapeURL()
Only dotted decimal notation for IPv4 is supported. Dotted hexadecimal, dotted octal, and single-number formats are not supported. IPv6 URLs (bracketed) are not yet implemented; they might need wiki syntax escaping anyway. ]=]
-- Retrieve the scheme of a URI, without the trailing colon or slashes.
--     uri  -- string with the URI; leading whitespace is ignored
-- Returns the lowercased scheme name when it has at least two letters and is
-- followed by ":", the string "//" for a protocol-relative reference,
-- or false otherwise (including non-string input).
function URLutil.getURIScheme(uri)
    if type(uri) ~= "string" then
        return false
    end
    -- Every element of the pattern is optional, so the captures are always strings.
    local name, sep, lead = uri:match("^%s*([a-zA-Z]*)(:?)(/?/?)")
    if sep == ":" and name:len() > 1 then
        return name:lower()
    end
    if name == "" and lead == "//" then
        return "//"
    end
    return false
end -- getURIScheme()
-- Retrieve the last two or three labels of the host of a URL.
--     url   -- string with the URL
--     mode  -- 3 to return three labels; any other value returns two
-- Returns the matching domain string, or false when the URL has no host
-- or the host does not end in a domain name (e.g. it is an IP address).
local function getTopDomain(url, mode)
    local host = URLutil.getHost(url)
    if not host then
        return false
    end
    -- Labels may contain hyphens ("foo-bar.com"); the previous label class
    -- omitted "-" and therefore failed on such hosts.
    local label = "[%w%%%-]+"
    local pattern = label .. "%.%a[%w%-]*%a)$"
    if mode == 3 then
        pattern = label .. "%." .. pattern
    end
    -- A dot is prepended so that the first label can be anchored on "." too.
    return mw.ustring.match("." .. host, "%.(" .. pattern) or false
end -- getTopDomain()
-- Retrieve the authority ("host" or "host:port") of a URL.
--     url  -- string with the URL; HTML entities are decoded first
-- Returns the lowercased authority string, false when url is not a string,
-- or nil when no valid authority is found.
function URLutil.getAuthority(url)
    if type(url) ~= "string" then
        return false
    end
    local text = mw.text.decode(url)
    -- Drop a fragment; the search starts at position 6.
    local hash = text:find("#", 6, true)
    if hash then
        text = text:sub(1, hash - 1)
    end
    -- The pattern requires a slash after the authority, so always supply one.
    text = text .. "/"
    local host, colon, port =
        mw.ustring.match(text, "^%s*%w*:?//([%w%.%%-]+)(:?)([%d]*)/")
    if not URLutil.isHost(host) then
        return nil
    end
    host = mw.ustring.lower(host)
    if colon == ":" then
        -- A port must not start with zero (and must not be empty).
        if port:find("^[1-9]") then
            return host .. ":" .. port
        end
        return nil
    end
    if #port == 0 then
        return host
    end
    return nil
end -- URLutil.getAuthority()
-- Retrieve the fragment of a URL (the part after the first "#").
--     url     -- string with the URL; HTML entities are decoded first
--     decode  -- optional string: "%" to percent-decode the fragment,
--                "WIKI" to first turn ".XX" into "%XX" and "_" into a space
--                and then percent-decode; other values leave it unchanged
-- Returns the fragment without the leading "#", false when there is no "#",
-- or nil when url is not a string.
function URLutil.getFragment(url, decode)
    if type(url) ~= "string" then
        return nil
    end
    local text = mw.text.decode(url)
    local hash = text:find("#", 1, true)
    if not hash then
        return false
    end
    local fragment = mw.text.trim(text:sub(hash)):sub(2)
    if type(decode) == "string" then
        local encoding = mw.text.trim(decode)
        local percent = (encoding == "%")
        if encoding == "WIKI" then
            fragment = fragment:gsub("%.(%x%x)", "%%%1"):gsub("_", " ")
            percent = true
        end
        if percent then
            fragment = mw.uri.decode(fragment, "PATH")
        end
    end
    return fragment
end -- URLutil.getFragment()
-- Retrieve the host of a URL, i.e. its authority without a port.
--     url  -- string with the URL
-- Returns the host string; a falsy result of getAuthority() is passed
-- through unchanged, and nil is returned if the authority does not match.
function URLutil.getHost(url)
    local authority = URLutil.getAuthority(url)
    if not authority then
        return authority
    end
    return (mw.ustring.match(authority, "^([%w%.%%_-]+):?[%d]*$"))
end -- URLutil.getHost()
-- Retrieve a URL without its fragment.
--     url  -- string with the URL; trimmed and HTML-entity-decoded
-- Returns the text before the first "#" (or the whole text when there is
-- none), false when the input is empty or starts with "#", or nil when url
-- is not a string.
function URLutil.getLocation(url)
    if type(url) ~= "string" then
        return nil
    end
    local text = mw.text.trim(url)
    if text == "" then
        return false
    end
    text = mw.text.decode(text)
    local hash = text:find("#", 1, true)
    if hash == 1 then
        return false
    end
    if hash then
        return text:sub(1, hash - 1)
    end
    return text
end -- URLutil.getLocation()
-- Retrieve the path of a URL, without query and fragment.
--     url  -- string with the URL
-- Returns the relative path cut before the first "?" and then before the
-- first "#"; a falsy result of getRelativePath() is passed through unchanged.
function URLutil.getPath(url)
    local path = URLutil.getRelativePath(url)
    if not path then
        return path
    end
    path = path:match("^([^%?]*)%?") or path
    path = path:match("^([^#]*)#") or path
    return path
end -- URLutil.getPath()
-- Retrieve the port number of a URL.
--     url  -- string with the URL
-- Returns the port as a number, false when the authority has no port;
-- a falsy result of getAuthority() is passed through unchanged.
function URLutil.getPort(url)
    local authority = URLutil.getAuthority(url)
    if not authority then
        return authority
    end
    local digits = authority:match(":([1-9][0-9]*)$")
    if digits then
        return tonumber(digits)
    end
    return false
end -- URLutil.getPort()
-- Retrieve the query of a URL, or the value of one query parameter.
--     url        -- string with the URL
--     key        -- optional string: name of the parameter to look up;
--                   surrounding whitespace is ignored
--     separator  -- optional string: one of "&", ";", "," or "/" separating
--                   the parameters; defaults to "&"
-- Returns the whole query (without "?") when no key is given, the value of
-- the parameter when key is given, false when there is no query or no such
-- parameter; a falsy result of getLocation() is passed through unchanged.
function URLutil.getQuery(url, key, separator)
    local query = URLutil.getLocation(url)
    if not query then
        return query
    end
    query = query:match("^[^%?]*%?(.+)$")
    if query and type(key) == "string" then
        local sep = "&"
        if type(separator) == "string" then
            local candidate = mw.text.trim(separator)
            if candidate:match("^[&;,/]$") then
                sep = candidate
            end
        end
        -- Use the trimmed key (it was computed but ignored before) and escape
        -- punctuation so that keys such as "a.b" or "x[]" are matched
        -- literally rather than interpreted as Lua pattern syntax.
        local single = mw.text.trim(key)
        local needle = single:gsub("%p", "%%%0")
        local scan = string.format("%s%s=([^%s]*)%s", sep, needle, sep, sep)
        -- Wrap the query in separators so first and last parameters match too.
        query = (sep .. query .. sep):match(scan)
    end
    return query or false
end -- URLutil.getQuery()
-- Retrieve all query parameters of a URL as a table.
--     url        -- string with the URL
--     separator  -- optional string: one of "&", ";", "," or "/" separating
--                   the parameters; defaults to "&"
-- Returns a table mapping each parameter name to its value string; an item
-- without "=" after its first character maps to false. A falsy result of
-- getQuery() is passed through unchanged.
function URLutil.getQueryTable(url, separator)
    local query = URLutil.getQuery(url)
    if not query then
        return query
    end
    local sep = "&"
    if type(separator) == "string" then
        local candidate = mw.text.trim(separator)
        if candidate:match("^[&;,/]$") then
            sep = candidate
        end
    end
    local result = { }
    for _, item in ipairs(mw.text.split(query, sep, true)) do
        if item:find("=", 2, true) then
            local name, value = item:match("^([^=]+)=(.*)$")
            if name then
                result[name] = value
            end
        else
            result[item] = false
        end
    end
    return result
end -- URLutil.getQueryTable()
-- Retrieve the part of a URL that follows the authority.
--     url  -- string with the URL (with scheme, protocol-relative "//host/..."
--             or starting with a single "/")
-- Returns the trimmed path including query and fragment, "/" when there is
-- no path but the URL is a resource URL, false otherwise, or nil when url
-- is not a string.
function URLutil.getRelativePath(url)
    if type(url) ~= "string" then
        return nil
    end
    local path
    local rest = url:match("^%s*[a-zA-Z]*://(.*)$")
    if rest then
        -- Skip the authority that follows "scheme://".
        path = rest:match("[^/]+(/.*)$")
    else
        local lead, tail = url:match("^%s*(/?)(/.*)$")
        if lead == "/" then
            -- Protocol-relative: skip "/host" after the first slash.
            path = tail:match("/[^/]+(/.*)$")
        else
            path = tail
        end
    end
    if path then
        return mw.text.trim(path)
    end
    if URLutil.isResourceURL(url) then
        return "/"
    end
    return false
end -- URLutil.getRelativePath()
-- Retrieve the scheme of a URL, including the trailing "//".
--     url  -- string with the URL; leading whitespace is ignored
-- Returns the lowercased scheme followed by "://" when it has more than two
-- letters, "//" for a protocol-relative URL, false otherwise, or nil when
-- url is not a string.
function URLutil.getScheme(url)
    if type(url) ~= "string" then
        return nil
    end
    local prot, colon, slashes = url:match("^%s*([a-zA-Z]*)(:?)(//)")
    if slashes ~= "//" then
        return false
    end
    if colon == ":" then
        if #prot > 2 then
            return prot:lower() .. "://"
        end
        return false
    end
    if #prot == 0 then
        return "//"
    end
    return false
end -- URLutil.getScheme()
-- Retrieve the top-level domain of the host of a URL.
--     url  -- string with the URL
-- Returns the last label of the host, false when the host does not end in
-- a domain name; a falsy result of getHost() is passed through unchanged.
function URLutil.getTLD(url)
    local host = URLutil.getHost(url)
    if not host then
        return host
    end
    return mw.ustring.match(host, "[%w]+%.(%a[%w-]*%a)$") or false
end -- URLutil.getTLD()
-- Retrieve the last two labels of the host of a URL.
--     url  -- string with the URL
-- Returns whatever getTopDomain() yields in two-label mode.
function URLutil.getTop2domain(url)
    local labels = 2
    return getTopDomain(url, labels)
end -- URLutil.getTop2domain()
-- Retrieve the last three labels of the host of a URL.
--     url  -- string with the URL
-- Returns whatever getTopDomain() yields in three-label mode.
function URLutil.getTop3domain(url)
    local labels = 3
    return getTopDomain(url, labels)
end -- URLutil.getTop3domain()
-- Check whether a string is a URL authority ("host" or "host:port").
--     s  -- string to check; surrounding whitespace is ignored
-- Returns the result of isHost() for the host part when the optional port
-- is well-formed (digits, not starting with zero), false when the string
-- does not have the shape of an authority or the port is invalid, or nil
-- when s is not a string.
function URLutil.isAuthority(s)
    if type(s) ~= "string" then
        return nil
    end
    local host, colon, port =
        mw.ustring.match(s, "^%s*([%w%.%%-]+)(:?)(%d*)%s*$")
    if not host then
        return false
    end
    -- A rejected port must stay rejected: previously the verdict was
    -- overwritten by the host check, so "example.com:0" was accepted.
    if colon == ":" then
        if not port:match("^[1-9][0-9]*$") then
            return false
        end
    elseif port ~= "" then
        return false
    end
    return URLutil.isHost(host)
end -- URLutil.isAuthority()
-- Check whether a string is a domain name.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns true for a domain name, false when it contains two consecutive
-- dots, or nil when s is not a string or does not look like a domain at all.
function URLutil.isDomain(s)
    if type(s) ~= "string" then
        return nil
    end
    -- "*" rather than "+" before the final %w: the part in front of the TLD
    -- may be a single character ("t.co", "x.org"), which was rejected before.
    local name = mw.ustring.match(s, "^%s*([%w%.%%-]*%w)%.(%a[%w-]*%a)%s*$")
    if type(name) ~= "string" then
        return nil
    end
    -- The name must start with a letter or digit.
    if not mw.ustring.find(name, "^%w") then
        return nil
    end
    if mw.ustring.find(name, "..", 1, true) then
        return false
    end
    return true
end -- URLutil.isDomain()
-- Check whether the host of a URL is a reserved example domain.
-- RFC 2606: example.com example.net example.org example.edu
--     url  -- string with the URL
-- Returns true for one of the four example domains, false otherwise
-- (including URLs without a usable two-label domain).
function URLutil.isDomainExample(url)
    local top = getTopDomain(url, 2)
    if not top then
        return top
    end
    local tld = top:lower():match("^example%.([a-z][a-z][a-z])$")
    if not tld then
        return false
    end
    return tld == "com" or tld == "edu" or tld == "net" or tld == "org"
end -- URLutil.isDomainExample()
-- Check whether the host of a URL is an Internationalized Domain Name
-- (Punycode).
--     url  -- string with the URL
-- Returns true when the host contains characters outside "!" to "~" or
-- has a label starting with "xn--", false for any other host; a falsy
-- result of getHost() is passed through unchanged.
function URLutil.isDomainInt(url)
    local host = URLutil.getHost(url)
    if not host then
        return host
    end
    if not host:match("^[!-~]+$") then
        return true
    end
    -- A dot is prepended so that a leading "xn--" label is found as well.
    local dotted = "." .. host
    return dotted:find(".xn--", 1, true) ~= nil
end -- URLutil.isDomainInt()
-- Check whether a string is a host: a domain name or an IP address.
--     s  -- string to check
-- Returns the truthy result of isDomain() if there is one, otherwise the
-- result of isIP().
function URLutil.isHost(s)
    local domain = URLutil.isDomain(s)
    if domain then
        return domain
    end
    return (URLutil.isIP(s))
end -- URLutil.isHost()
-- Check whether a string is an IP address and tell its version.
--     s  -- string to check
-- Returns 4 for an IPv4 address, 6 for an IPv6 address, or the falsy
-- result of isIPv6() otherwise.
function URLutil.isIP(s)
    if URLutil.isIPv4(s) then
        return 4
    end
    return URLutil.isIPv6(s) and 6
end -- URLutil.isIP()
-- Check whether a string is a local/private IPv4 address.
-- IPv4 according to RFC 1918, RFC 1122; even any 0.0.0.0 (RFC 5735)
--     s  -- string to check
-- Returns true for 0.*.*.*, 10.*.*.*, 127.*.*.*, 172.16-31.*.* and
-- 192.168.*.*; false for everything else, including non-string input.
function URLutil.isIPlocal(s)
    if type(s) ~= "string" then
        return false
    end
    local lead = s:match("^ *([01][0-9]*)%.")
    if not lead then
        return false
    end
    local first = tonumber(lead)
    if first == 0 then
        return s:match("^ *0+%.[0-9]+%.[0-9]+%.[0-9]+ *$") ~= nil
    end
    if first == 10 or first == 127 then
        -- loopback; private/local host: 127.0.0.1
        return URLutil.isIPv4(s) and true or false
    end
    -- Remaining ranges depend on the second octet.
    local low, high
    if first == 172 then
        -- 172.(16...31).*.*
        low, high = 16, 31
    elseif first == 192 then
        -- 192.168.*.* (this branch used to test an undefined variable
        -- and therefore never ran)
        low, high = 168, 168
    else
        -- Anything else is not local; 169.254.*.* had only an empty
        -- placeholder branch and is likewise reported as not local.
        return false
    end
    local second = s:match("^ *0*" .. first .. "%.([0-9]+)%.")
    second = second and tonumber(second)
    if second and second >= low and second <= high then
        return URLutil.isIPv4(s) and true or false
    end
    return false
end -- URLutil.isIPlocal()
-- Check whether a string is an IPv4 address in dotted decimal notation.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns true when the string consists of four dot-separated numbers
-- accepted by the pattern, each below 256; false otherwise.
function URLutil.isIPv4(s)
    if type(s) ~= "string" then
        return false
    end
    local octets = { s:match("^%s*([1-9][0-9]?[0-9]?)%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%s*$") }
    -- No match yields an empty table; a match always yields four captures.
    if #octets ~= 4 then
        return false
    end
    for i = 1, 4 do
        if tonumber(octets[i]) >= 256 then
            return false
        end
    end
    return true
end -- URLutil.isIPv4()
-- Check whether a string is an IPv6 address (without brackets).
--     s  -- string to check
-- Returns true for eight groups of one to four hex digits, or fewer groups
-- with exactly one "::" abbreviation; false otherwise.
function URLutil.isIPv6(s)
    if type(s) ~= "string" then
        return false
    end
    if s:len() == 0 then
        return false
    end
    if s:find("[^:%x]")       -- only colon and hex digits are legal chars
       or s:find("^:[^:]")    -- can begin or end with :: but not with single :
       or s:find("[^:]:$")
       or s:find(":::") then
        return false
    end
    local text = mw.text.trim(s)
    local doubles, groups
    text, doubles = text:gsub("::", ":")
    if doubles > 1 then
        return false          -- at most one ::
    end
    text = text:gsub("^:?", ":")                     -- prepend : if needed
    text, groups = text:gsub(":%x%x?%x?%x?", "")     -- remove valid groups, and count them
    local countFits
    if doubles == 1 then
        countFits = groups < 8
    else
        countFits = (groups == 8)
    end
    if not countFits then
        return false
    end
    -- Nothing may remain, except one dangling : if the original ended with ::
    return text:len() == 0 or (doubles == 1 and text == ":")
end -- URLutil.isIPv6()
-- Check whether a string is an e-mail address ("local@domain").
--     s  -- string to check; surrounding whitespace is ignored
-- Returns the result of isDomain() for the part after "@" (nil when the
-- string does not have the shape of an address), or false when s is not
-- a string.
function URLutil.isMailAddress(s)
    if type(s) ~= "string" then
        return false
    end
    local domain = mw.ustring.match(s, "^%s*[%w%.%%_-]+@([%w%.%%-]+)%s*$")
    return URLutil.isDomain(domain)
end -- URLutil.isMailAddress()
-- Check whether a string is a "mailto:" link with a valid address.
--     s  -- string to check; surrounding whitespace is ignored
-- Returns the result of isMailAddress() for the address after "mailto:"
-- (case-insensitive), or false when the string has a different shape or
-- is not a string.
function URLutil.isMailLink(s)
    if type(s) ~= "string" then
        return false
    end
    local scheme, addr = mw.ustring.match(s, "^%s*([Mm][Aa][Ii][Ll][Tt][Oo]):(%S[%w%.%%_-]*@[%w%.%%-]+)%s*$")
    if type(scheme) ~= "string" then
        return false
    end
    if scheme:lower() ~= "mailto" then
        return false
    end
    return URLutil.isMailAddress(addr)
end -- URLutil.isMailLink()
-- Check whether a protocol specification is contained in a list.
--     prot      -- string such as "http", "http:", "http://" or "//";
--                  surrounding whitespace is ignored
--     supplied  -- string with space-separated lowercase scheme names,
--                  starting and ending with a space
-- Returns true for the bare "//" and for a scheme (alone, or followed by
-- ":" with zero or two slashes) that is listed in supplied; false otherwise.
local function isProtocolAccepted(prot, supplied)
    if type(prot) ~= "string" then
        return false
    end
    local scheme, colon, slashes =
        mw.ustring.match(prot, "^%s*([a-zA-Z]*)(:?)(/?/?)%s*$")
    -- A single slash is never acceptable.
    if slashes == "/" then
        return false
    end
    if scheme == "" then
        -- Protocol-relative: exactly "//" without a colon.
        return colon ~= ":" and slashes == "//"
    end
    if colon == ":" or slashes == "" then
        local found = supplied:match(" " .. scheme:lower() .. " ")
        return type(found) == "string"
    end
    return false
end -- isProtocolAccepted()
-- Check whether a protocol is in the broad list of schemes kept below.
--     prot  -- string with the protocol, e.g. "http", "http:" or "http://"
-- Returns true when accepted (see isProtocolAccepted()), false otherwise.
function URLutil.isProtocolMW(prot)
    local accepted = " http https ftp ftps ssh sftp irc ircs xmpp sip sips gopher telnet nntp worldwind mailto tel sms news svn git mms bitcoin magnet urn geo "
    return isProtocolAccepted(prot, accepted)
end -- URLutil.isProtocolMW()
-- Check whether a protocol is one of mailto, irc, ircs, ssh or telnet.
--     prot  -- string with the protocol, e.g. "irc", "irc:" or "irc://"
-- Returns true when accepted (see isProtocolAccepted()), false otherwise.
function URLutil.isProtocolDialog(prot)
    local accepted = " mailto irc ircs ssh telnet "
    return isProtocolAccepted(prot, accepted)
end -- URLutil.isProtocolDialog()
-- Check whether a protocol is one of ftp, ftps, git, http, https, nntp,
-- sftp, svn or worldwind.
--     prot  -- string with the protocol, e.g. "http", "http:" or "http://"
-- Returns true when accepted (see isProtocolAccepted()), false otherwise.
function URLutil.isProtocolWiki(prot)
    local accepted = " ftp ftps git http https nntp sftp svn worldwind "
    return isProtocolAccepted(prot, accepted)
end -- URLutil.isProtocolWiki()
-- Check whether a URL addresses a resource via //, http, https, ftp or sftp.
--     url  -- string with the URL
-- Returns true when the scheme is one of those listed, an authority is
-- present and no whitespace occurs between non-space characters;
-- false otherwise.
function URLutil.isResourceURL(url)
    local scheme = URLutil.getScheme(url)
    if not scheme then
        return false
    end
    local known = " // http:// https:// ftp:// sftp:// "
    if not known:find(string.format(" %s ", scheme)) then
        return false
    end
    if not URLutil.getAuthority(url) then
        return false
    end
    return not url:match("%S%s+%S")
end -- URLutil.isResourceURL()
-- Check whether a URL looks broken or unsafe to use in a wiki link.
--     url  -- string with the URL
-- Returns true when url is not a resource URL, or when it contains "@" in
-- the authority, two consecutive apostrophes, a square bracket or pipe,
-- one of the listed Unicode space/format characters, or ends with "." or
-- ","; false otherwise.
function URLutil.isSuspiciousURL(url)
    if not URLutil.isResourceURL(url) then
        return true
    end
    local authority = URLutil.getAuthority(url)
    -- Code point 45 is "-", so the class covers U+2009..U+200F,
    -- U+202A..U+202F and U+2060 next to "[", "|" and "]".
    local pat = "[%[|%]"
                .. mw.ustring.char(8201, 45, 8207, 8234, 45, 8239, 8288)
                .. "]"
    if authority:find("@", 1, true) then
        return true
    end
    -- The pattern here was an empty string, which matches every URL and made
    -- the whole function return true unconditionally.
    -- NOTE(review): restored as two apostrophes (wikitext emphasis markup),
    -- which is presumably what was lost from the literal -- please confirm.
    if url:find("''", 1, true) then
        return true
    end
    -- The class contains multi-byte characters, so it has to be matched per
    -- code point; byte-wise string.find would treat their UTF-8 bytes as
    -- separate class members and ranges.
    if mw.ustring.find(url, pat) then
        return true
    end
    if url:find("[%.,]$") then
        return true
    end
    -- TODO zero width character ??
    return false
end -- URLutil.isSuspiciousURL()
-- Check whether a web URL contains "[", "|" or "]".
--     url       -- string with the URL
--     trailing  -- when this is a string, the check is skipped and the
--                  result is false
-- Returns true when url is a web URL containing one of those characters;
-- false otherwise.
function URLutil.isUnescapedURL(url, trailing)
    if type(trailing) == "string" then
        return false
    end
    if not URLutil.isWebURL(url) then
        return false
    end
    return url:match("[%[|%]]") ~= nil
end -- URLutil.isUnescapedURL()
-- Check whether a string is a URL with scheme and authority.
--     url  -- string with the URL
-- Returns true when getScheme() and getAuthority() both succeed and no
-- whitespace occurs between non-space characters; false otherwise.
function URLutil.isWebURL(url)
    local addressable = URLutil.getScheme(url) and URLutil.getAuthority(url)
    if not addressable then
        return false
    end
    return not url:match("%S%s+%S")
end -- URLutil.isWebURL()
-- Escape the characters "[", "|" and "]" in a URL so that they cannot be
-- taken for wiki link syntax.
--     url  -- string with the URL
-- Returns the URL with each of the three characters replaced by its decimal
-- numeric character reference; any non-string value is returned unchanged.
function URLutil.wikiEscapeURL(url)
    if type(url) ~= "string" then
        return url
    end
    -- Each replacement used to be the very character it replaced, so nothing
    -- was escaped. The reference is built from the byte value:
    -- "[" -> 91, "|" -> 124, "]" -> 93.
    local escaped = url:gsub("[%[|%]]", function(char)
        return string.format("&#%d;", char:byte())
    end)
    return escaped
end -- URLutil.wikiEscapeURL()
-- Provide template access and expose URLutil table to require
-- Export table for template access. Every entry takes a frame and reads
-- its unnamed arguments from frame.args.
local p = {}

-- Build a wrapper that calls URLutil[name] with the first argument and
-- returns its result, or an empty string when the result is falsy.
local function valueOrEmpty(name)
    return function(frame)
        return URLutil[name](frame.args[1]) or ""
    end
end

-- Build a wrapper that calls URLutil[name] with the first argument and
-- returns "1" for a truthy result, or an empty string otherwise.
local function flag(name)
    return function(frame)
        return URLutil[name](frame.args[1]) and "1" or ""
    end
end

local valueFunctions = {
    "getURIScheme", "getAuthority", "getHost", "getLocation", "getPath",
    "getPort", "getRelativePath", "getScheme", "getTLD", "getTop2domain",
    "getTop3domain", "isIP",
}
for _, name in ipairs(valueFunctions) do
    p[name] = valueOrEmpty(name)
end

local flagFunctions = {
    "isAuthority", "isDomain", "isDomainExample", "isDomainInt", "isHost",
    "isIPlocal", "isIPv4", "isIPv6", "isMailAddress", "isMailLink",
    "isProtocolMW", "isProtocolDialog", "isProtocolWiki", "isResourceURL",
    "isSuspiciousURL", "isWebURL",
}
for _, name in ipairs(flagFunctions) do
    p[name] = flag(name)
end

-- Fragment of the URL in argument 1, with a leading "#"; argument 2 is the
-- optional decoding mode. Empty string when there is no fragment.
function p.getFragment(frame)
    local fragment = URLutil.getFragment(frame.args[1], frame.args[2])
    if fragment then
        return "#" .. fragment
    end
    return ""
end

-- Query of the URL in argument 1. With a non-blank key in argument 2 the
-- value of that parameter is returned; otherwise the whole query with a
-- leading "?". Argument 3 is the optional separator. Empty string when
-- nothing is found.
function p.getQuery(frame)
    local key = frame.args[2]
    if key then
        key = mw.text.trim(key)
        if key == "" then
            key = nil
        end
    end
    local query = URLutil.getQuery(frame.args[1], key, frame.args[3])
    if not query then
        return ""
    end
    if key then
        return query
    end
    return "?" .. query
end

-- "1" when the URL in argument 1 is unescaped; argument 2 is passed on as
-- the `trailing` parameter.
function p.isUnescapedURL(frame)
    return URLutil.isUnescapedURL(frame.args[1], frame.args[2]) and "1" or ""
end

-- Escaped form of the URL in argument 1.
function p.wikiEscapeURL(frame)
    return URLutil.wikiEscapeURL(frame.args[1])
end

-- Expose the URLutil table itself to other modules.
function p.URLutil()
    return URLutil
end

return p