-- Modul:URLutil
--
-- From Bayernflora
-- Jump to: navigation, search

-- Module table: `suite` carries the module name, `serial` its revision stamp.
local URLutil = {
    suite  = "URLutil",
    serial = "2015-12-05"
}

--[=[ Utilities for URL etc. on www.

  • getAuthority()
  • getFragment()
  • getHost()
  • getLocation()
  • getPath()
  • getPort()
  • getQuery()
  • getQueryTable()
  • getRelativePath()
  • getScheme()
  • getTLD()
  • getTop2domain()
  • getTop3domain()
  • getURIScheme()
  • isAuthority()
  • isDomain()
  • isDomainExample()
  • isDomainInt()
  • isHost()
  • isIP()
  • isIPlocal()
  • isIPv4()
  • isIPv6()
  • isMailAddress()
  • isMailLink()
  • isProtocolDialog()
  • isProtocolMW()
  • isProtocolWiki()
  • isResourceURL()
  • isSuspiciousURL()
  • isUnescapedURL()
  • isWebURL()
  • wikiEscapeURL()

Only dotted decimal notation is supported for IPv4; dotted hexadecimal, dotted octal, and single-number formats are not supported. Bracketed IPv6 URLs are not yet implemented; they might need wiki syntax escaping anyway. ]=]


-- Extract the scheme at the start of a URI; the rest is not validated.
-- Parameter:
--     uri  -- any value; only strings are examined
-- Returns:
--     the lowercased scheme (two or more letters followed by ":"),
--     "//" when the text starts protocol-relative,
--     false otherwise
URLutil.getURIScheme = function ( uri )
    if type( uri ) ~= "string" then
        return false
    end
    -- Every part of the pattern is optional, so all captures are strings.
    local scheme, colon, slashes = uri:match( "^%s*([a-zA-Z]*)(:?)(/?/?)" )
    if colon == ":" and scheme:len() >= 2 then
        return scheme:lower()
    end
    if slashes == "//" and scheme == "" then
        return "//"
    end
    return false
end -- getURIScheme()


-- Retrieve the last two (or three) labels of the host of a URL.
-- Parameters:
--     url   -- URL handed over to URLutil.getHost()
--     mode  -- 3 asks for three labels; anything else yields two
-- Returns:
--     the matching domain string, or false if there is no host
--     or the host has too few labels
local getTopDomain = function ( url, mode )
    local host = URLutil.getHost( url )
    if not host then
        return false
    end
    -- The opening parenthesis of the capture is supplied in the match call.
    local labels = "[%w%%]+%.%a[%w-]*%a)$"
    if mode == 3 then
        labels = "[%w%%]+%." .. labels
    end
    -- A leading dot guarantees that the first label is dot-delimited, too.
    local found = mw.ustring.match( "." .. host, "%.(" .. labels )
    return found or false
end -- getTopDomain()


-- Extract the authority part (host, optionally ":port") of a URL.
-- Parameter:
--     url  -- any value; only strings are examined
-- Returns:
--     "host" or "host:port" with the host in lower case,
--     nil   if url is a string without an acceptable authority,
--     false if url is not a string
URLutil.getAuthority = function ( url )
    if type( url ) ~= "string" then
        return false
    end
    local text = mw.text.decode( url )
    -- Drop any fragment; the search for "#" starts at the 6th character.
    local hash = text:find( "#", 6, true )
    if hash then
        text = text:sub( 1, hash - 1 )
    end
    -- The pattern below expects a slash terminating the authority.
    text = text .. "/"
    local host, colon, port =
        mw.ustring.match( text, "^%s*%w*:?//([%w%.%%-]+)(:?)([%d]*)/" )
    if not URLutil.isHost( host ) then
        return nil
    end
    host = mw.ustring.lower( host )
    if colon == ":" then
        -- A colon requires a port starting with a non-zero digit.
        if port:find( "^[1-9]" ) then
            return host .. ":" .. port
        end
        return nil
    end
    if #port == 0 then
        return host
    end
    return nil
end -- URLutil.getAuthority()


-- Retrieve the fragment (text after the first "#") of a URL.
-- Parameters:
--     url     -- any value; only strings are examined
--     decode  -- optional string: "%" requests percent-decoding,
--                "WIKI" additionally maps ".XX" to "%XX" and "_" to space
-- Returns:
--     the fragment without the leading "#" (may be empty),
--     false if the string contains no "#",
--     nil   if url is not a string
URLutil.getFragment = function ( url, decode )
    if type( url ) ~= "string" then
        return nil
    end
    local text = mw.text.decode( url )
    local hash = text:find( "#", 1, true )
    if not hash then
        return false
    end
    local fragment = mw.text.trim( text:sub( hash ) ):sub( 2 )
    if type( decode ) == "string" then
        local encoding = mw.text.trim( decode )
        local percent = ( encoding == "%" )
        if encoding == "WIKI" then
            -- Turn the wiki anchor encoding into percent encoding first.
            fragment = fragment:gsub( "%.(%x%x)", "%%%1" )
            fragment = fragment:gsub( "_", " " )
            percent = true
        end
        if percent then
            fragment = mw.uri.decode( fragment, "PATH" )
        end
    end
    return fragment
end -- URLutil.getFragment()


-- Retrieve the host of a URL, i.e. the authority without any port.
-- Parameter:
--     url  -- passed through to URLutil.getAuthority()
-- Returns:
--     the host string; nil if the authority does not match the host
--     pattern; otherwise the falsy result of getAuthority() unchanged
URLutil.getHost = function ( url )
    local authority = URLutil.getAuthority( url )
    if not authority then
        return authority
    end
    -- Parentheses limit the result to the single capture.
    return ( mw.ustring.match( authority, "^([%w%.%%_-]+):?[%d]*$" ) )
end -- URLutil.getHost()


-- Retrieve a URL without its fragment.
-- Parameter:
--     url  -- any value; only strings are examined
-- Returns:
--     the trimmed, entity-decoded text in front of the first "#",
--     false if the text is empty or starts with "#",
--     nil   if url is not a string
URLutil.getLocation = function ( url )
    if type( url ) ~= "string" then
        return nil
    end
    local text = mw.text.trim( url )
    if text == "" then
        return false
    end
    text = mw.text.decode( text )
    local hash = text:find( "#", 1, true )
    if hash == 1 then
        -- Nothing but a fragment.
        return false
    elseif hash then
        return text:sub( 1, hash - 1 )
    end
    return text
end -- URLutil.getLocation()


-- Retrieve the path of a URL, without query and fragment.
-- Parameter:
--     url  -- passed through to URLutil.getRelativePath()
-- Returns:
--     the path string, or the falsy result of getRelativePath() unchanged
URLutil.getPath = function ( url )
    local path = URLutil.getRelativePath( url )
    if path then
        -- Cut at the first "?", then at the first "#", if present.
        path = path:match( "^([^%?]*)%?" ) or path
        path = path:match( "^([^#]*)#" ) or path
    end
    return path
end -- URLutil.getPath()


-- Retrieve the port number given explicitly in a URL.
-- Parameter:
--     url  -- passed through to URLutil.getAuthority()
-- Returns:
--     the port as a number,
--     false if the authority carries no port,
--     otherwise the falsy result of getAuthority() unchanged
URLutil.getPort = function ( url )
    local authority = URLutil.getAuthority( url )
    if not authority then
        return authority
    end
    local digits = authority:match( ":([1-9][0-9]*)$" )
    if digits then
        return tonumber( digits )
    end
    return false
end -- URLutil.getPort()


-- Retrieve the query string of a URL, or the value of a single parameter.
-- Parameters:
--     url        -- passed through to URLutil.getLocation()
--     key        -- optional string: name of a single query parameter;
--                   surrounding whitespace is ignored and the name is
--                   matched literally
--     separator  -- optional string: one of & ; , / (default "&");
--                   only used together with key
-- Returns:
--     without key: the text after the first "?" (fragment removed),
--     with key:    the value assigned to that parameter (may be empty),
--     false        if there is no query or the key is not found,
--     otherwise the falsy result of getLocation() unchanged
URLutil.getQuery = function ( url, key, separator )
    local r = URLutil.getLocation( url )
    if r then
        r = r:match( "^[^%?]*%?(.+)$" )
        if r and type( key ) == "string" then
            -- Use the trimmed key and escape punctuation, so that names
            -- such as "a.b" or "x-y" are not read as pattern syntax.
            local single = mw.text.trim( key ):gsub( "%p", "%%%0" )
            local sep = "&"
            if type( separator ) == "string" then
                local wish = mw.text.trim( separator )
                if wish:match( "^[&;,/]$" ) then
                    sep = wish
                end
            end
            -- Wrap the query in separators, so that the first and the
            -- last assignment are delimited like all the others.
            local s = string.format( "%s%s%s", sep, r, sep )
            local scan = string.format( "%s%s=([^%s]*)%s",
                                        sep, single, sep, sep )
            r = s:match( scan )
        end
        if not r then
            r = false
        end
    end
    return r
end -- URLutil.getQuery()


-- Retrieve all query parameters of a URL as a table.
-- Parameters:
--     url        -- passed through to URLutil.getQuery()
--     separator  -- optional string: one of & ; , / (default "&")
-- Returns:
--     a table mapping each parameter name to its value string;
--     items without "=" after their first character map to false;
--     otherwise the falsy result of getQuery() unchanged
URLutil.getQueryTable = function ( url, separator )
    local query = URLutil.getQuery( url )
    if not query then
        return query
    end
    local sep = "&"
    if type( separator ) == "string" then
        local wish = mw.text.trim( separator )
        if wish:match( "^[&;,/]$" ) then
            sep = wish
        end
    end
    local result = { }
    for _, item in ipairs( mw.text.split( query, sep, true ) ) do
        if item:find( "=", 2, true ) then
            local name, value = item:match( "^([^=]+)=(.*)$" )
            if name then
                result[ name ] = value
            end
        else
            result[ item ] = false
        end
    end -- for item
    return result
end -- URLutil.getQueryTable()


-- Retrieve everything that follows the authority of a URL.
-- Parameter:
--     url  -- any value; only strings are examined
-- Returns:
--     the trimmed text starting at the first slash after the authority
--     (including any query and fragment),
--     "/"   if nothing follows but URLutil.isResourceURL() accepts the URL,
--     false if no path could be detected,
--     nil   if url is not a string
URLutil.getRelativePath = function ( url )
    if type( url ) ~= "string" then
        return nil
    end
    local rest = url:match( "^%s*[a-zA-Z]*://(.*)$" )
    if rest then
        -- Skip the authority following "scheme://".
        rest = rest:match( "[^/]+(/.*)$" )
    else
        local extra
        extra, rest = url:match( "^%s*(/?)(/.*)$" )
        if extra == "/" then
            -- Protocol-relative "//authority/path": skip the authority.
            rest = rest:match( "/[^/]+(/.*)$" )
        end
    end
    if rest then
        local path = mw.text.trim( rest )
        return path
    end
    if URLutil.isResourceURL( url ) then
        return "/"
    end
    return false
end -- URLutil.getRelativePath()


-- Retrieve the scheme of a URL that continues with "//".
-- Parameter:
--     url  -- any value; only strings are examined
-- Returns:
--     the lowercased scheme followed by "://" (three or more letters),
--     "//"  for a protocol-relative URL,
--     false if the text does not start that way,
--     nil   if url is not a string
URLutil.getScheme = function ( url )
    if type( url ) ~= "string" then
        return nil
    end
    local scheme, colon, slashes = url:match( "^%s*([a-zA-Z]*)(:?)(//)" )
    if slashes ~= "//" then
        -- No match at all.
        return false
    end
    if colon == ":" then
        if #scheme > 2 then
            return scheme:lower() .. "://"
        end
    elseif #scheme == 0 then
        return "//"
    end
    return false
end -- URLutil.getScheme()


-- Retrieve the top-level domain of the host of a URL.
-- Parameter:
--     url  -- passed through to URLutil.getHost()
-- Returns:
--     the last label of the host,
--     false if the host does not end with a matching label,
--     otherwise the falsy result of getHost() unchanged
URLutil.getTLD = function ( url )
    local host = URLutil.getHost( url )
    if not host then
        return host
    end
    return mw.ustring.match( host, "[%w]+%.(%a[%w-]*%a)$" ) or false
end -- URLutil.getTLD()


-- Retrieve the last two labels of the host of a URL.
-- Returns the domain string, or false if not available.
URLutil.getTop2domain = function ( url )
    local domain = getTopDomain( url, 2 )
    return domain
end -- URLutil.getTop2domain()


-- Retrieve the last three labels of the host of a URL.
-- Returns the domain string, or false if not available.
URLutil.getTop3domain = function ( url )
    local domain = getTopDomain( url, 3 )
    return domain
end -- URLutil.getTop3domain()


-- Check whether a string is a host, optionally followed by ":port".
-- Parameter:
--     s  -- any value; only strings are examined
-- Returns:
--     the result of URLutil.isHost() for the host part, provided that
--     the port part is acceptable,
--     false if the string does not match, or if a colon is not followed
--     by a port number starting with a non-zero digit,
--     nil   if s is not a string
URLutil.isAuthority = function ( s )
    local r
    if type( s ) == "string" then
        local pattern = "^%s*([%w%.%%-]+)(:?)(%d*)%s*$"
        local host, colon, port = mw.ustring.match( s, pattern )
        if not host then
            r = false
        elseif colon == ":" then
            -- A port verdict must not be overwritten by the host check.
            if port:match( "^[1-9][0-9]*$" ) then
                r = URLutil.isHost( host )
            else
                r = false
            end
        elseif port ~= "" then
            r = false
        else
            r = URLutil.isHost( host )
        end
    else
        r = nil
    end
    return r
end -- URLutil.isAuthority()


-- Check whether a string looks like a domain name.
-- Parameter:
--     s  -- any value; only strings are examined
-- Returns:
--     true  if the name matches and contains no two consecutive dots,
--     false if the name matches but contains "..",
--     nil   if s is not a string, does not match the domain pattern,
--           or does not start with an alphanumeric character
URLutil.isDomain = function ( s )
    if type( s ) ~= "string" then
        return nil
    end
    local scan = "^%s*([%w%.%%-]+%w)%.(%a[%w-]*%a)%s*$"
    -- Only the part in front of the last label is inspected further.
    local sub = mw.ustring.match( s, scan )
    if type( sub ) == "string" and mw.ustring.find( sub, "^%w" ) then
        return not mw.ustring.find( sub, "..", 1, true )
    end
    return nil
end -- URLutil.isDomain()


-- Check whether the host of a URL belongs to a reserved example domain.
-- RFC 2606: example.com example.net example.org example.edu
-- Parameter:
--     url  -- passed through to getTopDomain()
-- Returns:
--     true or false; false as well if no domain is available
URLutil.isDomainExample = function ( url )
    local top = getTopDomain( url, 2 )
    if not top then
        return top
    end
    local reserved = { com = true, edu = true, net = true, org = true }
    local tld = top:lower():match( "^example%.([a-z][a-z][a-z])$" )
    if tld then
        return ( reserved[ tld ] == true )
    end
    return false
end -- URLutil.isDomainExample()


-- Check for an Internationalized Domain Name (Punycode).
-- Parameter:
--     url  -- passed through to URLutil.getHost()
-- Returns:
--     true  if the host contains characters outside "!" to "~",
--           or one of its labels starts with "xn--",
--     false for any other host,
--     otherwise the falsy result of getHost() unchanged
URLutil.isDomainInt = function ( url )
    local host = URLutil.getHost( url )
    if not host then
        return host
    end
    if not host:match( "^[!-~]+$" ) then
        return true
    end
    -- A leading dot lets the first label be found like the others.
    local dotted = "." .. host
    return ( dotted:find( ".xn--", 1, true ) ~= nil )
end -- URLutil.isDomainInt()


-- Check whether a string is a domain name or an IP address.
-- Returns the truthy result of URLutil.isDomain(), otherwise whatever
-- URLutil.isIP() returns.
URLutil.isHost = function ( s )
    local verdict = URLutil.isDomain( s )
    if not verdict then
        verdict = URLutil.isIP( s )
    end
    return verdict
end -- URLutil.isHost()


-- Detect the IP version of an address.
-- Returns 4 for IPv4, 6 for IPv6, false otherwise.
URLutil.isIP = function ( s )
    if URLutil.isIPv4( s ) then
        return 4
    end
    if URLutil.isIPv6( s ) then
        return 6
    end
    return false
end -- URLutil.isIP()


-- Check whether an IPv4 address is local, private or link-local.
-- IPv4 according to RFC 1918, RFC 1122, RFC 3927;
-- even any 0.*.*.* (RFC 5735)
-- Parameter:
--     s  -- any value; only strings are examined
-- Returns:
--     true  for 0.*.*.*, 10.*.*.*, 127.*.*.*, 169.254.*.*,
--           172.16.*.* ... 172.31.*.* and 192.168.*.*
--     false otherwise, including non-string input
URLutil.isIPlocal = function ( s )
    if type( s ) ~= "string" then
        return false
    end

    -- Second octet as a number, given the expected first octet; else nil.
    local function second( first )
        local digits = s:match( "^ *0*" .. first .. "%.([0-9]+)%." )
        return digits and tonumber( digits )
    end

    local r = false
    -- All ranges of interest have a first octet starting with 0 or 1.
    local num = s:match( "^ *([01][0-9]*)%." )
    if num then
        num = tonumber( num )
        if num == 0 then
            r = s:match( "^ *0+%.[0-9]+%.[0-9]+%.[0-9]+ *$" )
        elseif num == 10  or  num == 127 then
            -- loopback; private/local host: 127.0.0.1
            r = URLutil.isIPv4( s )
        elseif num == 169 then
            -- 169.254.*.* (link-local)
            r = ( second( 169 ) == 254 )  and  URLutil.isIPv4( s )
        elseif num == 172 then
            -- 172.(16...31).*.*
            local n = second( 172 )
            r = n  and  n >= 16  and  n <= 31  and  URLutil.isIPv4( s )
        elseif num == 192 then
            -- 192.168.*.*
            r = ( second( 192 ) == 168 )  and  URLutil.isIPv4( s )
        end
    end
    -- Always answer with a plain boolean.
    return r and true or false
end -- URLutil.isIPlocal()


-- Check whether a string is an IPv4 address in dotted decimal notation.
-- Parameter:
--     s  -- any value; only strings are examined
-- Returns:
--     true if four dot-separated numbers below 256 are found, the first
--     one not starting with zero; false otherwise
URLutil.isIPv4 = function ( s )
    if type( s ) ~= "string" then
        return false
    end
    local octets = { s:match( "^%s*([1-9][0-9]?[0-9]?)%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%s*$" ) }
    if #octets ~= 4 then
        -- No match: the table is empty.
        return false
    end
    for i = 1, 4 do
        if tonumber( octets[ i ] ) >= 256 then
            return false
        end
    end -- for i
    return true
end -- URLutil.isIPv4()


-- Check whether a string is an IPv6 address in colon-hexadecimal notation.
-- Parameter:
--     s  -- any value; only non-empty strings are examined
-- Returns:
--     true  if s consists of eight groups of one to four hex digits, or
--           of fewer than eight groups with exactly one "::" abbreviation
--     false otherwise
URLutil.isIPv6 = function ( s )

   local dcolon, groups
   if type( s ) ~= "string"
       or s:len() == 0
       or s:find( "[^:%x]" ) -- only colon and hex digits are legal chars
       or s:find( "^:[^:]" ) -- can begin or end with :: but not with single :
       or s:find( "[^:]:$" )
       or s:find( ":::" )
   then
       return false
   end
   -- NOTE(review): whitespace has already been rejected by the character
   -- check above, so this trim appears to have no effect -- confirm.
   s = mw.text.trim( s )
   -- Collapse each "::" into ":" and count how many there were.
   s, dcolon = s:gsub( "::", ":" )
   if dcolon > 1 then
       return false
   end -- at most one ::
   s = s:gsub( "^:?", ":" ) -- ensure exactly one leading colon, so that every group is preceded by ":"
   s, groups = s:gsub( ":%x%x?%x?%x?", "" ) -- remove valid groups, and count them
   -- Full form needs 8 groups, abbreviated form fewer; nothing may be
   -- left over except the colon noted below.
   return ( ( dcolon == 1 and groups < 8 ) or
            ( dcolon == 0 and groups == 8 ) )
       and ( s:len() == 0 or ( dcolon == 1 and s == ":" ) ) -- might be one dangling : if original ended with ::

end -- URLutil.isIPv6()


-- Check whether a string looks like an e-mail address.
-- Parameter:
--     s  -- any value; only strings are examined
-- Returns:
--     the result of URLutil.isDomain() for the part after "@"
--     (nil if the string does not match at all),
--     false if s is not a string
URLutil.isMailAddress = function ( s )
    if type( s ) ~= "string" then
        return false
    end
    local domain = mw.ustring.match( s, "^%s*[%w%.%%_-]+@([%w%.%%-]+)%s*$" )
    return URLutil.isDomain( domain )
end -- URLutil.isMailAddress()


-- Check whether a string is a "mailto:" link with an e-mail address.
-- Parameter:
--     s  -- any value; only strings are examined
-- Returns:
--     the result of URLutil.isMailAddress() for the address part,
--     false if s is not a string or is no mailto link
URLutil.isMailLink = function ( s )
    if type( s ) ~= "string" then
        return false
    end
    local scheme, addr = mw.ustring.match( s, "^%s*([Mm][Aa][Ii][Ll][Tt][Oo]):(%S[%w%.%%_-]*@[%w%.%%-]+)%s*$" )
    if type( scheme ) == "string" and scheme:lower() == "mailto" then
        return URLutil.isMailAddress( addr )
    end
    return false
end -- URLutil.isMailLink()


-- Check whether a protocol specification is a member of a given list.
-- Parameters:
--     prot      -- any value; only strings are examined. Accepted forms
--                  are "scheme", "scheme:", "scheme://" and a bare "//"
--     supplied  -- string of lowercase scheme names, each one enclosed
--                  by single spaces
-- Returns:
--     true for a bare "//" or a listed scheme; false otherwise
local function isProtocolAccepted( prot, supplied )
    if type( prot ) ~= "string" then
        return false
    end
    local scheme, colon, slashes =
        mw.ustring.match( prot, "^%s*([a-zA-Z]*)(:?)(/?/?)%s*$" )
    if slashes == "/" then
        -- A single slash is never valid.
        return false
    end
    if scheme == "" then
        -- Protocol-relative: "//" only, without any colon.
        return ( colon ~= ":" and slashes == "//" )
    end
    if colon == ":" or slashes == "" then
        local hit = supplied:match( " " .. scheme:lower() .. " " )
        return ( type( hit ) == "string" )
    end
    return false
end -- isProtocolAccepted()


-- Check whether a protocol is contained in this module's list of
-- schemes for MediaWiki.
-- Returns true or false; see isProtocolAccepted() for accepted forms.
URLutil.isProtocolMW = function ( prot )
    local schemes = " http https ftp ftps ssh sftp irc ircs xmpp sip sips gopher telnet nntp worldwind mailto tel sms news svn git mms bitcoin magnet urn geo "
    return isProtocolAccepted( prot, schemes )
end -- URLutil.isProtocolMW()


-- Check whether a protocol is one of: mailto irc ircs ssh telnet.
-- Returns true or false; see isProtocolAccepted() for accepted forms.
URLutil.isProtocolDialog = function ( prot )
    local schemes = " mailto irc ircs ssh telnet "
    return isProtocolAccepted( prot, schemes )
end -- URLutil.isProtocolDialog()


-- Check whether a protocol is one of:
-- ftp ftps git http https nntp sftp svn worldwind.
-- Returns true or false; see isProtocolAccepted() for accepted forms.
URLutil.isProtocolWiki = function ( prot )
    local schemes = " ftp ftps git http https nntp sftp svn worldwind "
    return isProtocolAccepted( prot, schemes )
end -- URLutil.isProtocolWiki()


-- Check whether a URL addresses a resource via //, http, https, ftp
-- or sftp.
-- Parameter:
--     url  -- passed through to URLutil.getScheme()
-- Returns:
--     true if the scheme is listed, an authority is present and the
--     text has no whitespace between non-whitespace; false otherwise
URLutil.isResourceURL = function ( url )
    local scheme = URLutil.getScheme( url )
    if not scheme then
        return false
    end
    local known = " // http:// https:// ftp:// sftp:// "
    if known:find( string.format( " %s ", scheme ) )
       and URLutil.getAuthority( url )
       and not url:match( "%S%s+%S" ) then
        return true
    end
    return false
end -- URLutil.isResourceURL()


-- Check whether a URL is likely to cause trouble in wikitext.
-- Parameter:
--     url  -- passed through to URLutil.isResourceURL()
-- Returns:
--     true  if url is no resource URL at all, or if it contains "@" in
--           the authority, two consecutive apostrophes, a square bracket
--           or pipe, a character in U+2009...U+200F, U+202A...U+202F or
--           U+2060, or if it ends with "." or ","
--     false otherwise
URLutil.isSuspiciousURL = function ( url )
    if not URLutil.isResourceURL( url ) then
        return true
    end
    local authority = URLutil.getAuthority( url )
    -- Character class with code point ranges (45 is the hyphen); it has
    -- to be evaluated by mw.ustring, since string.find works on bytes.
    local pat = "[%[|%]" ..
                mw.ustring.char( 8201, 45, 8207, 8234, 45, 8239, 8288 )
                .. "]"
    -- The apostrophe test must search for a non-empty text; an empty
    -- pattern matches everywhere and would flag every URL.
    if authority:find( "@", 1, true )
       or url:find( "''", 1, true )
       or mw.ustring.find( url, pat )
       or url:find( "[%.,]$" ) then
        return true
    end
    -- TODO  zero width character ??
    return false
end -- URLutil.isSuspiciousURL()


-- Check whether a web URL contains a square bracket or pipe symbol.
-- Parameters:
--     url       -- passed through to URLutil.isWebURL()
--     trailing  -- if this is a string, the answer is always false
-- Returns:
--     true if url is a web URL containing "[", "|" or "]";
--     false otherwise
URLutil.isUnescapedURL = function ( url, trailing )
    if type( trailing ) == "string" then
        return false
    end
    if URLutil.isWebURL( url ) and url:match( "[%[|%]]" ) then
        return true
    end
    return false
end -- URLutil.isUnescapedURL()


-- Check whether a URL has a scheme and an authority and no inner
-- whitespace.
-- Parameter:
--     url  -- passed through to URLutil.getScheme()
-- Returns:
--     true or false
URLutil.isWebURL = function ( url )
    local complete = URLutil.getScheme( url ) and URLutil.getAuthority( url )
    if complete and not url:match( "%S%s+%S" ) then
        return true
    end
    return false
end -- URLutil.isWebURL()


-- Escape the characters of a URL which have a meaning in wiki syntax.
-- "[", "|" and "]" are replaced by numeric character references.
-- Parameter:
--     url  -- any value; only strings are changed
-- Returns:
--     the escaped string; any other value is returned as it is
URLutil.wikiEscapeURL = function ( url )
    if type( url ) == "string" and url:find( "[%[|%]]" ) then
        -- Replacing a character by itself would change nothing;
        -- entities are required to take effect.
        url = url:gsub( "%[", "&#91;" )
                 :gsub( "|", "&#124;" )
                 :gsub( "%]", "&#93;" )
    end
    return url
end -- URLutil.wikiEscapeURL()


-- Provide template access and expose URLutil table to require

local p = {}

-- Template results are text: a missing result becomes the empty string.
local function asText( value )
    return value or ""
end

-- Template results are text: truthy becomes "1", falsy the empty string.
local function asFlag( value )
    return value and "1" or ""
end


-- Each function below receives a frame and forwards frame.args[ 1 ]
-- (and further arguments where noted) to the URLutil function of the
-- same name.

function p.getURIScheme( frame )
    return asText( URLutil.getURIScheme( frame.args[ 1 ] ) )
end


function p.getAuthority( frame )
    return asText( URLutil.getAuthority( frame.args[ 1 ] ) )
end


-- Second argument: decoding mode. The result is prefixed by "#".
function p.getFragment( frame )
    local fragment = URLutil.getFragment( frame.args[ 1 ], frame.args[ 2 ] )
    if fragment then
        return "#" .. fragment
    end
    return ""
end


function p.getHost( frame )
    return asText( URLutil.getHost( frame.args[ 1 ] ) )
end


function p.getLocation( frame )
    return asText( URLutil.getLocation( frame.args[ 1 ] ) )
end


function p.getPath( frame )
    return asText( URLutil.getPath( frame.args[ 1 ] ) )
end


function p.getPort( frame )
    return asText( URLutil.getPort( frame.args[ 1 ] ) )
end


-- Second argument: parameter name; third argument: separator.
-- Without a parameter name the entire query is returned, prefixed by "?".
function p.getQuery( frame )
    local key = frame.args[ 2 ]
    if key then
        key = mw.text.trim( key )
        if key == "" then
            key = nil
        end
    end
    local query = URLutil.getQuery( frame.args[ 1 ], key, frame.args[ 3 ] )
    if not query then
        return ""
    end
    if key then
        return query
    end
    return "?" .. query
end


function p.getRelativePath( frame )
    return asText( URLutil.getRelativePath( frame.args[ 1 ] ) )
end


function p.getScheme( frame )
    return asText( URLutil.getScheme( frame.args[ 1 ] ) )
end


function p.getTLD( frame )
    return asText( URLutil.getTLD( frame.args[ 1 ] ) )
end


function p.getTop2domain( frame )
    return asText( URLutil.getTop2domain( frame.args[ 1 ] ) )
end


function p.getTop3domain( frame )
    return asText( URLutil.getTop3domain( frame.args[ 1 ] ) )
end


function p.isAuthority( frame )
    return asFlag( URLutil.isAuthority( frame.args[ 1 ] ) )
end


function p.isDomain( frame )
    return asFlag( URLutil.isDomain( frame.args[ 1 ] ) )
end


function p.isDomainExample( frame )
    return asFlag( URLutil.isDomainExample( frame.args[ 1 ] ) )
end


function p.isDomainInt( frame )
    return asFlag( URLutil.isDomainInt( frame.args[ 1 ] ) )
end


function p.isHost( frame )
    return asFlag( URLutil.isHost( frame.args[ 1 ] ) )
end


-- Yields the IP version (4 or 6) rather than "1".
function p.isIP( frame )
    return asText( URLutil.isIP( frame.args[ 1 ] ) )
end


function p.isIPlocal( frame )
    return asFlag( URLutil.isIPlocal( frame.args[ 1 ] ) )
end


function p.isIPv4( frame )
    return asFlag( URLutil.isIPv4( frame.args[ 1 ] ) )
end


function p.isIPv6( frame )
    return asFlag( URLutil.isIPv6( frame.args[ 1 ] ) )
end


function p.isMailAddress( frame )
    return asFlag( URLutil.isMailAddress( frame.args[ 1 ] ) )
end


function p.isMailLink( frame )
    return asFlag( URLutil.isMailLink( frame.args[ 1 ] ) )
end


function p.isProtocolMW( frame )
    return asFlag( URLutil.isProtocolMW( frame.args[ 1 ] ) )
end


function p.isProtocolDialog( frame )
    return asFlag( URLutil.isProtocolDialog( frame.args[ 1 ] ) )
end


function p.isProtocolWiki( frame )
    return asFlag( URLutil.isProtocolWiki( frame.args[ 1 ] ) )
end


function p.isResourceURL( frame )
    return asFlag( URLutil.isResourceURL( frame.args[ 1 ] ) )
end


function p.isSuspiciousURL( frame )
    return asFlag( URLutil.isSuspiciousURL( frame.args[ 1 ] ) )
end


-- Second argument is forwarded as well.
function p.isUnescapedURL( frame )
    return asFlag( URLutil.isUnescapedURL( frame.args[ 1 ], frame.args[ 2 ] ) )
end


function p.isWebURL( frame )
    return asFlag( URLutil.isWebURL( frame.args[ 1 ] ) )
end


function p.wikiEscapeURL( frame )
    return URLutil.wikiEscapeURL( frame.args[ 1 ] )
end


-- Expose the URLutil table itself to modules using require().
function p.URLutil()
    return URLutil
end

return p