lib/uri/common.rb


DEFINITIONS

This source file includes the following functions.


   1  #
   2  # $Id: common.rb,v 1.3 2002/06/12 09:31:41 akira Exp $
   3  #
   4  # Copyright (c) 2001 akira yamada <akira@ruby-lang.org>
   5  # You can redistribute it and/or modify it under the same term as Ruby.
   6  #
   7  
   8  =begin
   9  
  10  == URI
  11  
  12  =end
  13  
  14  module URI
  15    module REGEXP
  16      module PATTERN
  17        # RFC 2396 (URI Generic Syntax)
  18        # RFC 2732 (IPv6 Literal Addresses in URL's)
  19        # RFC 2373 (IPv6 Addressing Architecture)
  20  
  21        # alpha         = lowalpha | upalpha
  22        ALPHA = "a-zA-Z"
  23        # alphanum      = alpha | digit
  24        ALNUM = "#{ALPHA}\\d"
  25  
  26        # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
  27        #                         "a" | "b" | "c" | "d" | "e" | "f"
  28        HEX     = "a-fA-F\\d"
  29        # escaped       = "%" hex hex
  30        ESCAPED = "%[#{HEX}]{2}"
  31        # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
  32        #                 "(" | ")"
  33        # unreserved    = alphanum | mark
  34        UNRESERVED = "-_.!~*'()#{ALNUM}"
  35        # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
  36        #                 "$" | ","
  37        # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 
  38        #                 "$" | "," | "[" | "]" (RFC 2732)
  39        RESERVED = ";/?:@&=+$,\\[\\]"
  40  
  41        # uric          = reserved | unreserved | escaped
  42        URIC = "(?:[#{UNRESERVED}#{RESERVED}]|#{ESCAPED})"
  43        # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
  44        #                 "&" | "=" | "+" | "$" | ","
  45        URIC_NO_SLASH = "(?:[#{UNRESERVED};?:@&=+$,]|#{ESCAPED})"
  46        # query         = *uric
  47        QUERY = "#{URIC}*"
  48        # fragment      = *uric
  49        FRAGMENT = "#{URIC}*"
  50  
  51        # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
  52        DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
  53        # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
  54        TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
  55        # hostname      = *( domainlabel "." ) toplabel [ "." ]
  56        HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
  57  
  58        # RFC 2373, APPENDIX B:
  59        # IPv6address = hexpart [ ":" IPv4address ]
  60        # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
  61        # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
  62        # hexseq  = hex4 *( ":" hex4)
  63        # hex4    = 1*4HEXDIG
  64        #
  65        # XXX: This definition has a flaw. "::" + IPv4address must be
  66        # allowed too.  Here is a replacement.
  67        #
  68        # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
  69        IPV4ADDR = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
  70        # hex4     = 1*4HEXDIG
  71        HEX4 = "[#{HEX}]{1,4}"
  72        # lastpart = hex4 | IPv4address
  73        LASTPART = "(?:#{HEX4}|#{IPV4ADDR})"
  74        # hexseq1  = *( hex4 ":" ) hex4
  75        HEXSEQ1 = "(?:#{HEX4}:)*#{HEX4}"
  76        # hexseq2  = *( hex4 ":" ) lastpart
  77        HEXSEQ2 = "(?:#{HEX4}:)*#{LASTPART}"
  78        # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
  79        IPV6ADDR = "(?:#{HEXSEQ2}|(?:#{HEXSEQ1})?::(?:#{HEXSEQ2})?)"
  80  
  81        # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
  82        # unused
  83  
  84        # ipv6reference = "[" IPv6address "]" (RFC 2732)
  85        IPV6REF = "\\[#{IPV6ADDR}\\]"
  86  
  87        # host          = hostname | IPv4address
  88        # host          = hostname | IPv4address | IPv6reference (RFC 2732)
  89        HOST = "(?:#{HOSTNAME}|#{IPV4ADDR}|#{IPV6REF})"
  90        # port          = *digit
  91        PORT = "\d*"
  92        # hostport      = host [ ":" port ]
  93        HOSTPORT = "#{HOST}(?:#{PORT})?"
  94  
  95        # userinfo      = *( unreserved | escaped |
  96        #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
  97        USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})*"
  98  
  99        # pchar         = unreserved | escaped |
 100        #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
 101        PCHAR = "(?:[#{UNRESERVED}:@&=+$,]|#{ESCAPED})"
 102        # param         = *pchar
 103        PARAM = "#{PCHAR}*"
 104        # segment       = *pchar *( ";" param )
 105        SEGMENT = "#{PCHAR}*(?:;#{PARAM})*"
 106        # path_segments = segment *( "/" segment )
 107        PATH_SEGMENTS = "#{SEGMENT}(?:/#{SEGMENT})*"
 108  
 109        # server        = [ [ userinfo "@" ] hostport ]
 110        SERVER = "(?:#{USERINFO}@)?#{HOSTPORT}"
 111        # reg_name      = 1*( unreserved | escaped | "$" | "," |
 112        #                     ";" | ":" | "@" | "&" | "=" | "+" )
 113        REG_NAME = "(?:[#{UNRESERVED}$,;+@&=+]|#{ESCAPED})+"
 114        # authority     = server | reg_name
 115        AUTHORITY = "(?:#{SERVER}|#{REG_NAME})"
 116  
 117        # rel_segment   = 1*( unreserved | escaped |
 118        #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
 119        REL_SEGMENT = "(?:[#{UNRESERVED};@&=+$,]|#{ESCAPED})+"
 120  
 121        # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 122        SCHEME = "[#{ALPHA}][-+.#{ALPHA}\\d]*"
 123  
 124        # abs_path      = "/"  path_segments
 125        ABS_PATH = "/#{PATH_SEGMENTS}"
 126        # rel_path      = rel_segment [ abs_path ]
 127        REL_PATH = "#{REL_SEGMENT}(?:#{ABS_PATH})?"
 128        # net_path      = "//" authority [ abs_path ]
 129        NET_PATH   = "//#{AUTHORITY}(?:#{ABS_PATH})?"
 130  
 131        # hier_part     = ( net_path | abs_path ) [ "?" query ]
 132        HIER_PART   = "(?:#{NET_PATH}|#{ABS_PATH})(?:\\?(?:#{QUERY}))?"
 133        # opaque_part   = uric_no_slash *uric
 134        OPAQUE_PART = "#{URIC_NO_SLASH}#{URIC}*"
 135  
 136        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
 137        ABS_URI   = "#{SCHEME}:(?:#{HIER_PART}|#{OPAQUE_PART})"
 138        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 139        REL_URI = "(?:#{NET_PATH}|#{ABS_PATH}|#{REL_PATH})(?:\\?#{QUERY})?"
 140  
 141        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 142        URI_REF = "(?:#{ABS_URI}|#{REL_URI})?(?:##{FRAGMENT})?"
 143  
 144        # XXX:
 145        X_ABS_URI = "
 146          (#{PATTERN::SCHEME}):                     (?# 1: scheme)
 147          (?:
 148             (?:(?:
 149               //(?:
 150                   (?:(?:(#{PATTERN::USERINFO})@)?  (?# 2: userinfo)
 151                     (?:(#{PATTERN::HOST})(?::(\\d*))?))?(?# 3: host, 4: port)
 152                 |
 153                   (#{PATTERN::REG_NAME})           (?# 5: registry)
 154                 ))?
 155                 ((?!//)#{PATTERN::ABS_PATH})?      (?# 6: path)
 156             )(?:\\?(#{PATTERN::QUERY}))?           (?# 7: query)
 157          |
 158             (#{PATTERN::OPAQUE_PART})              (?# 8: opaque)
 159          )
 160          (?:\\#(#{PATTERN::FRAGMENT}))?            (?# 9: fragment)
 161        "
 162        X_REL_URI = "
 163          (?:
 164            (?:
 165              //
 166              (?:
 167                (?:(#{PATTERN::USERINFO})@)?       (?# 1: userinfo)
 168                  (#{PATTERN::HOST})?(?::(\\d*))?  (?# 2: host, 3: port)
 169              |
 170                (#{PATTERN::REG_NAME})             (?# 4: registry)
 171              )
 172            )
 173          |
 174            (#{PATTERN::REL_SEGMENT})              (?# 5: rel_segment)
 175          )?
 176          (#{PATTERN::ABS_PATH})?                  (?# 6: abs_path)
 177          (?:\\?(#{PATTERN::QUERY}))?              (?# 7: query)
 178          (?:\\#(#{PATTERN::FRAGMENT}))?           (?# 8: fragment)
 179        "
 180      end # PATTERN
 181  
 182      # for URI::split
 183      ABS_URI = Regexp.new('^' + PATTERN::X_ABS_URI + '$', #'
 184                           Regexp::EXTENDED, 'N').freeze
 185      REL_URI = Regexp.new('^' + PATTERN::X_REL_URI + '$', #'
 186                           Regexp::EXTENDED, 'N').freeze
 187  
 188      # for URI::extract
 189      URI_REF     = Regexp.new(PATTERN::URI_REF, false, 'N').freeze
 190      ABS_URI_REF = Regexp.new(PATTERN::X_ABS_URI, Regexp::EXTENDED, 'N').freeze
 191      REL_URI_REF = Regexp.new(PATTERN::X_REL_URI, Regexp::EXTENDED, 'N').freeze
 192  
 193      # for URI::escape/unescape
 194      ESCAPED = Regexp.new(PATTERN::ESCAPED, false, 'N').freeze
 195      UNSAFE  = Regexp.new("[^#{PATTERN::UNRESERVED}#{PATTERN::RESERVED}]",
 196                           false, 'N').freeze
 197  
 198      # for Generic#initialize
 199      SCHEME   = Regexp.new("^#{PATTERN::SCHEME}$", false, 'N').freeze #"
 200      USERINFO = Regexp.new("^#{PATTERN::USERINFO}$", false, 'N').freeze #"
 201      HOST     = Regexp.new("^#{PATTERN::HOST}$", false, 'N').freeze #"
 202      PORT     = Regexp.new("^#{PATTERN::PORT}$", false, 'N').freeze #"
 203      OPAQUE   = Regexp.new("^#{PATTERN::OPAQUE_PART}$", false, 'N').freeze #"
 204      REGISTRY = Regexp.new("^#{PATTERN::REG_NAME}$", false, 'N').freeze #"
 205      ABS_PATH = Regexp.new("^#{PATTERN::ABS_PATH}$", false, 'N').freeze #"
 206      REL_PATH = Regexp.new("^#{PATTERN::REL_PATH}$", false, 'N').freeze #"
 207      QUERY    = Regexp.new("^#{PATTERN::QUERY}$", false, 'N').freeze #"
 208      FRAGMENT = Regexp.new("^#{PATTERN::FRAGMENT}$", false, 'N').freeze #"
 209    end # REGEXP
 210  
 211    module Util
 212      def make_components_hash(klass, array_hash)
 213        tmp = {}
 214        if array_hash.kind_of?(Array) &&
 215            array_hash.size == klass.component.size - 1
 216          klass.component[1..-1].each_index do |i|
 217            begin
 218              tmp[klass.component[i + 1]] = array_hash[i].clone
 219            rescue TypeError
 220              tmp[klass.component[i + 1]] = array_hash[i]
 221            end
 222          end
 223  
 224        elsif array_hash.kind_of?(Hash)
 225          array_hash.each do |key, value|
 226            begin
 227              tmp[key] = value.clone
 228            rescue TypeError
 229              tmp[key] = value
 230            end
 231          end
 232        else
 233          raise ArgumentError, 
 234            "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
 235        end
 236        tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
 237  
 238        return tmp
 239      end
 240      module_function :make_components_hash
 241    end
 242  
 243    module Escape
 244      include REGEXP
 245  
 246      def escape(str, unsafe = UNSAFE)
 247        unless unsafe.kind_of?(Regexp)
 248          # perhaps unsafe is String object
 249          unsafe = Regexp.new(Regexp.quote(unsafe), false, 'N')
 250        end
 251        str.gsub(unsafe) do |us|
 252          tmp = ''
 253          us.each_byte do |uc|
 254            tmp << sprintf('%%%02X', uc)
 255          end
 256          tmp
 257        end
 258      end
 259      alias encode escape
 260  
 261      def unescape(str)
 262        str.gsub(ESCAPED) do
 263          $&[1,2].hex.chr
 264        end
 265      end
 266      alias decode unescape
 267    end
 268  
 269    include REGEXP
 270    extend Escape
 271  
 272    @@schemes = {}
 273  
 274    class Error < StandardError; end
 275    class InvalidURIError < Error; end # it is not URI.
 276    class InvalidComponentError < Error; end # it is not component of URI.
 277    class BadURIError < Error; end # the URI is valid but it is bad for the position.
 278  
 279  =begin
 280  
 281  === Methods
 282  
 283  --- URI::split(uri)
 284  
 285  =end
 286  
 287    def self.split(uri)
 288      case uri
 289      when ''
 290        # null uri
 291  
 292      when ABS_URI
 293        scheme, userinfo, host, port, 
 294          registry, path, query, opaque, fragment = $~[1..-1]
 295  
 296        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 297  
 298        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
 299        # hier_part     = ( net_path | abs_path ) [ "?" query ]
 300        # opaque_part   = uric_no_slash *uric
 301  
 302        # abs_path      = "/"  path_segments
 303        # net_path      = "//" authority [ abs_path ]
 304  
 305        # authority     = server | reg_name
 306        # server        = [ [ userinfo "@" ] hostport ]
 307  
 308        if !scheme
 309          raise InvalidURIError, 
 310            "bad URI(absolute but no scheme): #{uri}"
 311        end
 312        if !opaque && (!path && (!host && !registry))
 313          raise InvalidURIError,
 314            "bad URI(absolute but no path): #{uri}" 
 315        end
 316  
 317      when REL_URI
 318        scheme = nil
 319        opaque = nil
 320  
 321        userinfo, host, port, registry, 
 322          rel_segment, abs_path, query, fragment = $~[1..-1]
 323        if rel_segment && abs_path
 324          path = rel_segment + abs_path
 325        elsif rel_segment
 326          path = rel_segment
 327        elsif abs_path
 328          path = abs_path
 329        end
 330  
 331        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 332  
 333        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 334  
 335        # net_path      = "//" authority [ abs_path ]
 336        # abs_path      = "/"  path_segments
 337        # rel_path      = rel_segment [ abs_path ]
 338  
 339        # authority     = server | reg_name
 340        # server        = [ [ userinfo "@" ] hostport ]
 341  
 342      else
 343        raise InvalidURIError, "bad URI(is not URI?): #{uri}"
 344      end
 345  
 346      path = '' if !path && !opaque # (see RFC2396 Section 5.2)
 347      ret = [
 348        scheme, 
 349        userinfo, host, port,     # X
 350        registry,                 # X
 351        path,                     # Y
 352        opaque,                   # Y
 353        query,
 354        fragment
 355      ]
 356      return ret
 357    end
 358  
 359  =begin
 360  
 361  --- URI::parse(uri_str)
 362  
 363  =end
 364    def self.parse(uri)
 365      scheme, userinfo, host, port, 
 366        registry, path, opaque, query, fragment = self.split(uri)
 367  
 368      if scheme && @@schemes.include?(scheme.upcase)
 369        @@schemes[scheme.upcase].new(scheme, userinfo, host, port, 
 370                                     registry, path, opaque, query, 
 371                                     fragment)
 372      else
 373        Generic.new(scheme, userinfo, host, port, 
 374                    registry, path, opaque, query, 
 375                    fragment)
 376      end
 377    end
 378  
 379  =begin
 380  
 381  --- URI::join(str[, str, ...])
 382  
 383  =end
 384    def self.join(*str)
 385      u = self.parse(str[0])
 386      str[1 .. -1].each do |x|
 387        u = u.merge(x)
 388      end
 389      u
 390    end
 391  
 392  =begin
 393  
 394  --- URI::extract(str[, schemes])
 395  
 396  =end
 397    def self.extract(str, schemes = [])
 398      urls = []
 399      if schemes.size > 0
 400        tmp = Regexp.new('(?:' + schemes.collect{|s| 
 401                           Regexp.quote(s + ':')
 402                         }.join('|') + ')', 
 403                         Regexp::IGNORECASE, 'N')
 404        str.scan(tmp) {
 405          tmp_str = $& + $'
 406          if ABS_URI_REF =~ tmp_str
 407            if block_given?
 408              yield($&)
 409            else
 410              urls << $&
 411            end
 412          end
 413        }
 414  
 415      else
 416        str.scan(ABS_URI_REF) {
 417          if block_given?
 418            yield($&)
 419          else
 420            urls << $&
 421          end
 422        }
 423      end
 424  
 425      if block_given?
 426        return nil
 427      else
 428        return urls
 429      end
 430    end
 431  
 432  end # URI