1  require 'uri'
   2  
   3  module HTMLUtils
   4    ESC = {
   5      '&' => '&',
   6      '"' => '"',
   7      '<' => '&lt;',
   8      '>' => '&gt;'
   9    }
  10  
  11    def escape(str)
  12      table = ESC   # optimize
  13      str.gsub(/[&"<>]/n) {|s| table[s] }
  14    end
  15    
  16    def urlencode(str)
  17      str.gsub(/[^\w\.\-]/n) {|ch| sprintf('%%%02X', ch[0]) }
  18    end
  19  end
  20  
  21  class PukiWikiParser
  22    include HTMLUtils
  23  
  24    def initialize(logger)
  25      @logger = logger
  26      @h_start_level = 2
  27    end
  28  
  29    def to_html(src, page_names, base_uri = '/', suffix= '/')
  30      @page_names = page_names
  31      @base_uri = base_uri
  32      @pagelist_suffix = suffix
  33      @inline_re = nil   # invalidate cache
  34  
  35      buf = []
  36      lines = src.rstrip.split(/\r?\n/).map {|line| line.chomp }
  37      while lines.first
  38        case lines.first
  39        when ''
  40          lines.shift
  41        when /\A----/
  42          lines.shift
  43          buf.push '<hr />'
  44        when /\A\*/
  45          buf.push parse_h(lines.shift)
  46        when /\A\s/
  47          buf.concat parse_pre(take_block(lines, /\A\s/))
  48        when /\A>/
  49          buf.concat parse_quote(take_block(lines, /\A>/))
  50        when /\A-/
  51          buf.concat parse_list('ul', take_block(lines, /\A-/))
  52        when /\A\+/
  53          buf.concat parse_list('ol', take_block(lines, /\A\+/))
  54        when /\A:/
  55          buf.concat parse_dl(take_block(lines, /\A:/))
  56        else
  57          buf.push '<p>'
  58          buf.concat parse_p(take_block(lines, /\A(?![*\s>:\-\+]|----|\z)/))
  59          buf.push '</p>'
  60        end
  61      end
  62      buf.join("\n")
  63    end
  64  
  65    private
  66  
  67    def take_block(lines, marker)
  68      buf = []
  69      until lines.empty?
  70        break unless marker =~ lines.first
  71        buf.push lines.shift.sub(marker, '')
  72      end
  73      buf
  74    end
  75  
  76    def parse_h(line)
  77      @logger.debug "h: #{line.inspect}"
  78      level = @h_start_level + (line.slice(/\A\*{1,4}/).length - 1)
  79      content = line.sub(/\A\*+/, '')
  80      "<h#{level}>#{parse_inline(content)}</h#{level}>"
  81    end
  82  
  83    def parse_list(type, lines)
  84      @logger.debug "#{type}: #{lines.inspect}"
  85      marker = ((type == 'ul') ? /\A-/ : /\A\+/)
  86      parse_list0(type, lines, marker)
  87    end
  88  
  89    def parse_list0(type, lines, marker)
  90      buf = ["<#{type}>"]
  91      closeli = nil
  92      until lines.empty?
  93        if marker =~ lines.first
  94          buf.concat parse_list0(type, take_block(lines, marker), marker)
  95        else
  96          buf.push closeli if closeli;  closeli = '</li>'
  97          buf.push "<li>#{parse_inline(lines.shift)}"
  98        end
  99      end
 100      buf.push closeli if closeli;  closeli = '</li>'
 101      buf.push "</#{type}>"
 102      buf
 103    end
 104  
 105    def parse_dl(lines)
 106      @logger.debug "dl: #{lines.inspect}"
 107      buf = ["<dl>"]
 108      lines.each do |line|
 109        dt, dd = *line.split('|', 2)
 110        buf.push "<dt>#{parse_inline(dt)}</dt>"
 111        buf.push "<dd>#{parse_inline(dd)}</dd>" if dd
 112      end
 113      buf.push "</dl>"
 114      buf
 115    end
 116  
 117    def parse_quote(lines)
 118      @logger.debug "quote: #{lines.inspect}"
 119      [ "<blockquote><p>", lines.join("\n"), "</p></blockquote>"]
 120    end
 121  
 122    def parse_pre(lines)
 123      @logger.debug "pre: #{lines.inspect}"
 124      [ "<pre><code>#{lines.map {|line| escape(line) }.join("\n")}",
 125        '</code></pre>']
 126    end
 127  
 128    def parse_p(lines)
 129      lines.map {|line| parse_inline(line) }
 130    end
 131  
 132    def parse_inline(str)
 133      @inline_re ||= %r<
 134          ([&<>"])                             # $1: HTML escape characters
 135        | \[\[(.+?):\s*(https?://\S+)\s*\]\]   # $2: label, $3: URI
 136        | (#{autolink_re()})                   # $4: Page name autolink
 137        | (#{URI.regexp('http')})              # $5...: URI autolink
 138        >x
 139      str.gsub(@inline_re) {
 140        case
 141        when htmlchar = $1 then escape(htmlchar)
 142        when bracket  = $2 then a_href($3, bracket, 'outlink')
 143        when pagename = $4 then a_href(page_uri(pagename), pagename, 'pagelink')
 144        when uri      = $5 then a_href(uri, uri, 'outlink')
 145        else
 146          raise 'must not happen'
 147        end
 148      }
 149    end
 150  
 151    def a_href(uri, label, cssclass)
 152      %Q[<a class="#{cssclass}" href="#{escape(uri)}">#{escape(label)}</a>]
 153    end
 154  
 155    def autolink_re
 156      Regexp.union(* @page_names.reject {|name| name.size <= 3 })
 157    end
 158  
 159    def page_uri(page_name)
 160      "#{@base_uri}#{urlencode(page_name)}#{@pagelist_suffix}"
 161    end
 162  end