# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser
module FeedParser
  # Event-driven, incremental parser for SGML-like markup.
  #
  # Text is supplied through #feed (any number of times) and flushed with
  # #close.  While scanning, the parser invokes hook methods on itself:
  #
  # * handle_data(text)            - character data
  # * handle_comment(text)         - body of a <!-- ... --> comment
  # * handle_special(text)         - body of a <! ... > declaration
  # * start_TAG(attrs) / end_TAG   - container element TAG (pushed on the
  #                                  open-element stack)
  # * do_TAG(attrs)                - element TAG handled without being
  #                                  pushed on the stack
  # * unknown_starttag, unknown_endtag, unknown_charref,
  #   unknown_entityref            - fallbacks
  #
  # All hooks defined in this class are no-ops; subclasses are expected to
  # define the ones they care about.  Tag handlers are looked up with
  # respond_to?, so they must be public.
  class SGMLParser
    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
    Charref = /&#([0-9]+);/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

    # Named entities that handle_entityref expands into character data.
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, report_unbalanced prints diagnostics.
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Discards any buffered input and returns the parser to its initial
    # state (empty element stack, tag recognition enabled).
    def reset
      # The buffer is appended to in place by #feed, so it must be a
      # mutable string rather than a (possibly frozen) literal.
      @rawdata = ''.dup
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
    end

    # Returns true when an element named +gi+ is on the open-element stack.
    def has_context(gi)
      @stack.include? gi
    end

    # Stops markup recognition for good: everything parsed from now on is
    # passed to handle_data unchanged.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enters literal mode: start tags, comments and declarations are
    # reported as character data until the next end tag is parsed.
    # Arguments are accepted and ignored.
    def setliteral(*args)
      @literal = true
    end

    # Appends +data+ to the buffer and parses as much of it as is complete.
    # Trailing input that may be an unfinished construct stays buffered.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Parses the buffer to the end, emitting any leftover input as data.
    def close
      goahead(true)
    end

    # Main scanning loop over the buffered input.
    #
    # _end:: when true, whatever could not be parsed as markup is flushed
    #        through handle_data instead of being kept for the next feed.
    #
    # Raises RuntimeError if the scanner finds itself on a character that
    # is neither '<' nor '&' (an internal inconsistency).
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          handle_data(rawdata[i..(n-1)])
          i = n
          break
        end
        # Everything up to the next '<' or '&' is plain character data.
        j = rawdata.index(Interesting, i)
        j = n unless j
        if i < j
          handle_data(rawdata[i..(j-1)])
        end
        i = j
        break if (i == n)
        if rawdata[i] == ?< #
          if rawdata.index(Starttagopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_starttag(i)
            break unless k # tag not complete yet; wait for more input
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false # any end tag terminates literal mode
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              handle_data(rawdata[i,1])
              i += 1
              next
            end
            k = parse_comment(i) # returns a length, not a position
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_special(i) # returns a length, not a position
            break unless k
            i += k
            next
          end
        elsif rawdata[i] == ?& #
          if rawdata.index(Charref, i) == i
            i += $&.length
            handle_charref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
          if rawdata.index(Entityref, i) == i
            i += $&.length
            handle_entityref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        j = match + $&.length
        break if j == n # Really incomplete
        handle_data(rawdata[i..(j-1)])
        i = j
      end
      # end while
      if _end && i < n
        handle_data(@rawdata[i..(n-1)])
        i = n
      end
      # Keep only the unparsed tail for the next call.
      @rawdata = rawdata[i..-1]
    end

    # Parses the comment starting at index +i+ of the buffer and passes its
    # body to handle_comment.
    #
    # Returns the number of characters consumed, or nil when the comment
    # terminator has not been seen yet.
    # Raises RuntimeError when the buffer does not hold '<!--' at +i+.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      match = rawdata.index(Commentclose, i)
      return nil unless match
      matched_length = $&.length
      handle_comment(rawdata[i+4..(match-1)])
      match + matched_length - i
    end

    # Parses the start tag beginning at index +i+ of the buffer, collects
    # its attributes as [name, value] pairs (names lower-cased, surrounding
    # quotes removed, valueless attributes given '') and dispatches through
    # finish_starttag.
    #
    # Returns the index just past the tag, or nil when no closing bracket
    # is buffered yet.
    # Raises RuntimeError when no tag name can be found after +i+.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i+1] == ?> #
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        match = rawdata.index(Tagfind, i + 1)
        unless match
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        k = i + 1 + ($&.length)
        tag = $&.downcase
        @lasttag = tag
      end
      while k < j
        # The attribute must start exactly at k.  String#index searches
        # forward, so without this check text located after the closing
        # bracket could be picked up as an attribute of this tag.
        break unless rawdata.index(Attrfind, k) == k
        matched_length = $&.length
        attrname, rest, attrvalue = $1, $2, $3
        if !rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') ||
            (attrvalue[0] == ?" && attrvalue[-1] == ?")
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        k += matched_length
      end
      # A tag ended by '<' leaves that character for the next construct.
      if rawdata[j] == ?> #
        j += 1
      end
      finish_starttag(tag, attrs)
      return j
    end

    # Parses the end tag beginning at index +i+ of the buffer and
    # dispatches through finish_endtag with the lower-cased tag name.
    #
    # Returns the index just past the tag, or nil when no closing bracket
    # is buffered yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = (rawdata[i+2..j-1].strip).downcase
      if rawdata[j] == ?> #
        j += 1
      end
      finish_endtag(tag)
      return j
    end

    # Dispatches a start tag to the first available handler.
    #
    # Returns 1 when start_TAG handled it (the tag is pushed on the stack),
    # 0 when do_TAG handled it, and -1 when it went to unknown_starttag.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      else
        method = 'do_' + tag
        if self.respond_to?(method)
          handle_starttag(tag, method, attrs)
          return 0
        else
          unknown_starttag(tag, attrs)
          return -1
        end
      end
    end

    # Handles an end tag.  An empty +tag+ (from '</>') closes the most
    # recently opened element.  A named tag closes the innermost open
    # element of that name together with everything opened inside it.
    # A tag that is not open is passed to report_unbalanced when an end_TAG
    # handler exists and to unknown_endtag otherwise.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          if self.respond_to?('end_' + tag)
            report_unbalanced(tag)
          else
            unknown_endtag(tag)
          end
          return
        end
        # Innermost occurrence: with nested elements of the same name a
        # single end tag must close only the most recently opened one.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parses the '<!...>' declaration starting at index +i+ of the buffer
    # and passes the text between '<' and the closing bracket to
    # handle_special.
    #
    # Returns the number of characters consumed, or nil when no closing
    # bracket is buffered yet.
    def parse_special(i)
      rawdata = @rawdata
      match = rawdata.index(Endbracket, i+1)
      return nil unless match
      matched_length = $&.length
      handle_special(rawdata[i+1..(match-1)])
      return match - i + matched_length
    end

    # Invokes the start-tag handler +method+ with the attribute list.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Invokes the end-tag handler +method+.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # In verbose mode, prints a diagnostic about an end tag that has a
    # handler but no matching open element, followed by the current stack.
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Handles a character reference; +name+ is the text between '&#' and
    # ';'.  References containing digits are passed to unknown_charref,
    # anything else is emitted as data.
    def handle_charref(name)
      if name =~ /[0-9]+/
        unknown_charref(name)
      else
        handle_data(name)
      end
    end

    # Handles a named entity reference: names listed in Entitydefs are
    # emitted as data, all others are passed to unknown_entityref.
    def handle_entityref(name)
      table = Entitydefs
      if table.include?(name)
        handle_data(table[name])
      else
        unknown_entityref(name)
        return
      end
    end

    # Hook: receives character data.  No-op here.
    def handle_data(data)
    end

    # Hook: receives the body of a comment.  No-op here.
    def handle_comment(data)
    end

    # Hook: receives the body of a '<!...>' declaration.  No-op here.
    def handle_special(data)
    end

    # Hook: start tag with neither a start_TAG nor a do_TAG handler.
    def unknown_starttag(tag, attrs)
    end

    # Hook: end tag that is neither open nor handled by an end_TAG method.
    def unknown_endtag(tag)
    end

    # Hook: character reference not emitted as data by handle_charref.
    def unknown_charref(ref)
    end

    # Hook: entity reference whose name is not listed in Entitydefs.
    def unknown_entityref(ref)
    end
  end
end
