File: html_parse.rb

package info (click to toggle)
ruby-openid 2.5.0debian-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,980 kB
  • ctags: 2,219
  • sloc: ruby: 16,737; xml: 219; sh: 24; makefile: 2
file content (144 lines) | stat: -rw-r--r-- 3,613 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
require "openid/yadis/htmltokenizer"

module OpenID

  # Stuff to remove before we start looking for tags
  REMOVED_RE = /
    # Comments
    <!--.*?-->

    # CDATA blocks
  | <!\[CDATA\[.*?\]\]>

    # script blocks
  | <script\b

    # make sure script is not an XML namespace
    (?!:)

    [^>]*>.*?<\/script>

  /mix

  # The only character entities that are decoded in link attribute
  # values, keyed by entity name.
  LINK_ATTR_ENTITIES = {
    'amp' => '&',
    'lt' => '<',
    'gt' => '>',
    'quot' => '"'
  }.freeze

  # Matches one occurrence of any entity listed in LINK_ATTR_ENTITIES.
  LINK_ATTR_ENTITY_RE = /&(amp|lt|gt|quot);/

  # Decode the entities &amp;, &lt;, &gt; and &quot; in the string s and
  # return the result as a new string.
  #
  # The replacement is done in a single pass so that already-decoded
  # text is never decoded a second time: "&amp;lt;" becomes "&lt;",
  # not "<".
  def OpenID.openid_unescape(s)
    s.gsub(LINK_ATTR_ENTITY_RE) { LINK_ATTR_ENTITIES[Regexp.last_match(1)] }
  end

  # Return a new hash with the same keys as h and every value passed
  # through openid_unescape. The hash h itself is not modified.
  def OpenID.unescape_hash(h)
    h.each_with_object({}) do |(key, value), unescaped|
      unescaped[key] = openid_unescape(value)
    end
  end

  # Collect the attributes of the <link> tags found inside the <head>
  # of an HTML document.
  #
  # html is the document text. Comments, CDATA sections and script
  # blocks (see REMOVED_RE) are stripped before tokenizing.
  #
  # Returns an array with one hash per accepted link tag, mapping the
  # tag's attribute names to their entity-decoded values. Only links
  # nested as html > head > link are accepted; scanning stops at the
  # end of the head, at a body tag, at a second head or html tag, or as
  # soon as the tokenizer raises an error. Whatever was collected up to
  # that point is returned.
  def OpenID.parse_link_attrs(html)
    begin
      stripped = html.gsub(REMOVED_RE, '')
    rescue ArgumentError
      # Presumably raised when html holds bytes that are invalid for its
      # declared encoding; retry on a transcoded copy with the
      # offending bytes dropped.
      begin
        stripped = html.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE, '')
      rescue Encoding::UndefinedConversionError, Encoding::ConverterNotFoundError
        # needed for a problem in JRuby where it can't handle the conversion.
        # see details here: https://github.com/jruby/jruby/issues/829
        stripped = html.encode('UTF-8', 'ASCII', :invalid => :replace, :undef => :replace, :replace => '').gsub(REMOVED_RE, '')
      end
    end
    parser = HTMLTokenizer.new(stripped)

    links = []
    # to keep track of whether or not we are in the head element
    in_head = false
    in_html = false
    saw_head = false

    begin
      while (el = parser.getTag('head', '/head', 'link', 'body', '/body',
                                'html', '/html'))
        tag = el.tag_name

        # we are leaving head or have reached body, so we bail
        return links if ['/head', 'body', '/body', '/html'].member?(tag)

        # enforce html > head > link
        in_html = true if tag == 'html'
        next unless in_html

        if tag == 'head'
          # only allow one head
          return links if saw_head

          saw_head = true
          # A tag ending in "/>" is a short (self-closed) tag, so it does
          # not open a head section. Compare one-character strings: on
          # Ruby 1.9+ String#[] no longer yields a character code.
          in_head = true unless el.to_s[-2, 1] == '/'
        end
        next unless in_head

        # a second html tag inside the head ends the scan
        return links if tag == 'html'

        links << unescape_hash(el.attr_hash) if tag == 'link'
      end
    rescue StandardError
      # Just stop parsing if there's an error and keep what we have.
      # Interrupts and exit requests are deliberately not swallowed.
    end
    links
  end

  # Does target_rel appear among the whitespace-separated relationship
  # names in rel_attr?
  #
  # The names in rel_attr are lower-cased before comparing; target_rel
  # is compared as given, so it should already be lower case. Returns
  # true or false.
  def OpenID.rel_matches(rel_attr, target_rel)
    # XXX: TESTME
    rel_attr.strip.split.any? { |rel| rel.downcase == target_rel }
  end

  # Does the link described by the attribute hash link_attrs have
  # target_rel as a relationship?
  #
  # Returns nil when the link has no 'rel' attribute, otherwise the
  # result of rel_matches.
  def OpenID.link_has_rel(link_attrs, target_rel)
    # XXX: TESTME
    rel_attr = link_attrs['rel']
    rel_attr && rel_matches(rel_attr, target_rel)
  end

  # Filter the list of link attribute hashes, keeping (in their
  # original order) those that have target_rel as a relationship.
  def OpenID.find_links_rel(link_attrs_list, target_rel)
    # XXX: TESTME
    link_attrs_list.select { |attrs| link_has_rel(attrs, target_rel) }
  end

  # Return the value of the href attribute for the first link tag in
  # the list that has target_rel as a relationship, or nil when no
  # link matches (or the matching link has no href).
  def OpenID.find_first_href(link_attrs_list, target_rel)
    # XXX: TESTME
    first_match = find_links_rel(link_attrs_list, target_rel).first
    first_match && first_match['href']
  end
end