File: uri_helper.rb

package info (click to toggle)
libfeedtools-ruby 0.2.29%2Bdfsg1-4
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 2,004 kB
  • ctags: 1,385
  • sloc: ruby: 18,815; sql: 39; makefile: 6
file content (219 lines) | stat: -rw-r--r-- 7,699 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#--
# Copyright (c) 2005 Robert Aman
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++

require 'feed_tools'
require 'uri'
  
module FeedTools
  # Generic url processing methods needed in numerous places throughout
  # FeedTools
  module UriHelper
    
    # Reports whether the optional idn library can be used.
    #
    # Setting <tt>FeedTools.configurations[:idn_enabled]</tt> to +false+
    # always wins and disables idn even when the library is available.
    # Otherwise the library is required and probed with a known conversion.
    # A positive result is remembered in <tt>@idn_enabled</tt>; a negative
    # result is probed again on the next call.
    def self.idn_enabled?
      return false if FeedTools.configurations[:idn_enabled] == false
      return @idn_enabled if @idn_enabled

      # Start from "disabled" so the flag stays false if the probe below
      # raises anything other than LoadError.
      @idn_enabled = false
      begin
        require 'idn'
        probe = IDN::Idna.toASCII('http://www.詹姆斯.com/')
        @idn_enabled = true if probe == "http://www.xn--8ws00zhy3a.com/"
      rescue LoadError
        # idn is not installed, so features that rely on it stay disabled.
        @idn_enabled = false
      end
      @idn_enabled
    end
    
    # Attempts to ensure that the passed url is valid and sane.  Accepts very,
    # very ugly urls and makes every effort to figure out what it was supposed
    # to be.  Also translates from the feed: and rss: pseudo-protocols to the
    # http: protocol.
    #
    # url:: the value to normalize; anything that is not a String is
    #       converted with +to_s+ first.
    #
    # Returns +nil+ when +url+ is +nil+ or is reduced to a bare "http://", an
    # empty String when +url+ is blank, "#" when the url contains
    # "javascript:", and the normalized url String otherwise.  The parses
    # performed after the cleanup steps are not wrapped in a rescue, so their
    # errors reach the caller.
    def self.normalize_url(url)
      return nil if url.nil?
      url = url.to_s unless url.kind_of?(String)
      return "" if url.blank?
      normalized_url = url.strip

      # Best-effort pass through FeedTools::URI.convert_path; a failure here
      # is deliberately ignored.  Only StandardError is rescued so that
      # interrupts, exit requests and out-of-memory errors are not swallowed.
      begin
        normalized_url =
          FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
      rescue StandardError
      end

      # Try a normalizing parse first; if that fails fall back to the
      # unescaped input, and if that fails too, to the stripped input.
      begin
        begin
          normalized_url =
            FeedTools::URI.parse(normalized_url.strip).normalize.to_s
        rescue StandardError
          normalized_url = CGI.unescape(url.strip)
        end
      rescue StandardError
        normalized_url = url.strip
      end

      # if a url begins with the '/' character, it only makes sense that they
      # meant to be using a file:// url.  Fix it for them.
      if normalized_url.length > 0 && normalized_url[0..0] == "/"
        normalized_url = "file://" + normalized_url
      end

      # if a url begins with a drive letter followed by a colon, we're looking at
      # a file:// url.  Fix it for them.
      if normalized_url.length > 0 &&
          normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
        normalized_url = "file:///" + normalized_url
      end

      # if a url contains javascript: anywhere (the match is not anchored),
      # it's quite possibly an attempt at doing something malicious.  Let's
      # keep that from getting anywhere, shall we?
      if (normalized_url.downcase =~ /javascript:/) != nil
        return "#"
      end

      # deal with all of the many ugly possibilities involved in the rss:
      # and feed: pseudo-protocols (incidentally, whose crazy idea was this
      # mess?)
      normalized_url.gsub!(/^htp:\/*/i, "http://")
      normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
      normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
      normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
      normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
      normalized_url.gsub!(/^file:\/*/i, "file:///")
      normalized_url.gsub!(/^https:\/*/i, "https://")
      normalized_url.gsub!(/^mms:\/*/i, "http://")
      # fix (very) bad urls (usually of the user-entered sort)
      normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
      normalized_url.gsub!(/^http:\/*$/i, "")

      if (normalized_url =~ /^file:/i) == 0
        # Adjust windows-style urls
        normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
        normalized_url.gsub!(/\\/, '/')
      else
        # A scheme-less value that contains a dot is assumed to be a web
        # address, so give it an http:// prefix.
        if FeedTools::URI.parse(normalized_url).scheme.nil? &&
            normalized_url =~ /\./
          normalized_url = "http://" + normalized_url
        end
        if normalized_url == "http://"
          return nil
        end
      end

      # A url that is nothing but a fragment or a query should not keep the
      # scheme prefix.  gsub! leaves the string alone when nothing matches.
      normalized_url.gsub!(/^https?:\/\/#/i, "#")
      normalized_url.gsub!(/^https?:\/\/\?/i, "?")

      return FeedTools::URI.parse(normalized_url.strip).normalize.to_s
    end

    # Resolves a relative uri against the first usable base uri.
    #
    # relative_uri::     the uri to resolve; converted with +to_s+ before it
    #                    is joined to the base.
    # base_uri_sources:: candidate base uris.  The array is only read, never
    #                    modified.
    #
    # Returns +relative_uri+ unchanged when no base uri sources are given or
    # when anything goes wrong while resolving, +nil+ when +relative_uri+ is
    # +nil+, and otherwise the resolved uri passed through normalize_url.
    def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
      return relative_uri if base_uri_sources.blank?
      return nil if relative_uri.nil?
      begin
        # Massive HACK to get around file protocol URIs being used to
        # resolve relative URIs on feeds in the local file system.
        # Better to leave these URIs unresolved and hope some other
        # tool resolves them correctly.
        #
        # A filtered copy is built so the caller's array is left untouched.
        usable_sources = base_uri_sources.reject do |source|
          source.nil? || FeedTools::URI.parse(source).scheme == "file"
        end
        base_uri = FeedTools::URI.parse(
          FeedTools::XmlHelper.select_not_blank(usable_sources))
        resolved_uri = base_uri
        if relative_uri.to_s != ''
          resolved_uri = base_uri + relative_uri.to_s
        end
        return FeedTools::UriHelper.normalize_url(resolved_uri.to_s)
      rescue
        return relative_uri
      end
    end

    # Converts a url into a tag uri of the form
    # <tt>tag:HOST,YYYY-MM-DD:REST</tt>, where REST is whatever follows the
    # host in the normalized url with every "#" replaced by "/".
    #
    # url::  String url; it is passed through normalize_url first.
    # date:: Time that supplies the date portion of the tag.
    #
    # Raises ArgumentError if +url+ is not a String, if +date+ is not a Time,
    # if the normalized url is not accepted by is_uri?, or if the parsed uri
    # has no host.
    def self.build_tag_uri(url, date)
      unless url.kind_of? String
        raise ArgumentError, "Expected String, got #{url.class.name}"
      end
      unless date.kind_of? Time
        raise ArgumentError, "Expected Time, got #{date.class.name}"
      end
      tag_uri = normalize_url(url)
      unless FeedTools::UriHelper.is_uri?(tag_uri)
        raise ArgumentError, "Must supply a valid URL."
      end
      host = URI.parse(tag_uri).host
      # Without a host there is nothing to use as the tag authority, and the
      # slicing below would fail with an unrelated TypeError.
      if host.nil?
        raise ArgumentError, "Must supply a URL that includes a host."
      end
      tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
      tag_uri.gsub!(/#/, "/")
      # Keep only what follows the first occurrence of the host.
      specific = tag_uri[(tag_uri.index(host) + host.size)..-1]
      return "tag:#{host},#{date.strftime('%Y-%m-%d')}:#{specific}"
    end

    # Builds a urn:uuid: uri for a url.
    #
    # The url is run through normalize_url and then handed to
    # UUID.sha1_create together with UUID_URL_NAMESPACE.  The uuidtools
    # library is only loaded when this method is actually called.
    #
    # Raises ArgumentError if +url+ is not a String.
    def self.build_urn_uri(url)
      raise ArgumentError, "Expected String, got #{url.class.name}" unless
        url.kind_of?(String)

      canonical_url = normalize_url(url)
      require 'uuidtools'
      uuid = UUID.sha1_create(UUID_URL_NAMESPACE, canonical_url)
      uuid.to_uri.to_s
    end

    # Tells whether the parameter looks like a valid uri.
    #
    # Returns +false+ for +nil+, for anything URI.parse rejects with
    # URI::InvalidURIError, and for a uri whose scheme is blank.  Returns
    # +true+ otherwise.
    def self.is_uri?(url)
      return false if url.nil?
      scheme = URI.parse(url).scheme
      scheme.blank? ? false : true
    rescue URI::InvalidURIError
      false
    end
  end
end