1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
|
# for URI::DEFAULT_PARSER.make_regexp (used by String#text2html)
require 'uri'
require 'feedparser/html2text-parser'
# Reopens String to add HTML detection/escaping, text-to-HTML, whitespace and encoding conversion helpers
class String
# is this text HTML ? search for tags. used by String#text2html
# The tag patterns below are tried in the order listed. The result is the
# match offset of the first pattern that matches (not necessarily the
# smallest offset in the text), or nil when none of them matches.
# String#text2html compares this offset with the one returned by
# String#escaped_html?.
def html?
  tag_patterns = [/<p>/i, /<\/p>/i, /<br>/i, /<br\s*(\/)?\s*>/i, /<\/a>/i, /<img.*>/i]
  tag_patterns.each do |pattern|
    offset = self =~ pattern
    return offset if offset
  end
  nil
end
# returns true if the text contains escaped HTML (with HTML entities). used by String#text2html
# Looks for entity-escaped markup, i.e. tags whose angle brackets were written
# as "&lt;" / "&gt;": an escaped <img src=, <a href=, <br> / <br/> / <br /> or
# <p>. Matching is case-insensitive.
#
# Returns the match offset of the first pattern (in the order listed) that
# matches, or nil when none does. String#text2html compares this offset with
# the one returned by String#html?, so literal (unescaped) tags must NOT match
# here.
def escaped_html?
  (self =~ /&lt;img src=/i) ||
    (self =~ /&lt;a href=/i) ||
    (self =~ /&lt;br(\/| \/|)&gt;/i) ||
    (self =~ /&lt;p&gt;/i)
end
# Returns a copy of the text with the HTML-significant characters replaced by
# entities: "&" => "&amp;", "<" => "&lt;", ">" => "&gt;".
# The ampersand is replaced first so that the ampersands introduced by the
# other two replacements are not escaped a second time.
def escape_html
  gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
end
# Lookup table from an entity reference to the character it stands for.
# Each pair yielded by FeedParser::HTML2TextParser::entities contributes two
# keys: the named form "&<name>;" and the numeric form "&#<value>;", both
# mapped to the value packed as a UTF-8 character.
# NOTE(review): presumably the pairs are (entity name, Integer code point) --
# confirm against FeedParser::HTML2TextParser.
MY_ENTITIES = {}
FeedParser::HTML2TextParser::entities.each do |name, codepoint|
  ["&#{name};", "&##{codepoint};"].each do |reference|
    MY_ENTITIES[reference] = [codepoint].pack('U*')
  end
end
# un-escape HTML in the text. used by String#text2html
# Returns a copy of the text in which every entity reference listed in
# MY_ENTITIES is replaced by its character.
#
# The replacement is done in ONE left-to-right pass: text produced by a
# replacement is never scanned again, so doubly escaped input such as
# "&amp;lt;" is unescaped exactly one level (to "&lt;") regardless of the
# order of the table. References that are not in the table are left as-is.
def unescape_html
  # Regexp.union escapes the keys, so they are matched literally; the Hash
  # form of gsub looks each match up in the table.
  gsub(Regexp.union(MY_ENTITIES.keys), MY_ENTITIES)
end
# convert text to HTML
# Returns a new String; the receiver itself is never modified.
#
# Three cases, decided by String#html? and String#escaped_html?:
# * the text already contains HTML tags: it is kept as is;
# * the text contains entity-escaped HTML: it is unescaped with
#   String#unescape_html;
# * otherwise it is treated as plain text: it is wrapped in <p>...</p>, runs
#   of blank lines become paragraph breaks, and http/https/ftp URIs that are
#   not preceded by a quote are turned into <a href="..."> links.
#
# feed:: nil/false, or an object responding to +link+. When feed.link is set
#        it is used as the base URL to make relative src= / href= attribute
#        values absolute.
def text2html(feed)
  # dup (unlike clone) never copies the frozen flag, so the in-place gsub!
  # calls below also work when the receiver is a frozen String
  text = dup
  realhtml = text.html?
  eschtml = text.escaped_html?
  # fix for RSS feeds with both real and escaped html (crazy!):
  # keep the kind that was detected at the smaller offset
  if realhtml && eschtml
    if realhtml < eschtml
      eschtml = nil
    else
      realhtml = nil
    end
  end
  if realhtml
    # already HTML: nothing to convert
  elsif eschtml
    text = text.unescape_html
  else
    # paragraphs
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    text.gsub!(/([^'"])(#{URI::DEFAULT_PARSER.make_regexp(['http','ftp','https'])})/,
               '\1<a href="\2">\2</a>')
  end
  # Handle broken (relative) hrefs in <a> and <img>
  if feed && feed.link
    text.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |m|
      begin
        # capture the groups first: the =~ tests below overwrite $1..$4
        first, url, last = $1, $3, $4
        if (url =~ /^\s*\w+:\/\//) || (url =~ /^\s*\w+:\w/)
          # already has a scheme: keep the attribute untouched
          m
        elsif url =~ /^\//
          # host-relative: prepend "scheme://host" taken from the feed link
          (first + feed.link.split(/\//)[0..2].join('/') + url + last)
        else
          t = feed.link.split(/\//)
          if t.length == 3 # http://toto with no trailing /
            (first + feed.link + '/' + url + last)
          elsif feed.link =~ /\/$/
            # the feed link is a directory: append to it
            (first + feed.link + url + last)
          else
            # the feed link is a file: resolve against its directory
            (first + t[0...-1].join('/') + '/' + url + last)
          end
        end
      rescue
        # best effort: on any error keep the attribute as it was
        m
      end
    end
  end
  text
end
# Remove white space around the text
# Strips, in place, the whitespace matched by \s at the very start and at the
# end of the text, and returns the receiver.
# Both patterns can match the empty string, so each gsub! always finds a
# match and returns the receiver rather than nil.
def rmWhiteSpace!
  leading = /\A\s*/m
  trailing = /\s*\Z/m
  gsub!(leading, '')
  gsub!(trailing, '')
end
# Convert a text in inputenc to a text in UTF8
# must take care of wrong input locales
# inputenc:: name of the declared input encoding; only compared
#            (case-insensitively) with 'utf-8', its value is otherwise unused.
#
# Returns the receiver itself when the declared encoding is UTF-8, or when
# decoding it as UTF-8 and re-encoding gives back an equal string. Otherwise
# returns a new string in which every byte of the receiver is taken as one
# code point and encoded as UTF-8. If that conversion raises, the receiver is
# returned unchanged.
def toUTF8(inputenc)
  return self if inputenc.downcase == 'utf-8'

  # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
  begin
    return self if unpack('U*').pack('U*') == self
  rescue
    # malformed as UTF-8: fall through to the byte-wise conversion
  end

  begin
    unpack('C*').pack('U*')
  rescue
    self # failsafe solution. but a dirty one :-)
  end
end
end
|