1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
|
### HTML #############################################################################################
# Code for stripping tags and collapsing whitespace.
# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.
import sgmllib
import re
from htmlentitydefs import name2codepoint
from BeautifulSoup import UnicodeDammit
def clear_cache():
    """Calls clear() on the cache named "html"."""
    html_cache = Cache("html")
    html_cache.clear()
#### REPLACE ENTITIES ################################################################################
# Windows-1252 is a character encoding of the Latin alphabet,
# used by default in the legacy components of Microsoft Windows.
# List taken from Mark Pilgrim's feedparser.py
cp1252 = {
unichr(128): unichr(8364), # euro sign
unichr(130): unichr(8218), # single low-9 quotation mark
unichr(131): unichr( 402), # latin small letter f with hook
unichr(132): unichr(8222), # double low-9 quotation mark
unichr(133): unichr(8230), # horizontal ellipsis
unichr(134): unichr(8224), # dagger
unichr(135): unichr(8225), # double dagger
unichr(136): unichr( 710), # modifier letter circumflex accent
unichr(137): unichr(8240), # per mille sign
unichr(138): unichr( 352), # latin capital letter s with caron
unichr(139): unichr(8249), # single left-pointing angle quotation mark
unichr(140): unichr( 338), # latin capital ligature oe
unichr(142): unichr( 381), # latin capital letter z with caron
unichr(145): unichr(8216), # left single quotation mark
unichr(146): unichr(8217), # right single quotation mark
unichr(147): unichr(8220), # left double quotation mark
unichr(148): unichr(8221), # right double quotation mark
unichr(149): unichr(8226), # bullet
unichr(150): unichr(8211), # en dash
unichr(151): unichr(8212), # em dash
unichr(152): unichr( 732), # small tilde
unichr(153): unichr(8482), # trade mark sign
unichr(154): unichr( 353), # latin small letter s with caron
unichr(155): unichr(8250), # single right-pointing angle quotation mark
unichr(156): unichr( 339), # latin small ligature oe
unichr(158): unichr( 382), # latin small letter z with caron
unichr(159): unichr( 376) # latin capital letter y with diaeresis
}
def replace_entities(ustring, placeholder=" "):
    """Replaces HTML character references by the characters they stand for.

    Recognised forms:
    - named entities, looked up in name2codepoint (e.g. &eacute;),
    - decimal references (e.g. &#233;),
    - hexadecimal references (e.g. &#xE9;),
    - decimal references that lost their "#" (e.g. &233;).
    Numeric codes that appear in the cp1252 table are translated to the
    character Windows-1252 assigns to them.
    Anything of the form &...; that cannot be resolved is replaced by placeholder.

    ustring that is not already Unicode is converted with UnicodeDammit.
    Returns a Unicode string.

    As taken from Leif K-Brooks algorithm on:
    http://groups-beta.google.com/group/comp.lang.python
    """
    def _char(codepoint):
        # Pages frequently use Windows-1252 byte values as if they were
        # code points; hand back the character that was meant.
        char = unichr(codepoint)
        return cp1252.get(char, char)

    def _repl_func(match):
        try:
            if match.group(1):
                # Decimal character reference: &#233;
                return _char(int(match.group(2)))
            name = match.group(3)
            if name.isdigit():
                # Decimal reference without the "#": &233;
                return _char(int(name))
            if name[:2] in ("#x", "#X"):
                # Hexadecimal character reference: &#xE9;
                return _char(int(name[2:], 16))
            # Named entity: &eacute;
            return unichr(name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name, malformed number or code point out of range.
            return placeholder
    # Force to Unicode.
    if not isinstance(ustring, unicode):
        ustring = UnicodeDammit(ustring).unicode
    # Don't want some weird unicode character here that the whitespace
    # collapsing doesn't know of: non-breaking spaces become plain spaces.
    ustring = ustring.replace("&nbsp;", " ").replace(unichr(160), " ")
    # The ^> makes sure nothing inside a tag (i.e. href with query arguments) gets processed.
    _entity_re = re.compile(r'&(?:(#)(\d+)|([^;^> ]+));')
    return _entity_re.sub(_repl_func, ustring)
#### STRIP TAGS ######################################################################################
class Tagstripper(sgmllib.SGMLParser):
def __init__(self):
sgmllib.SGMLParser.__init__(self)
def strip(self, html, exclude=[], linebreaks=False, blocks="\n", breaks="\n", columns="\n"):
self.data = ""
self.exclude = exclude
self.linebreaks = linebreaks
self.block = blocks
self.blocks = [
"h1", "h2", "h3", "h4", "h5", "h6",
"p", "center", "blockquote",
"div", "table", "ul", "ol",
"pre", "code", "form"
]
self.break_ = breaks
self.breaks = [
"br", "tr", "li"
]
self.columns = columns
self.feed(self.prepare(html))
self.close()
return self.data
def prepare(self, html):
# Clean up faulty HTML before parsing.
html = html.replace("<br/>", "<br />")
html = html.replace("<hr/>", "<hr />")
# Display list items with an asterisk.
#html = html.replace("li>", "li>*")
html = re.sub(r"<li.*?>", "\n<li>* ", html)
#html = html.replace("li>\n", "li>")
# Make sure there is a space between elements.
html = html.replace("><", "> <")
# Linebreaks in the source should not end up in the output.
if not self.linebreaks:
html = html.replace("\r", "\n")
html = html.replace("\n", " ")
return html
def unknown_starttag(self, tag, attributes):
# Include tags from the whitelist in the output.
if tag in self.exclude:
self.data += "<"+tag+">"
# Add linebreaks before and after block-level elements.
if tag in self.blocks:
self.data += self.block
# Convert things like <tr> and <br /> to linebreak.
if tag in self.breaks:
self.data += self.break_
def unknown_endtag(self, tag):
# Close tags from the whitelist in the output.
if tag in self.exclude:
self.data += "</"+tag+">"
# Add linebreaks before and after block-level elements.
if tag in self.blocks:
self.data += self.block
# Usually it's cleaner to separate columns by linebreaks too.
if tag == "td":
self.data += self.columns
def handle_data(self, data):
self.data += data
def handle_entityref(self, ref):
# Let entity refs (e.g. ) pass.
self.data += "&"+ref+";"
def handle_charref(self, ref):
# Let things like ƕ pass.
self.data += "&"+ref+";"
def strip_tags(html, exclude=None, linebreaks=False, blocks="\n", breaks="\n", columns="\n"):
    """Removes all tags from HTML except those in the exclude whitelist.

    This can leave a clutter of javascript and whitespace.

    html       -- the markup to process.
    exclude    -- list of tag names to keep in the output (default: none).
    linebreaks -- if False, linebreaks in the source are turned into spaces.
    blocks     -- string written at the start and end of block-level elements.
    breaks     -- string written for line-breaking tags.
    columns    -- string written after each table cell.
    """
    if exclude is None:
        exclude = []
    # Pass everything by keyword: Tagstripper.strip() takes linebreaks before
    # blocks, so a positional call that leaves it out shifts every argument.
    return Tagstripper().strip(
        html,
        exclude=exclude,
        linebreaks=linebreaks,
        blocks=blocks,
        breaks=breaks,
        columns=columns
    )
#### STRIP CODE AND COMMENTS #########################################################################
def strip_between(start, end, str):
    """Removes every stretch of str that runs from start up to and including end.

    start and end are inserted into a regular expression as they are.
    The match is non-greedy (it stops at the first end after a start),
    case-insensitive, and may span linebreaks.
    """
    flags = re.DOTALL | re.I
    pattern = re.compile(start + ".*?" + end, flags)
    return pattern.sub("", str)
def strip_javascript(html):
    """Removes everything from "<script" up to and including "</script>"."""
    return strip_between(start="<script", end="</script>", str=html)
def strip_inline_css(html):
    """Removes everything from "<style" up to and including "</style>"."""
    return strip_between(start="<style", end="</style>", str=html)
def strip_comments(html):
    """Removes everything from "<!--" up to and including "-->"."""
    return strip_between(start="<!--", end="-->", str=html)
def strip_forms(html):
    """Removes everything from "<form" up to and including "</form>"."""
    return strip_between(start="<form", end="</form>", str=html)
#### COLLAPSE WHITESPACE #############################################################################
def collapse_spaces(str):
    """Reduces each run of spaces to one space and trims spaces at both ends.

    If there are 10 consecutive spaces, 9 of them are removed.
    Only the space character is affected; tabs and linebreaks are left alone.
    """
    space_run = re.compile(r"[ ]+")
    single_spaced = space_run.sub(" ", str)
    return single_spaced.strip(" ")
def collapse_linebreaks(str, max=2):
    """Allows only a maximum of max linebreaks to build up.

    Whitespace-only lines beyond that limit are dropped from the output.
    Every line that is kept has the spaces at both ends removed, and the
    result as a whole is stripped of leading and trailing whitespace.
    """
    kept = []
    blank_run = 0
    for line in str.split("\n"):
        # Count how many whitespace-only lines have come in a row.
        if line.strip():
            blank_run = 0
        else:
            blank_run += 1
        if blank_run >= max:
            continue
        kept.append(line.strip(" ") + "\n")
    return "".join(kept).strip()
def collapse_tabs(str, indent=False):
    """Converts tabs to spaces, optionally leaving the left indentation unmodified.

    str    -- the text to process.
    indent -- if True, the tabs a line starts with are kept and only the
              tabs after them are converted; if False, every tab is converted.

    collapse_spaces() should be called after this.
    """
    if not indent:
        return str.replace("\t", " ")
    converted = []
    for line in str.split("\n"):
        # Split the line into its leading tabs and the rest, so lines without
        # indentation have all their tabs converted as well.
        rest = line.lstrip("\t")
        indentation = line[:len(line) - len(rest)]
        converted.append(indentation + rest.replace("\t", " "))
    # Joining keeps the number of linebreaks equal to that of the input.
    return "\n".join(converted)
def plain(html):
    """Returns the text of html without scripts, styles, comments, forms and tags.

    The steps, in order: strip scripts, inline CSS, comments and forms,
    strip the remaining tags (table cells are not separated),
    replace entities, then collapse tabs, spaces and linebreaks.
    None yields an empty string.
    """
    if html is None:
        html = ""
    elif not isinstance(html, basestring):
        # Other objects are processed by their string representation.
        try:
            html = str(html)
        except Exception:
            # Best effort: pass the value on unchanged if it has no usable
            # string representation.
            pass
    html = strip_javascript(html)
    html = strip_inline_css(html)
    html = strip_comments(html)
    html = strip_forms(html)
    html = strip_tags(html, columns="")
    html = replace_entities(html)
    html = collapse_tabs(html)
    html = collapse_spaces(html)
    html = collapse_linebreaks(html)
    return html
#from urllib import urlopen
#html = urlopen("http://nodebox.net").read()
#print html
#print "##############################################"
#print plain(html)
|