1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
|
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a tree, with optional profiling
"""
import sys
import os
import traceback
from optparse import OptionParser
from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
def parse():
optParser = getOptParser()
opts,args = optParser.parse_args()
encoding = "utf8"
try:
f = args[-1]
# Try opening from the internet
if f.startswith('http://'):
try:
import urllib.request, urllib.parse, urllib.error, cgi
f = urllib.request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
encoding = params.get('charset')
except:
pass
elif f == '-':
f = sys.stdin
if sys.version_info[0] >= 3:
encoding = None
else:
try:
# Try opening from file system
f = open(f, "rb")
except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
sys.exit(1)
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
if opts.sanitize:
tokenizer = sanitizer.HTMLSanitizer
else:
tokenizer = HTMLTokenizer
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer, debug=opts.log)
if opts.fragment:
parseMethod = p.parseFragment
else:
parseMethod = p.parse
if opts.profile:
import cProfile
import pstats
cProfile.runctx("run(parseMethod, f, encoding)", None,
{"run": run,
"parseMethod": parseMethod,
"f": f,
"encoding": encoding},
"stats.prof")
# XXX - We should use a temp file here
stats = pstats.Stats('stats.prof')
stats.strip_dirs()
stats.sort_stats('time')
stats.print_stats()
elif opts.time:
import time
t0 = time.time()
document = run(parseMethod, f, encoding)
t1 = time.time()
if document:
printOutput(p, document, opts)
t2 = time.time()
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
else:
sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
else:
document = run(parseMethod, f, encoding)
if document:
printOutput(p, document, opts)
def run(parseMethod, f, encoding):
try:
document = parseMethod(f, encoding=encoding)
except:
document = None
traceback.print_exc()
return document
def printOutput(parser, document, opts):
if opts.encoding:
print("Encoding:", parser.tokenizer.stream.charEncoding)
for item in parser.log:
print(item)
if document is not None:
if opts.xml:
sys.stdout.write(document.toxml("utf-8"))
elif opts.tree:
if not hasattr(document,'__getitem__'):
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
elif opts.hilite:
sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
kwargs[opt] = getattr(opts,opt)
except:
pass
if not kwargs['quote_char']:
del kwargs['quote_char']
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
if sys.version_info[0] >= 3:
encoding = None
else:
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
if not text.endswith('\n'): sys.stdout.write('\n')
if opts.error:
errList=[]
for pos, errorcode, datavars in parser.errors:
errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
def getOptParser():
parser = OptionParser(usage=__doc__)
parser.add_option("-p", "--profile", action="store_true", default=False,
dest="profile", help="Use the hotshot profiler to "
"produce a detailed log of the run")
parser.add_option("-t", "--time",
action="store_true", default=False, dest="time",
help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
parser.add_option("-b", "--treebuilder", action="store", type="string",
dest="treebuilder", default="simpleTree")
parser.add_option("-e", "--error", action="store_true", default=False,
dest="error", help="Print a list of parse errors")
parser.add_option("-f", "--fragment", action="store_true", default=False,
dest="fragment", help="Parse as a fragment")
parser.add_option("", "--tree", action="store_true", default=False,
dest="tree", help="Output as debug tree")
parser.add_option("-x", "--xml", action="store_true", default=False,
dest="xml", help="Output as xml")
parser.add_option("", "--no-html", action="store_false", default=True,
dest="html", help="Don't output html")
parser.add_option("", "--hilite", action="store_true", default=False,
dest="hilite", help="Output as formatted highlighted code.")
parser.add_option("-c", "--encoding", action="store_true", default=False,
dest="encoding", help="Print character encoding used")
parser.add_option("", "--inject-meta-charset", action="store_true",
default=False, dest="inject_meta_charset",
help="inject <meta charset>")
parser.add_option("", "--strip-whitespace", action="store_true",
default=False, dest="strip_whitespace",
help="strip whitespace")
parser.add_option("", "--omit-optional-tags", action="store_true",
default=False, dest="omit_optional_tags",
help="omit optional tags")
parser.add_option("", "--quote-attr-values", action="store_true",
default=False, dest="quote_attr_values",
help="quote attribute values")
parser.add_option("", "--use-best-quote-char", action="store_true",
default=False, dest="use_best_quote_char",
help="use best quote character")
parser.add_option("", "--quote-char", action="store",
default=None, dest="quote_char",
help="quote character")
parser.add_option("", "--no-minimize-boolean-attributes",
action="store_false", default=True,
dest="minimize_boolean_attributes",
help="minimize boolean attributes")
parser.add_option("", "--use-trailing-solidus", action="store_true",
default=False, dest="use_trailing_solidus",
help="use trailing solidus")
parser.add_option("", "--space-before-trailing-solidus",
action="store_true", default=False,
dest="space_before_trailing_solidus",
help="add space before trailing solidus")
parser.add_option("", "--escape-lt-in-attrs", action="store_true",
default=False, dest="escape_lt_in_attrs",
help="escape less than signs in attribute values")
parser.add_option("", "--escape-rcdata", action="store_true",
default=False, dest="escape_rcdata",
help="escape rcdata element values")
parser.add_option("", "--sanitize", action="store_true", default=False,
dest="sanitize", help="sanitize")
parser.add_option("-l", "--log", action="store_true", default=False,
dest="log", help="log state transitions")
return parser
if __name__ == "__main__":
parse()
|