#!/usr/bin/env python
#
# Copyright (C) 2004 Stefan Seefeld
# All rights reserved.
# Licensed to the public under the terms of the GNU LGPL (>= 2),
# see the file COPYING for details.
#
from xml.sax import saxexts, saxlib, saxutils
import sys, os, string, urllib, urlparse
import getopt
# Module-wide verbosity flag: main() sets it to True for -p/--print, and
# main() and validate() print progress messages while it is set.
verbose = False
class Reference:
    """A link target together with the place where it was found.

    orig -- identifier of the document that contains the link
    line -- line number recorded for the link within that document
    ref  -- the link target as written; it is resolved against 'orig'
            by whoever follows the reference
    """

    def __init__(self, orig, line, ref):
        self.orig, self.line, self.ref = orig, line, ref
class DocumentHandler(saxlib.DocumentHandler):
    """Store urefs with the line numbers they were encountered in,
    so we can either traverse them, too, or report errors with specific
    line numbers.

    Only the 'href' attribute of 'a' elements is looked at, and only the
    first occurrence of each distinct href is remembered."""

    def __init__(self):
        self.urefs = {}      # href -> line number of its first occurrence
        self.locator = None  # assigned in setDocumentLocator()

    def get_urefs(self):
        """Return the collected links as a list of Reference objects.

        Every Reference uses the system id reported by the current locator
        as its origin.  Afterwards the handler forgets both the links and
        the locator, so it starts empty for the next document.
        """
        found = self.urefs
        urefs = []
        if found:
            # The locator is only consulted when there is something to
            # report, so calling this before any parse still returns [].
            origin = self.locator.getSystemId()
            urefs = [Reference(origin, line, href)
                     for href, line in found.items()]
        self.urefs = {}
        self.locator = None
        return urefs

    def setDocumentLocator(self, locator):
        "Receive an object for locating the origin of SAX document events."
        self.locator = locator

    def startElement(self, name, attrs):
        "Look for anchors and store links."
        if name != 'a':
            return
        # Depending on the attribute list implementation a missing
        # attribute either raises KeyError or yields None; treat both
        # the same way.
        try:
            href = attrs.getValue('href')
        except KeyError:
            href = None
        if href is None:
            # An anchor without href (e.g. <a name="...">) is not a link.
            return
        if href not in self.urefs:
            self.urefs[href] = self.locator.getLineNumber()
# The xmlproc driver is imported here, right next to its only use.
from xml.sax.drivers import drv_xmlproc

# A single handler/parser pair is shared by the whole module: main() parses
# each document with SAXparser and then reads the links from handler.
handler = DocumentHandler()
SAXparser = drv_xmlproc.SAX_XPParser()
# NOTE(review): ErrorRaiser is presumably what turns parser errors into the
# saxlib.SAXParseException that main() catches -- confirm against PyXML.
SAXparser.setErrorHandler(saxutils.ErrorRaiser())
SAXparser.setDocumentHandler(handler)
def validate(url):
    """Validate (x)html conformance of 'url' using 'tidy'.

    Runs ``tidy -errors -quiet <url>`` and prints a diagnostic to stdout
    when tidy exits with a non-zero status, is terminated by a signal, or
    cannot be started at all.  Nothing is reported when tidy exits with
    status 0.  Always returns None.
    """
    # Imported locally so the function carries its own dependency.
    import subprocess

    if verbose:
        print('validating %s' % (url,))
    try:
        # The arguments are passed as a list, never through a shell: 'url'
        # may originate from a crawled document and could otherwise inject
        # arbitrary shell commands.
        status = subprocess.call(['tidy', '-errors', '-quiet', url])
    except OSError:
        # tidy is missing or not executable.
        print('internal error: cannot run tidy (%s)' % (sys.exc_info()[1],))
        return
    if status < 0:
        # subprocess reports termination by signal N as return code -N.
        print('internal error: %s' % (-status,))
    elif status != 0:
        print('validation failed')
def usage():
    """Print the command-line synopsis and the option summary to stdout."""
    print('Usage : %s [options] <input files>' % sys.argv[0])
    # The -v entry describes what validate() really does: it runs the local
    # 'tidy' program rather than contacting an online validator.
    print("""
List of options:
-h, --help help
-p, --print provide verbose feedback during validation
-m, --maximum maximum number of pages to validate
-v, --validate run 'tidy' on each page to validate html
-e, --external follow external links
""")
def main():
    """Crawl the documents named on the command line and follow their links.

    Options (see usage()): -p/--print enables verbose output, -m/--maximum
    sets the page limit (default 50), -e/--external also follows links whose
    scheme is something other than 'file', -v/--validate runs validate() on
    every parsed page.

    Documents are processed in first-in, first-out order starting with the
    input files.  Processing stops when no links are left, when the page
    limit is reached, or at the first parse error, which is written to
    stderr.  Invalid options exit with status 2; -h and a missing input
    file print the usage text and exit with status 0.
    """
    global verbose
    limit = 50
    external = False
    do_validate = False

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'pm:evh',
                                   ['print', 'maximum=', 'external',
                                    'validate', 'help'])
    except getopt.GetoptError:
        # Unknown option or missing option argument: report it instead of
        # dying with a traceback.
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ('-h', '--help'):
            usage()
            sys.exit(0)
        elif o in ('-p', '--print'):
            verbose = True
        elif o in ('-m', '--maximum'):
            try:
                limit = int(a)
            except ValueError:
                sys.stderr.write("invalid value for %s: '%s'\n" % (o, a))
                usage()
                sys.exit(2)
        elif o in ('-e', '--external'):
            external = True
        elif o in ('-v', '--validate'):
            do_validate = True

    if not args:
        usage()
        sys.exit(0)

    done = []
    # Every input file is a starting point, not only the first one.
    urefs = [Reference('.', 0, arg) for arg in args]
    while urefs and len(done) < limit:
        uref = urefs.pop(0)
        url = urlparse.urljoin(uref.orig, uref.ref)
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        if not external and scheme and scheme != 'file':
            continue
        # Drop the fragment so that 'page#a' and 'page#b' count as one page.
        url = urlparse.urlunsplit((scheme, location, path, query, ''))
        if url in done:
            continue
        if verbose:
            print('parsing %s' % (url,))
        try:
            SAXparser.parse(url)
        except saxlib.SAXParseException:
            sys.stderr.write("%s; processing aborted\n" % (sys.exc_info()[1],))
            break
        if do_validate:
            validate(url)
        done.append(url)
        urefs.extend(handler.get_urefs())
# Start crawling only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|