File: html-validator

package info (click to toggle)
synopsis 0.8.0-5
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 10,112 kB
  • ctags: 12,996
  • sloc: cpp: 34,254; ansic: 33,620; python: 10,975; sh: 7,261; xml: 6,369; makefile: 773; asm: 445
file content (134 lines) | stat: -rwxr-xr-x 3,804 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
#
# Copyright (C) 2004 Stefan Seefeld
# All rights reserved.
# Licensed to the public under the terms of the GNU LGPL (>= 2),
# see the file COPYING for details.
#


from xml.sax import saxexts, saxlib, saxutils
import sys, os, string, urllib, urlparse
import getopt

verbose = False

class Reference:
    """A link together with the place it was found.

    Attributes (stored exactly as passed in):
      orig -- identifier of the document the link came from
      line -- line number at which the link was seen
      ref  -- the link target itself
    """

    def __init__(self, orig, line, ref):
        # Plain record: nothing is validated or normalised here.
        self.orig, self.line, self.ref = orig, line, ref

class DocumentHandler(saxlib.DocumentHandler):
    """Collect the targets of <a href="..."> links seen while parsing.

    Each distinct href is remembered together with the line number at
    which it was first encountered, so the links can later be followed
    or reported with the position they came from."""

    def __init__(self):
        # Maps href -> line number of its first occurrence.
        self.urefs = {}
        # Supplied by the parser through setDocumentLocator().
        self.locator = None

    def get_urefs(self):
        """Return the collected links as a list of Reference objects.

        Every Reference uses the locator's system id as its origin.
        The link table and the locator are reset afterwards so the same
        handler instance can be reused for the next document."""

        # A list comprehension replaces the former tuple-unpacking
        # lambda, which is Python-2-only syntax.  The locator is only
        # consulted when there is at least one link, as before.
        urefs = [Reference(self.locator.getSystemId(), line, href)
                 for href, line in self.urefs.items()]
        self.urefs = {}
        self.locator = None
        return urefs

    def setDocumentLocator(self, locator):
        "Receive an object for locating the origin of SAX document events."
        self.locator = locator

    def startElement(self, name, attrs):
        "Look for anchors and store links."

        if name != 'a':
            return
        # Named anchors (<a name="...">) carry no href.  Depending on the
        # attribute-list implementation getValue() then either raises
        # KeyError or yields None; in both cases there is no link to keep.
        try:
            href = attrs.getValue('href')
        except KeyError:
            href = None
        if href is None:
            return
        # Only the first occurrence of a link is recorded.
        # Assumes the parser has supplied a locator by now.
        if href not in self.urefs:
            self.urefs[href] = self.locator.getLineNumber()

# Module-level parser setup, executed at import time: one parser from the
# drv_xmlproc SAX driver and one DocumentHandler instance are created here
# and used by main() through the globals 'SAXparser' and 'handler'.
from xml.sax.drivers import drv_xmlproc
SAXparser=drv_xmlproc.SAX_XPParser()

# The handler records the <a href> targets of each document that is parsed.
handler = DocumentHandler()
SAXparser.setDocumentHandler(handler)
# NOTE(review): ErrorRaiser presumably turns reported parse errors into
# exceptions (main() catches saxlib.SAXParseException) -- confirm.
SAXparser.setErrorHandler(saxutils.ErrorRaiser())

def validate(url):
    """Validate (x)html conformance of *url* using 'tidy'.

    Runs ``tidy -errors -quiet <url>`` and prints a short message when
    the check fails, when tidy is terminated by a signal, or when tidy
    cannot be started.  tidy's own output goes to the inherited
    stdout/stderr.  Always returns None."""

    # Imported locally: only this function needs it.
    import subprocess

    if verbose:
        print('validating %s' % url)
    try:
        # The arguments are passed as a list so the url never goes through
        # a shell.  Urls are harvested from parsed documents and may
        # contain characters such as ';', '&' or '`' that a shell would
        # interpret as commands.
        status = subprocess.call(['tidy', '-errors', '-quiet', url])
    except OSError:
        # tidy is not installed or not executable.  sys.exc_info() is
        # used so the code does not depend on a particular 'except'
        # binding syntax.
        print('internal error: %s' % sys.exc_info()[1])
        return
    if status < 0:
        # A negative return code is the number of the terminating signal.
        print('internal error: %s' % -status)
    elif status != 0:
        print('validation failed')
        
def usage():
    """Print the command-line synopsis and the option list to stdout."""
    print('Usage : %s [options] <input files>' % sys.argv[0])
    # The --validate entry states what validate() really does (it runs the
    # local 'tidy' program), and --maximum shows that it takes a value.
    print("""
List of options:

  -h, --help             help
  -p, --print            provide verbose feedback during validation
  -m, --maximum <n>      maximum number of pages to validate (default: 50)
  -v, --validate         validate html using 'tidy'
  -e, --external         follow external links
""")

def main():
    """Command-line entry point: crawl the input documents and their links.

    Parses the options described by usage(), then works through a queue
    of references in first-in-first-out order, starting with the input
    files.  Each url is parsed once and optionally validated, and the
    links found in it are appended to the queue.  The loop ends when the
    queue is empty, when the page limit is reached, or at the first
    parse error.
    """
    global verbose

    max_pages = 50          # local name avoids shadowing the builtin max()
    external = False
    do_validate = False

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'pm:evh',
                                   ['print', 'maximum=', 'external',
                                    'validate', 'help'])
    except getopt.GetoptError:
        # Unknown option or missing option argument: report it and show
        # the usage text instead of dying with a traceback.
        sys.stderr.write('%s\n' % sys.exc_info()[1])
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ('-h', '--help'):
            usage()
            sys.exit(0)
        elif o in ('-p', '--print'):
            verbose = True
        elif o in ('-m', '--maximum'):
            try:
                max_pages = int(a)
            except ValueError:
                sys.stderr.write('invalid page count: %s\n' % a)
                usage()
                sys.exit(2)
        elif o in ('-e', '--external'):
            external = True
        elif o in ('-v', '--validate'):
            do_validate = True

    if not args:
        usage()
        sys.exit(0)

    # Urls already processed; only membership and size are needed.
    done = set()
    # Seed the queue with every input file, not just the first one.
    urefs = [Reference('.', 0, arg) for arg in args]
    while urefs and len(done) < max_pages:

        uref = urefs.pop(0)
        # Resolve the link relative to the document it was found in.
        url = urlparse.urljoin(uref.orig, uref.ref)
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        # Unless --external was given, follow only scheme-less and
        # 'file' urls.
        if not external and scheme and scheme != 'file':
            continue
        # Drop the fragment so 'page#a' and 'page#b' count as one page.
        url = urlparse.urlunsplit((scheme, location, path, query, ''))
        if url in done:
            continue

        if verbose:
            print('parsing %s' % url)
        try:
            SAXparser.parse(url)
        except saxlib.SAXParseException:
            # sys.exc_info() avoids depending on a particular 'except'
            # binding syntax.
            sys.stderr.write("%s; processing aborted\n" % sys.exc_info()[1])
            break

        if do_validate:
            validate(url)
        done.add(url)
        # Queue the links collected by the handler during this parse.
        urefs.extend(handler.get_urefs())

if __name__ == '__main__':

    main()