File: links.g

package info (click to toggle)
antlr 2.7.7%2Bdfsg-9.2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 9,964 kB
  • sloc: java: 54,649; cs: 12,537; makefile: 8,847; cpp: 7,359; pascal: 5,273; sh: 4,333; python: 4,299; lisp: 1,969; xml: 220; lex: 192; ansic: 127
file content (117 lines) | stat: -rw-r--r-- 2,823 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

header {
import sys
import string
}

// Code for the generated LinkExtractor.__init__: the lexer starts out with
// no listener registered (see addLinkListener / removeLinkListener).
header "LinkExtractor.__init__" {
    self.listener = None
}       

// File-level options: generate the lexer as Python code.
options {
    language = "Python";
}

/** Parse an entire HTML file, firing events to a single listener
 *  for each image and href link encountered.  All tokens are
 *  defined to skip so the lexer will continue scarfing until EOF.
 *
 *  Only the AHREF and IMG rules are non-protected; everything else
 *  in the input is consumed by the SCARF filter rule.
 */
class LinkExtractor extends Lexer;

options {
    // Match literals such as "<a" and "<img" without regard to case.
    caseSensitive=false;
    // Two characters of lookahead.
    k=2;
    // Filter mode: SCARF is the rule named to handle input the token rules do not match.
    filter=SCARF;
    // Character vocabulary is limited to '\3'..'\177'.
    charVocabulary='\3'..'\177';
}

{
    /** Register the object that receives link events; it replaces any listener set earlier. */
    /** The listener is called as imageReference(target, line) and hrefReference(target, line). */
    def addLinkListener(self, listener):
        self.listener = listener

    /** Unregister the current listener. */
    /** The listener argument is not consulted: whatever is registered is cleared. */
    def removeLinkListener(self, listener):
        self.listener = None

    /** Report an image link (target text and line number) to the registered listener. */
    /** With no listener registered (initial state, or after removeLinkListener) the event is dropped. */
    def fireImageLinkEvent(self, target, line):
        if self.listener is None:
            return
        self.listener.imageReference(target, line)

    /** Report an href link (target text and line number) to the registered listener. */
    /** With no listener registered (initial state, or after removeLinkListener) the event is dropped. */
    def fireHREFLinkEvent(self, target, line):
        if self.listener is None:
            return
        self.listener.hrefReference(target, line)

    /** Strip the delimiting quotes from a "..." or '...' string and return the text between them. */
    /** The first quote character found selects the delimiter, so the other kind may appear inside the value. */
    /** Text that contains no pair of matching quotes is returned unchanged. */
    def stripQuotes(src):
        dq = src.find('"')
        sq = src.find("'")
        if dq == -1 and sq == -1:
            return src
        if sq == -1 or (dq != -1 and dq < sq):
            quote = '"'
            h = dq
        else:
            quote = "'"
            h = sq
        t = src.rfind(quote)
        if t == h:
            return src
        return src[h+1:t]
    stripQuotes = staticmethod(stripQuotes)

}

/** Anchor tag: "<a", whitespace, one or more attributes, then '>'.
 *  Link events are fired from the ATTR action; the token itself is skipped.
 */
AHREF
        :       "<a" WS (ATTR)+ '>'     { $skip }
        ;

/** Image tag: "<img", whitespace, one or more attributes, then '>'.
 *  Link events are fired from the ATTR action; the token itself is skipped.
 */
IMG     :       "<img" WS (ATTR)+ '>'   { $skip }
        ;

/** One name=value attribute inside a tag.  The value is either a quoted
 *  STRING (its quotes are removed with stripQuotes) or an unquoted WORD.
 *  An attribute named href fires an HREF link event and one named src
 *  fires an image link event, each with the current line number; the
 *  name is compared in lower case.  Other attributes fire nothing.
 *  The rule does not check which tag (AHREF or IMG) it was reached from.
 */
protected
ATTR
options {
        ignore=WS;
}
        :       w:WORD '='
                (       s:STRING
                |       v:WORD
                )
                {
                    if s is not None:
                        target = self.stripQuotes(s.getText())
                    else:
                        target = v.getText()
                    name = w.getText().lower()
                    if name == "href":
                        self.fireHREFLinkEvent(target, self.getLine())
                    elif name == "src":
                        self.fireImageLinkEvent(target, self.getLine())
                }
        ;

/** One or more characters from 'a'..'z', '0'..'9', '/', '.', '#', '_'.
 *  Used by ATTR for both attribute names and unquoted attribute values
 *  (file names, integers, etc.).  Any character outside this set, not
 *  only whitespace, ends the word; ':' and '-' are examples.
 */
protected
WORD:   (
                        options {
                                generateAmbigWarnings=false;
                        }
                :       'a'..'z' | '0'..'9' | '/' | '.' | '#' | '_'
                )+
        ;

/** Quoted attribute value: "..." or '...'.  The body is every character
 *  up to the next occurrence of the opening quote character; there is no
 *  escape sequence.  The matched text includes both quotes (ATTR removes
 *  them with stripQuotes).
 */
protected
STRING
        :       '"' (~'"')* '"'
        |       '\'' (~'\'')* '\''
        ;

/** A single whitespace unit: space, tab, form feed, or one line terminator
 *  ("\r\n", '\r' or '\n').  A line terminator also runs $newline, which
 *  keeps the line number reported with link events up to date.  The
 *  matched text is skipped.
 */
protected
WS      :       (       ' '
                |       '\t'
                |       '\f'
                |       (       "\r\n"  // DOS
                        |       '\r'    // Macintosh
                        |       '\n'    // Unix (the right way)
                        )
                        { $newline }
                )
                { $skip }
        ;

/** Filter rule named by the lexer option filter=SCARF.  It consumes either
 *  one whitespace unit via WS, so that newlines are still counted, or any
 *  single character.
 */
protected
SCARF
        :       WS      // track line numbers while you scarf
        |       .
        ;