1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
/* Tokenizer for JavaScript code */
var tokenizeJavaScript = (function() {
// Advance the stream until the given character (not preceded by a
// backslash) is encountered, or the end of the line is reached.
function nextUntilUnescaped(source, end) {
var escaped = false;
while (!source.endOfLine()) {
var next = source.next();
if (next == end && !escaped)
return false;
escaped = !escaped && next == "\\";
}
return escaped;
}
// A map of JavaScript's keywords. The a/b/c keyword distinction is
// very rough, but it gives the parser enough information to parse
// correct code correctly (we don't care that much how we parse
// incorrect code). The style information included in these objects
// is used by the highlighter to pick the correct CSS style for a
// token.
var keywords = function(){
function result(type, style){
return {type: type, style: "js-" + style};
}
// keywords that take a parenthised expression, and then a
// statement (if)
var keywordA = result("keyword a", "keyword");
// keywords that take just a statement (else)
var keywordB = result("keyword b", "keyword");
// keywords that optionally take an expression, and form a
// statement (return)
var keywordC = result("keyword c", "keyword");
var operator = result("operator", "keyword");
var atom = result("atom", "atom");
return {
"if": keywordA, "while": keywordA, "with": keywordA,
"else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB,
"return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "delete": keywordC, "throw": keywordC,
"in": operator, "typeof": operator, "instanceof": operator,
"var": result("var", "keyword"), "function": result("function", "keyword"), "catch": result("catch", "keyword"),
"for": result("for", "keyword"), "switch": result("switch", "keyword"),
"case": result("case", "keyword"), "default": result("default", "keyword"),
"true": atom, "false": atom, "null": atom, "undefined": atom, "NaN": atom, "Infinity": atom
};
}();
// Some helper regexps
var isOperatorChar = /[+\-*&%=<>!?|]/;
var isHexDigit = /[0-9A-Fa-f]/;
var isWordChar = /[\w\$_]/;
// Wrapper around jsToken that helps maintain parser state (whether
// we are inside of a multi-line comment and whether the next token
// could be a regular expression).
function jsTokenState(inside, regexp) {
return function(source, setState) {
var newInside = inside;
var type = jsToken(inside, regexp, source, function(c) {newInside = c;});
var newRegexp = type.type == "operator" || type.type == "keyword c" || type.type.match(/^[\[{}\(,;:]$/);
if (newRegexp != regexp || newInside != inside)
setState(jsTokenState(newInside, newRegexp));
return type;
};
}
// The token reader, intended to be used by the tokenizer from
// tokenize.js (through jsTokenState). Advances the source stream
// over a token, and returns an object containing the type and style
// of that token.
function jsToken(inside, regexp, source, setInside) {
function readHexNumber(){
source.next(); // skip the 'x'
source.nextWhileMatches(isHexDigit);
return {type: "number", style: "js-atom"};
}
function readNumber() {
source.nextWhileMatches(/[0-9]/);
if (source.equals(".")){
source.next();
source.nextWhileMatches(/[0-9]/);
}
if (source.equals("e") || source.equals("E")){
source.next();
if (source.equals("-"))
source.next();
source.nextWhileMatches(/[0-9]/);
}
return {type: "number", style: "js-atom"};
}
// Read a word, look it up in keywords. If not found, it is a
// variable, otherwise it is a keyword of the type found.
function readWord() {
source.nextWhileMatches(isWordChar);
var word = source.get();
var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
return known ? {type: known.type, style: known.style, content: word} :
{type: "variable", style: "js-variable", content: word};
}
function readRegexp() {
nextUntilUnescaped(source, "/");
source.nextWhileMatches(/[gi]/);
return {type: "regexp", style: "js-string"};
}
// Mutli-line comments are tricky. We want to return the newlines
// embedded in them as regular newline tokens, and then continue
// returning a comment token for every line of the comment. So
// some state has to be saved (inside) to indicate whether we are
// inside a /* */ sequence.
function readMultilineComment(start){
var newInside = "/*";
var maybeEnd = (start == "*");
while (true) {
if (source.endOfLine())
break;
var next = source.next();
if (next == "/" && maybeEnd){
newInside = null;
break;
}
maybeEnd = (next == "*");
}
setInside(newInside);
return {type: "comment", style: "js-comment"};
}
function readOperator() {
source.nextWhileMatches(isOperatorChar);
return {type: "operator", style: "js-operator"};
}
function readString(quote) {
var endBackSlash = nextUntilUnescaped(source, quote);
setInside(endBackSlash ? quote : null);
return {type: "string", style: "js-string"};
}
// Fetch the next token. Dispatches on first character in the
// stream, or first two characters when the first is a slash.
if (inside == "\"" || inside == "'")
return readString(inside);
var ch = source.next();
if (inside == "/*")
return readMultilineComment(ch);
else if (ch == "\"" || ch == "'")
return readString(ch);
// with punctuation, the type of the token is the symbol itself
else if (/[\[\]{}\(\),;\:\.]/.test(ch))
return {type: ch, style: "js-punctuation"};
else if (ch == "0" && (source.equals("x") || source.equals("X")))
return readHexNumber();
else if (/[0-9]/.test(ch))
return readNumber();
else if (ch == "/"){
if (source.equals("*"))
{ source.next(); return readMultilineComment(ch); }
else if (source.equals("/"))
{ nextUntilUnescaped(source, null); return {type: "comment", style: "js-comment"};}
else if (regexp)
return readRegexp();
else
return readOperator();
}
else if (isOperatorChar.test(ch))
return readOperator();
else
return readWord();
}
// The external interface to the tokenizer.
return function(source, startState) {
return tokenizer(source, startState || jsTokenState(false, true));
};
})();
|