1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
|
# -*- coding: utf-8 -*-
"""
Pygments tests with example files
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
from __future__ import print_function
import os
import pprint
import difflib
import pickle
from pygments.lexers import get_lexer_for_filename, get_lexer_by_name
from pygments.token import Error
from pygments.util import ClassNotFound
import support
# When true, check_lexer() pickles each example file's token list into
# examplefiles/output on the first run and compares against that stored
# list on later runs.
STORE_OUTPUT = False
# Timing data filled in by check_lexer(): maps an example file's base name
# to a tuple (text length in chars, lexing time in ms, ms per char).
# Reset at the start of test_example_files().
STATS = {}
# Directory containing this test module; the 'examplefiles' directory is
# looked up relative to it.
TESTDIR = os.path.dirname(__file__)
# Jython generates a StackOverflowError for repetitions of the form (a|b)+,
# which are commonly used in string patterns, when matching more than about 1000
# chars. These tests do not complete. See http://bugs.jython.org/issue1965
BAD_FILES_FOR_JYTHON = ('Object.st', 'all.nit', 'genclass.clj',
'ragel-cpp_rlscan')
def test_example_files():
    """Generate one lexer check per file in the ``examplefiles`` directory.

    Resets the module-level ``STATS`` dict, creates the ``output``
    directory when ``STORE_OUTPUT`` is enabled, and then yields
    ``(check_lexer, lexer, filename)`` for every regular file whose name
    neither starts with ``.`` nor ends with ``#``.

    A file named ``<lexer>_<rest>`` is checked with the lexer registered
    under the name ``<lexer>`` if there is one; otherwise the lexer is
    looked up from the file name and the file contents.

    After the last check has been yielded, the seven slowest files (by
    absolute and by per-character lexing time, taken from ``STATS``) are
    printed.

    :raises AssertionError: if no lexer can be found for an example file.
    """
    global STATS
    STATS = {}

    examples_dir = os.path.join(TESTDIR, 'examplefiles')
    output_dir = os.path.join(examples_dir, 'output')
    if STORE_OUTPUT and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for name in os.listdir(examples_dir):
        # Skip dot-files and names ending in '#'.
        if name.startswith('.') or name.endswith('#'):
            continue
        path = os.path.join(examples_dir, name)
        # Only regular files are examples (this skips the output directory).
        if not os.path.isfile(path):
            continue

        print(path)
        with open(path, 'rb') as stream:
            raw = stream.read()
        # The decoded contents are only used to help pick a lexer below.
        try:
            source = raw.decode('utf-8')
        except UnicodeError:
            source = raw.decode('latin1')

        # An explicit "<lexer>_" prefix in the file name takes precedence.
        lexer = None
        prefix, underscore, _rest = name.partition('_')
        if underscore:
            try:
                lexer = get_lexer_by_name(prefix)
            except ClassNotFound:
                # Prefix is not a known lexer name; fall back to the
                # file-name based lookup.
                pass
        if lexer is None:
            try:
                lexer = get_lexer_for_filename(path, code=source)
            except ClassNotFound:
                raise AssertionError('file %r has no registered extension, '
                                     'nor is of the form <lexer>_filename '
                                     'for overriding, thus no lexer found.'
                                     % name)
        yield check_lexer, lexer, name

    # Timing report; STATS has been populated by the check_lexer runs.
    top_n = 7
    row_format = '%-30s %6d chars %8.2f ms %7.3f ms/char'

    by_absolute = sorted(STATS.items(), key=lambda item: item[1][1])
    print('\nExample files that took longest absolute time:')
    for name, timing in by_absolute[-top_n:]:
        print(row_format % ((name,) + timing))
    print()

    # Re-sort the already sorted list so ties keep their absolute-time order.
    by_relative = sorted(by_absolute, key=lambda item: item[1][2])
    print('\nExample files that took longest relative time:')
    for name, timing in by_relative[-top_n:]:
        print(row_format % ((name,) + timing))
def check_lexer(lx, fn):
    """Lex one example file and verify the resulting token stream.

    The file ``examplefiles/<fn>`` below ``TESTDIR`` is read, ``\\r\\n`` is
    normalised to ``\\n``, leading/trailing newlines are replaced by a
    single trailing one, and the decoded text is passed through
    ``lx.get_tokens``.  The check fails if the lexer emits an ``Error``
    token or if concatenating all token values does not reproduce the
    input text.  Timing figures for the run are recorded in ``STATS``
    under the file's base name.

    When ``STORE_OUTPUT`` is true, the token list is pickled to
    ``examplefiles/output/<fn>`` on the first run and compared with the
    stored list on subsequent runs.

    :param lx: lexer instance; only its ``get_tokens(text)`` method is used.
    :param fn: name of the example file inside the ``examplefiles`` directory.
    :raises support.SkipTest: on Jython for files in ``BAD_FILES_FOR_JYTHON``.
    :raises AssertionError: on an error token, a failed round trip, or a
        mismatch with previously stored output.
    """
    import time

    if os.name == 'java' and fn in BAD_FILES_FOR_JYTHON:
        raise support.SkipTest

    absfn = os.path.join(TESTDIR, 'examplefiles', fn)
    with open(absfn, 'rb') as fp:
        text = fp.read()
    text = text.replace(b'\r\n', b'\n')
    text = text.strip(b'\n') + b'\n'
    try:
        text = text.decode('utf-8')
        # Drop a leading byte order mark so it is not part of the round trip.
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        text = text.decode('latin1')

    values = []
    tokens = []
    t1 = time.time()
    for ttype, val in lx.get_tokens(text):
        values.append(val)
        # Raised explicitly (not via ``assert``) so the check also runs
        # under ``python -O``; the reported position is the offset just
        # past the offending token.
        if ttype == Error:
            raise AssertionError(
                'lexer %s generated error token for %s: %r at position %d'
                % (lx, absfn, val, len(u''.join(values))))
        tokens.append((ttype, val))
    t2 = time.time()

    elapsed_ms = 1000 * (t2 - t1)
    # ``text`` always ends with a newline, so its length is never zero.
    STATS[os.path.basename(absfn)] = (len(text), elapsed_ms,
                                      elapsed_ms / len(text))

    lexed = u''.join(values)
    if lexed != text:
        # splitlines() drops the line endings, so lineterm='' keeps the
        # diff header lines newline-free as well.
        print('\n'.join(difflib.unified_diff(lexed.splitlines(),
                                             text.splitlines(),
                                             lineterm='')))
        raise AssertionError('round trip failed for ' + absfn)

    # check output against previous run if enabled
    if not STORE_OUTPUT:
        return

    outfn = os.path.join(TESTDIR, 'examplefiles', 'output', fn)
    # no previous output -- store it
    if not os.path.isfile(outfn):
        with open(outfn, 'wb') as fp:
            pickle.dump(tokens, fp)
        return

    # otherwise load it and compare
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # assumes the output directory only holds files written by this test.
    with open(outfn, 'rb') as fp:
        stored_tokens = pickle.load(fp)
    if stored_tokens != tokens:
        f1 = pprint.pformat(stored_tokens)
        f2 = pprint.pformat(tokens)
        print('\n'.join(difflib.unified_diff(f1.splitlines(),
                                             f2.splitlines(),
                                             lineterm='')))
        raise AssertionError(absfn)
|