File: benchmark.py

package info (click to toggle)
html5-parser 0.4.5-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,752 kB
  • sloc: ansic: 32,440; python: 2,022; makefile: 13
file content (108 lines) | stat: -rwxr-xr-x 3,394 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import textwrap
from functools import partial

import html5_parser
import html5lib
from bs4 import BeautifulSoup

try:
    from time import monotonic
except ImportError:
    from time import time as monotonic

# Path of the locally cached sample document used as benchmark input.
TF = 'test/large.html'
try:
    # Prefer the cached copy; the context manager closes the handle promptly
    # instead of leaving that to the garbage collector.
    with open(TF, 'rb') as cached:
        raw = cached.read()
except (IOError, OSError):
    # Only a failure to read the cache triggers a download; unrelated errors
    # are allowed to propagate rather than being mistaken for a cache miss.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    print('Downloading large HTML file...')
    response = urlopen('https://www.w3.org/TR/html5/single-page.html')
    # try/finally rather than ``with``: the object returned by the Python 2
    # ``urllib.urlopen`` fallback is not a context manager.
    try:
        raw = response.read()
    finally:
        response.close()
    # Cache the download so later runs can skip the network.
    with open(TF, 'wb') as cache:
        cache.write(raw)

print('Testing with HTML file of', '{:,}'.format(len(raw)), 'bytes')


def timeit(func, number=1):
    """Return the mean wall-clock time of calling ``func``.

    ``func`` is invoked ``number`` times with no arguments. Each call is timed
    on its own with ``monotonic`` and the average of those timings is
    returned. Raises ZeroDivisionError when ``number`` is 0.
    """

    def single_run():
        started = monotonic()
        result = func()
        elapsed = monotonic() - started
        # Drop the result only after the clock has been read, so releasing it
        # is not counted as part of the call.
        del result
        return elapsed

    return sum(single_run() for _ in range(number)) / number


def doit(name, func, num=20):
    """Time ``func`` under the label ``name`` and print the outcome.

    Announces the run, measures the average duration of ``num`` calls via
    ``timeit``, prints that average formatted to three decimals and returns
    it.
    """
    print('Parsing', num, 'times with', name)
    mean_seconds = timeit(func, num)
    summary = 'took an average of: {:,.3f} seconds to parse it'.format(mean_seconds)
    print(name, summary)
    return mean_seconds


# Command line interface: an optional positional treebuilder name (defaulting
# to lxml) and the repetition count.
p = argparse.ArgumentParser(description='Benchmark html5-parser')
treebuilder_names = 'lxml soup dom etree'.split()
p.add_argument('treebuilder', nargs='?', choices=treebuilder_names, default='lxml')
num_help = 'Number of repetitions for html5lib (html5-parser will use 10x as many reps)'
p.add_argument('--num', '-n', type=int, default=10, help=num_help)
args = p.parse_args()

# html5-parser building the tree type selected on the command line. This run
# gets ten times the repetitions of the others.
parse_native = partial(
    html5_parser.parse, raw, transport_encoding="utf-8",
    namespace_elements=True, treebuilder=args.treebuilder)
base_time = doit('html5-parser', parse_native, num=args.num * 10)

# html5-parser producing a BeautifulSoup tree.
parse_to_soup = partial(
    html5_parser.parse, raw, transport_encoding="utf-8", treebuilder='soup')
soup_time = doit('html5-parser-to-soup', parse_to_soup, num=args.num)

# html5lib with the same treebuilder name as the first run.
# NOTE(review): the CLI also offers 'soup'; confirm html5lib.parse accepts
# that treebuilder name.
parse_html5lib = partial(
    html5lib.parse, raw, transport_encoding="utf-8", treebuilder=args.treebuilder)
h5time = doit('html5lib', parse_html5lib, num=args.num)

# BeautifulSoup driven by the html5lib backend and by the lxml backend.
soup_via_html5lib = partial(BeautifulSoup, raw, 'html5lib')
soup5_time = doit('BeautifulSoup-with-html5lib', soup_via_html5lib, num=args.num)
soup_via_lxml = partial(BeautifulSoup, raw, 'lxml')
soup4_time = doit('BeautifulSoup-with-lxml', soup_via_lxml, num=args.num)


def row(*args):
    """Print one table line made of the given cells.

    Every cell is converted with ``str``, padded to at least 18 characters
    (longer values are not truncated) and followed by a ``|``. The line ends
    with a newline.
    """
    print(''.join('{:18s}'.format(str(cell)) + '|' for cell in args))


# Final report: an explanatory paragraph followed by a table of speedup
# factors, each being the other parser's average time divided by the matching
# html5-parser average.
print()
print(textwrap.fill(
    'Results are below. They show how much faster html5-parser is than'
    ' each specified parser. Note that there are two additional considerations:'
    ' what the final tree is and whether the parsing supports the HTML 5'
    ' parsing algorithm. The most apples-to-apples comparison is when the'
    ' final tree is lxml and HTML 5 parsing is supported by the parser being compared to.'
    ' Note that in this case, we have the largest speedup. In all other cases,'
    ' speedup is less because of the overhead of building the final tree'
    ' in python instead of C or because the compared parser does not use'
    ' the HTML 5 parsing algorithm or both.'))
print()
row('Parser', 'Tree', 'Supports HTML 5', 'Speedup (factor)')
print('=' * 79)
# Both timings in this row were taken with the treebuilder chosen on the
# command line, so label the row with it instead of a fixed 'lxml'.
row('html5lib', args.treebuilder, 'yes', round(h5time / base_time))
# The BeautifulSoup rows are compared against html5-parser's soup output.
row('soup+html5lib', 'BeautifulSoup', 'yes', round(soup5_time / soup_time))
row('soup+lxml.html', 'BeautifulSoup', 'no', round(soup4_time / soup_time))