1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
|
# coding:utf-8
"""A simple benchmark that measures speed of lxml and selectolax.
How the benchmark works
-----------------------
For each page, we extract:
1) Title
2) Number of script tags
3) The ``href`` attribute from all links
4) The content of the Meta description tag
"""
import functools
import json
import time
from bs4 import BeautifulSoup
from html5_parser import parse
from lxml.html import fromstring
from selectolax.parser import HTMLParser
from selectolax.lexbor import LexborHTMLParser
bad_urls = []
def bs4_parser(html_content, parser=HTMLParser):
    """Extract title, link hrefs, script count, and meta description via BeautifulSoup.

    The *parser* argument exists only for signature parity with
    ``selectolax_parser``; this implementation does not use it.
    Raises AssertionError when the page has no title text, fewer than
    5 links, or no script tags.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_text = soup.title.string
    assert title_text
    hrefs = []
    for anchor in soup.find_all('a'):
        hrefs.append(anchor.attrs.get('href', ''))
    assert len(hrefs) >= 5, 'href'
    script_count = len(soup.find_all('script'))
    assert script_count > 0, 'script'
    meta = soup.find('meta', attrs={"name": "description"})
    if meta:
        meta_content = meta.get('content')
def selectolax_parser(html_content, parser=HTMLParser):
    """Extract title, link hrefs, script count, and meta description via selectolax.

    *parser* selects the backend (``HTMLParser`` for Modest, or
    ``LexborHTMLParser``).  Raises AssertionError when the page has no
    title text, fewer than 5 links, or no script tags.
    """
    tree = parser(html_content)
    # css_first returns None when the document has no <title>.
    node = tree.css_first('title')
    title_text = node.text() if node else ""
    assert title_text
    hrefs = []
    for anchor in tree.css('a[href]'):
        hrefs.append(anchor.attrs.get('href', ''))
    assert len(hrefs) >= 5, 'href'
    script_count = len(tree.css('script'))
    assert script_count > 0, 'script'
    meta = tree.css_first('meta[name="description"]')
    if meta:
        # sget is selectolax's "safe get": returns the default even when the
        # attribute is present with a None value.
        meta_content = meta.attrs.sget('content', '')
def lxml_parser(html_content):
    """Extract title, link hrefs, script count, and meta description via lxml.

    Raises AssertionError when the page has no title, fewer than 5 links,
    or no script tags.
    """
    tree = fromstring(html_content)
    title_text = tree.xpath('//title/text()')
    assert title_text, 'title'
    a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')]
    assert len(a_hrefs) >= 5, 'href'
    num_script_tags = len(tree.xpath('//script'))
    assert num_script_tags > 0, 'script'
    # BUG FIX: the original relative path 'meta[@name="description"]' only
    # matched <meta> elements that are direct children of the root <html>
    # element, so it never found the description inside <head>.  Use the
    # absolute '//meta[...]' form, consistent with the other queries above.
    meta_description = tree.xpath('//meta[@name="description"]')
    if meta_description:
        meta_content = meta_description[0].attrib.get('content', '')
def html5_parser(html_content):
    """Extract title, link hrefs, script count, and meta description via html5-parser.

    Raises AssertionError when the page has no title, fewer than 5 links,
    or no script tags.
    """
    tree = parse(html_content)
    title_text = tree.xpath('//title/text()')
    assert title_text, 'title'
    a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')]
    assert len(a_hrefs) >= 5, 'href'
    num_script_tags = len(tree.xpath('//script'))
    assert num_script_tags > 0, 'script'
    # BUG FIX: the original relative path 'meta[@name="description"]' only
    # matched <meta> elements that are direct children of the root element,
    # so it never found the description inside <head>.  Use the absolute
    # '//meta[...]' form, consistent with the other queries above.
    meta_description = tree.xpath('//meta[@name="description"]')
    if meta_description:
        meta_content = meta_description[0].attrib.get('content', '')
def _perform_test(pages, parse_func):
for page in pages:
parse_func(page['html'])
def main():
    """Benchmark every registered parser over the corpus and print elapsed time."""
    #
    # The corpus holds 754 main pages from the top internet domains
    # (according to Alexa rank), about 324MB of HTML.  It is not published
    # because of potential copyright infringements.
    #
    with open('pages/pages.json', 'rt') as pages_file:
        html_pages = [json.loads(line) for line in pages_file]
    available_parsers = (
        ('bs4', bs4_parser),
        ('lxml', lxml_parser),
        ('html5_parser', html5_parser),
        ('modest', selectolax_parser),
        ('lexbor', functools.partial(selectolax_parser, parser=LexborHTMLParser)),
    )
    for parser_name, parser in available_parsers:
        started_at = time.time()
        _perform_test(html_pages, parser)
        print('%r: %s' % (parser_name, time.time() - started_at))
# Script entry point: run the benchmark only when executed directly.
if __name__ == '__main__':
    main()
|