File: html_utils_test.py

package info (click to toggle)
pyglossary 5.0.9-1
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,896 kB
  • sloc: python: 46,165; sh: 308; javascript: 100; xml: 42; makefile: 28
file content (77 lines) | stat: -rw-r--r-- 1,982 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: utf-8 -*-

import sys
import unittest
from os.path import abspath, dirname

# Put the parent of this file's directory at the front of sys.path before the
# pyglossary import below (presumably so the in-tree package is the one that
# gets imported -- confirm against the repository layout).
rootDir = dirname(dirname(abspath(__file__)))
sys.path.insert(0, rootDir)

from pyglossary.html_utils import unescape_unicode


class UnescapeUnicodeTest(unittest.TestCase):
	"""Check ``unescape_unicode`` against fixed (input, expected) pairs."""

	def case(self, text, expected):
		"""Assert that ``unescape_unicode(text)`` equals ``expected``."""
		actual = unescape_unicode(text)
		self.assertEqual(actual, expected)

	def test(self):
		"""Run every (input, expected) pair through ``case``."""
		# NOTE(review): many inputs below are identical to their expected
		# output. They look like entity references that were already decoded
		# when this copy of the file was produced -- confirm against upstream
		# before relying on these cases.
		self.case("<", "<")
		self.case(">", ">")
		self.case("&", "&")
		# Single-quoted on purpose: writing this literal with double quotes
		# yields three consecutive double quotes, which opens a triple-quoted
		# string and collapses the call to one argument (a TypeError).
		self.case('"', '"')
		self.case("'", "'")
		self.case(" ", " ")
		self.case(" ", " ")

		self.case("<á>", "<á>")

		self.case("/wəːkiŋtiːm/", "/wəːkiŋtiːm/")

		# Babylon dictionaries contain a lot of non-standard entity
		# references, for example: csdot, fllig, nsm, cancer, thlig,
		# tsdot, upslur...
		self.case("<&etilde;", "<ẽ")
		self.case("<⅓", "<⅓")
		self.case("<⅔", "<⅔")
		self.case("<ĩ", "<ĩ")
		self.case("<&ldash;", "<–")
		self.case("<ů", "<ů")
		self.case("<ũ", "<ũ")
		self.case("<&wring;", "<ẘ")
		self.case("<&xfrac13;", "<⅓")
		self.case("<ŷ", "<ŷ")
		self.case("<&ygrave;", "<ỳ")
		self.case("<&yring;", "<ẙ")
		self.case("<&ytilde;", "<ỹ")


def benchmark_main():
	"""Time ``unescape_unicode`` on randomly generated English phrases.

	Builds 20 texts of 10 randomly chosen words each (every word followed
	by a single space), prints their average length, then prints the
	``timeit`` measurement for passing all of them through
	``unescape_unicode``.
	"""
	import timeit
	from random import choice

	from english_words import english_words_set

	vocabulary = list(english_words_set)
	samples = [
		"".join(choice(vocabulary) + " " for _ in range(10))
		for _ in range(20)
	]

	print("avg length:", sum(map(len, samples)) / len(samples))

	def run_benchmark1():
		for sample in samples:
			unescape_unicode(sample)

	# timeit resolves the statement by name in the locals() snapshot, so the
	# helper above has to stay named ``run_benchmark1``.
	print("benchmark 1:", timeit.timeit("run_benchmark1()", globals=locals()))


if __name__ == "__main__":
	# "-b" anywhere on the command line selects the benchmark; otherwise the
	# unit tests are run.
	entry_point = benchmark_main if "-b" in sys.argv else unittest.main
	entry_point()