1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
# -*- coding: utf-8 -*-
import sys
import unittest
from os.path import abspath, dirname
# Two dirname() calls on this file's absolute path give the parent of the
# directory that contains this file.
rootDir = dirname(dirname(abspath(__file__)))
# Insert it at the front of the import search path, ahead of existing entries,
# before the pyglossary import that follows.
sys.path.insert(0, rootDir)
from pyglossary.html_utils import unescape_unicode
class UnescapeUnicodeTest(unittest.TestCase):
    """Compare the output of ``unescape_unicode`` with expected strings."""

    def case(self, text, expected):
        """Assert that ``unescape_unicode(text)`` equals ``expected``.

        Args:
            text: Input string handed to ``unescape_unicode``.
            expected: String the call is expected to return.
        """
        actual = unescape_unicode(text)
        self.assertEqual(actual, expected)

    def test(self):
        """Run every input/expected pair below through ``case``."""
        # In this first group the expected value equals the input.
        self.case("<", "<")
        self.case(">", ">")
        self.case("&", "&")
        # Single quotes are required here: three consecutive double quotes
        # would open a triple-quoted string and collapse the call into a
        # single argument, making ``case`` raise TypeError.
        self.case('"', '"')
        self.case("'", "'")
        self.case(" ", " ")
        self.case(" ", " ")
        self.case("<á>", "<á>")
        self.case("/wəːkiŋtiːm/", "/wəːkiŋtiːm/")

        # Babylon dictionaries contain a lot of non-standard entity
        # references, for example csdot, fllig, nsm, cancer, thlig,
        # tsdot, upslur...
        self.case("<&etilde;", "<ẽ")
        self.case("<⅓", "<⅓")
        self.case("<⅔", "<⅔")
        self.case("<ĩ", "<ĩ")
        self.case("<&ldash;", "<–")
        self.case("<ů", "<ů")
        self.case("<ũ", "<ũ")
        self.case("<&wring;", "<ẘ")
        self.case("<&xfrac13;", "<⅓")
        self.case("<ŷ", "<ŷ")
        self.case("<&ygrave;", "<ỳ")
        self.case("<&yring;", "<ẙ")
        self.case("<&ytilde;", "<ỹ")
def benchmark_main():
    """Time ``unescape_unicode`` on randomly generated English text.

    Builds 20 strings, each made of 10 randomly chosen words with a space
    after every word, prints their average length, and then prints the
    ``timeit.timeit`` measurement for passing all of them through
    ``unescape_unicode``.

    The ``english_words`` package is imported inside the function, so it is
    only required when the benchmark is actually run.
    """
    import timeit
    from random import choice

    from english_words import english_words_set

    # choice() needs an indexable sequence, so materialize the word set.
    vocabulary = list(english_words_set)

    samples = [
        "".join(choice(vocabulary) + " " for _ in range(10))
        for _ in range(20)
    ]

    total_chars = sum(map(len, samples))
    print("avg length:", total_chars / len(samples))

    def run_benchmark1():
        for sample in samples:
            unescape_unicode(sample)

    # The statement string is looked up in this function's locals, so the
    # helper above has to keep the name ``run_benchmark1``.
    print("benchmark 1:", timeit.timeit("run_benchmark1()", globals=locals()))
if __name__ == "__main__":
    # A "-b" argument anywhere on the command line selects the benchmark;
    # without it the unit tests are run.
    if "-b" not in sys.argv:
        unittest.main()
    else:
        benchmark_main()
|