File: test_pystemmer.py

package info (click to toggle)
pystemmer 3.0.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,092 kB
  • sloc: python: 309; sh: 26; makefile: 14
file content (103 lines) | stat: -rw-r--r-- 3,419 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import unittest


class PyStemmerBaseTestCase(unittest.TestCase):
    """Common helpers shared by the PyStemmer test cases.

    The ``Stemmer`` module is imported inside a helper method rather than
    at the top of the file, so the import happens each time a test asks
    for it.
    """

    def import_pystemmer(self):
        """Import the module named ``Stemmer`` and return it."""
        return __import__('Stemmer')

    def get_stemmer(self, lang):
        """Return ``Stemmer.Stemmer(lang)`` using the imported module.

        ``lang`` is passed through unchanged to the ``Stemmer`` constructor.
        """
        stemmer_factory = self.import_pystemmer().Stemmer
        return stemmer_factory(lang)


class PyStemmerGenericTests(PyStemmerBaseTestCase):
    """Language-independent checks on the imported ``Stemmer`` module."""

    def _assert_module_has(self, attribute):
        """Import the module and assert that it exposes ``attribute``."""
        module = self.import_pystemmer()
        self.assertTrue(hasattr(module, attribute))

    def test_import(self):
        """The module can be imported and has a ``__file__`` attribute."""
        self._assert_module_has('__file__')

    def test_has_version(self):
        """The module exposes a ``version`` attribute."""
        self._assert_module_has('version')

    def test_has_algorithms(self):
        """The module exposes an ``algorithms`` attribute."""
        self._assert_module_has('algorithms')


class PyStemmerEnglishTests(PyStemmerBaseTestCase):
    """Tests for the ``'english'`` stemmer."""

    def setUp(self):
        """Build the English stemmer used by every test in this class."""
        self.stemmer = self.get_stemmer('english')

    def test_stemWord(self):
        """A single word is stemmed to the expected form."""
        self.assertEqual(self.stemmer.stemWord('cycling'), 'cycl')

    def test_stemWords(self):
        """A list of words is stemmed element by element."""
        self.assertEqual(self.stemmer.stemWords(['cycling', 'cyclist']),
                         ['cycl', 'cyclist'])

    def test_stemWords_unicode_simple(self):
        """A list mixing plain and ``u''``-prefixed literals is stemmed."""
        self.assertEqual(self.stemmer.stemWords(['cycling', u'cyclist']),
                         ['cycl', u'cyclist'])

    def get_voc_words_file(self):
        """Open ``en_voc.txt`` located next to this file and return it.

        The file object is returned open; the caller is expected to close
        it (for example by using it as a context manager).
        """
        import os
        here = os.path.dirname(__file__)

        return open(os.path.join(here, 'en_voc.txt'))

    def test_stemWord_many_times(self):
        """Stem every line of the vocabulary file without checking results."""
        # This test runs stemWord on a large number of words (29417)
        # so that we force cache purging to be tested

        with self.get_voc_words_file() as voc_words_file:
            for word in voc_words_file:
                # The return value is deliberately discarded: the test only
                # checks that the calls complete without raising.
                self.stemmer.stemWord(word.strip())


class PyStemmerFrenchTests(PyStemmerBaseTestCase):
    """Tests for the ``'french'`` stemmer."""

    def setUp(self):
        """Build the French stemmer used by the tests in this class."""
        self.stemmer = self.get_stemmer('french')

    def test_stemWord(self):
        """A single French word is stemmed to the expected form."""
        stemmed = self.stemmer.stemWord('cyclisme')
        self.assertEqual(stemmed, 'cyclism')


class PyStemmerGermanTests(PyStemmerBaseTestCase):
    """Tests for the ``'german'`` stemmer."""

    def setUp(self):
        """Build the German stemmer used by the tests in this class."""
        self.stemmer = self.get_stemmer('german')

    def test_stemWord(self):
        """Both a single word and a two-word string give the expected stem."""
        # (input, expected stem) pairs, checked in order; the first
        # mismatch fails the test.
        cases = (
            ('Fahrradfahren', 'Fahrradfahr'),
            ('Rad fahren', 'Rad fahr'),
        )
        for word, expected in cases:
            self.assertEqual(self.stemmer.stemWord(word), expected)


class PyStemmerRussianTests(PyStemmerBaseTestCase):
    """Tests for the ``'russian'`` stemmer."""

    def setUp(self):
        """Build the Russian stemmer used by the tests in this class."""
        self.stemmer = self.get_stemmer('russian')

    def test_stemWord(self):
        """A three-word Cyrillic string differs from its stem only at the end."""
        # The text is spelled as UTF-8 byte escapes and decoded below.
        # The first two space-separated pieces are the same in the input
        # and in the expected stem; only the last piece is shortened.
        shared_prefix = [
            b'\xd1\x81\xd0\xbe\xd0\xb2\xd0\xb5\xd1\x80\xd1\x88\xd0\xb0\xd1\x82\xd1\x8c',
            b'\xd1\x86\xd0\xb8\xd0\xba\xd0\xbb',
        ]
        last_word = b'\xd1\x80\xd0\xb0\xd0\xb7\xd0\xb2\xd0\xb8\xd1\x82\xd0\xb8\xd1\x8f'
        last_stem = b'\xd1\x80\xd0\xb0\xd0\xb7\xd0\xb2\xd0\xb8\xd1\x82'

        separator = b' '
        word = separator.join(shared_prefix + [last_word]).decode('utf-8')
        stem = separator.join(shared_prefix + [last_stem]).decode('utf-8')

        self.assertEqual(self.stemmer.stemWord(word), stem)


class PyStemmerHungarianTests(PyStemmerBaseTestCase):
    """Tests for the ``'hungarian'`` stemmer."""

    def setUp(self):
        """Build the Hungarian stemmer used by the tests in this class."""
        self.stemmer = self.get_stemmer('hungarian')

    def test_stemWord(self):
        """A two-word string with accented letters gives the expected stem."""
        # Input and expected stem are written as UTF-8 bytes and decoded
        # to text before the stemmer is called.
        encoded_word = b'Fut\xc3\xa1s k\xc3\xb6zben'
        encoded_stem = b'Fut\xc3\xa1s k\xc3\xb6z'
        word = encoded_word.decode('utf-8')
        stem = encoded_stem.decode('utf-8')

        stemmed = self.stemmer.stemWord(word)
        self.assertEqual(stemmed, stem)