File: __init__.py

package info (click to toggle)
pycangjie 1.2-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,440 kB
  • ctags: 214
  • sloc: sh: 11,238; makefile: 91; python: 89
file content (162 lines) | stat: -rw-r--r-- 6,217 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Copyright (c) 2013 - The pycangjie authors
#
# This file is part of pycangjie, the Python bindings to libcangjie.
#
# pycangjie is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pycangjie is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pycangjie.  If not, see <http://www.gnu.org/licenses/>.


import itertools
import operator
import string
import subprocess
import unittest

import cangjie


class MetaTest(type):
    """Metaclass that attaches generated ``test_*`` methods to a test class.

    Ideally every potential Cangjie input code would get its own method
    (test_a(), test_b(), ...).  That is not practical: there are 12356630
    strings made of 1 to 5 lowercase ASCII letters, and generating that many
    methods got the test process killed by the OOM killer on a laptop with
    8 GB of RAM.

    Instead we rely on libcangjie's wildcard support and only generate
    26 + 26^2 = 702 methods: one per single letter, and one per
    "first letter * last letter" pair.

    Each generated method simply calls ``self.run_test(code)``, so classes
    using this metaclass must provide a ``run_test`` method.
    """
    def __init__(cls, name, bases, dct):
        super().__init__(name, bases, dct)

        letters = string.ascii_lowercase

        # The 26 one-letter codes come first, followed by the 676
        # two-letters-around-a-wildcard codes ("a*a", "a*b", ..., "z*z").
        wildcard_codes = ('*'.join(pair)
                          for pair in itertools.product(letters, repeat=2))
        all_codes = itertools.chain(letters, wildcard_codes)

        def make_test(code):
            # A factory is needed so that each generated method captures its
            # own input code rather than the loop variable.
            def func(self):
                return self.run_test(code)
            return func

        # The wildcard is dropped from the method name: "a*b" -> test_ab
        for code in all_codes:
            method_name = "test_%s" % code.replace("*", "")
            setattr(cls, method_name, make_test(code))


class BaseTestCase(unittest.TestCase):
    """Common scaffolding shared by all our unit tests.

    This class reads ``self.cli_options`` and ``self.version`` but does not
    define them; presumably concrete test classes provide them.
    """
    def __init__(self, name):
        super().__init__(name)

        # Command line of the reference tool; the input code gets appended
        # to it for each test.
        cli_binary = "/usr/bin/libcangjie_cli"
        self.cli_cmd = [cli_binary] + self.cli_options

        # Combine the character filters used for every test.
        flt = cangjie.filters
        self.language = (flt.BIG5 | flt.HKSCS | flt.PUNCTUATION |
                         flt.CHINESE | flt.ZHUYIN | flt.KANJI |
                         flt.KATAKANA | flt.HIRAGANA | flt.SYMBOLS)

    def setUp(self):
        # A fresh Cangjie object for each test
        self.cj = cangjie.Cangjie(self.version, self.language)

    def tearDown(self):
        del self.cj

    def run_command(self, cmd):
        """Run a command and return its stdout as text

        The exit status is passed to cangjie.errors.handle_error_code(). If
        that raises CangjieNoCharsError, an empty string is returned; any
        other exception it raises propagates to the caller.

        Output lines which cannot be decoded as UTF-8 are dropped.
        """
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # stderr is captured but not used
        raw_out, _ = process.communicate()

        status = process.returncode
        message = "Unknown error while running libcangjie_cli (%d)" % status

        try:
            cangjie.errors.handle_error_code(status, msg=message)
        except cangjie.errors.CangjieNoCharsError:
            return ""

        try:
            return raw_out.decode("utf-8")
        except UnicodeDecodeError:
            # Python's 'utf-8' codec trips over b"\xed\xa1\x9d\xed\xbc\xb2",
            # but according to [1] and [2], it is a valid sequence of 2 chars:
            #     U+D85D    \xed\xa1\x9d
            #     U+DF32    \xed\xbc\xb2
            # [1] http://www.utf8-chartable.de/unicode-utf8-table.pl?start=55389&utf8=string-literal
            # [2] http://www.utf8-chartable.de/unicode-utf8-table.pl?start=57138&utf8=string-literal
            # TODO: Investigate this further, and eventually open a bug report
            #
            # Until then, decode line by line and skip the offending lines.
            decoded_lines = []
            for raw_line in raw_out.split(b"\n"):
                try:
                    text = raw_line.decode("utf-8")
                except UnicodeDecodeError:
                    continue
                decoded_lines.append(text)
            return "\n".join(decoded_lines)

    def run_test(self, input_code):
        """Compare pycangjie with libcangjie_cli for one input code

        The output of the libcangjie_cli tool is used as the reference, and
        compared with what pycangjie returns for the same input code.

        The reasoning is that if pycangjie gives the same results as a C++
        tool compiled against libcangjie, then pycangjie wraps libcangjie
        properly.

        Whether those results are valid is not checked here; that is up to
        libcangjie itself.

        Be aware that the reference is obtained by scraping the output of
        libcangjie_cli, which is quite fragile.
        """
        sort_key = operator.attrgetter('chchar', 'code')

        # Build the reference list of CangjieChar from libcangjie_cli
        cli_output = self.run_command(self.cli_cmd + [input_code])

        expected = []
        for raw_line in cli_output.split("\n"):
            line = raw_line.strip(" \n")
            if not line:
                continue

            # Each line has 3 comma-separated fields, and in each of them
            # the value is the last space-separated token.
            char_field, code_field, freq_field = line.split(", ")

            chchar = char_field.split(" ")[-1]
            code = code_field.split(" ")[-1].strip("'")
            frequency = int(freq_field.split(" ")[-1])

            reference = cangjie._core.CangjieChar(chchar.encode("utf-8"),
                                                  code.encode("utf-8"),
                                                  frequency)
            expected.append(reference)

        expected.sort(key=sort_key)

        try:
            # Now see what pycangjie has to say about the same input code
            results = sorted(self.cj.get_characters(input_code),
                             key=sort_key)

            self.assertEqual(results, expected)

        except cangjie.errors.CangjieNoCharsError:
            # pycangjie found nothing, so the reference must be empty too
            self.assertEqual(len(expected), 0)