File: data.py

package info (click to toggle)
mwic 0.7.10-3
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 784 kB
  • sloc: python: 1,169; sh: 73; makefile: 65
file content (116 lines) | stat: -rw-r--r-- 3,616 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Copyright © 2013-2018 Jakub Wilk <jwilk@jwilk.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

'''
collecting misspelling data
'''

import collections
import sys

class Occurrences:
    '''
    per-(word, line) record of positions and their certainties

    The public "certainty" attribute starts at 0 and is raised to the
    largest certainty passed to add() so far.
    '''

    def __init__(self):
        # maps (word, line) to a {position: certainty} dict
        self._data = collections.defaultdict(dict)
        self.certainty = 0

    def add(self, word, line, pos, certainty):
        '''
        record that word occurs in line at pos with the given certainty;
        pos is either a single int or an iterable of positions
        '''
        if isinstance(pos, int):
            pos = (pos,)
        key = (word, line)
        for offset in pos:
            # the entry is looked up inside the loop, so that an empty
            # iterable of positions does not create an empty entry
            self._data[key][offset] = certainty
        if certainty > self.certainty:
            self.certainty = certainty

    def count(self):
        '''
        return the total number of recorded positions
        '''
        return sum(map(len, self._data.values()))

    def __len__(self):
        '''
        return the number of distinct (word, line) pairs
        '''
        return len(self._data)

    def __iter__(self):
        '''
        yield (word, line, positions) triples,
        where positions is the {position: certainty} dict for that pair
        '''
        for key, positions in self._data.items():
            word, line = key
            yield word, line, positions

    @staticmethod
    def _sorting_key(item):
        '''
        sort key for (lcontext, word, rcontext) triples:
        right context first, then the left context read backwards
        (i.e. starting from the characters nearest to the word),
        then the word itself
        '''
        left, word, right = item
        reversed_left = left[::-1]
        return (right, reversed_left, word)

    def _context(self):
        '''
        yield an (lcontext, word, rcontext) triple for every recorded
        position: the part of the line before the position, the word,
        and the part of the line len(word) characters past the position
        '''
        for (word, line), positions in self._data.items():
            width = len(word)
            for start in positions:
                end = start + width
                yield line[:start], word, line[end:]

    def sorted_context(self):
        '''
        return the list of all context triples, ordered by _sorting_key()
        '''
        contexts = list(self._context())
        contexts.sort(key=self._sorting_key)
        return contexts

class Misspellings:
    '''
    collection of occurrences, indexed both by word and by line
    '''

    def __init__(self):
        # each index maps its key (a word or a line) to an Occurrences
        # object, created on first access
        self._word_index = collections.defaultdict(Occurrences)
        self._line_index = collections.defaultdict(Occurrences)

    def add(self, word, line, pos, certainty):
        '''
        register the occurrence in the word index, then in the line index
        '''
        # interned, so that equal strings share a single object
        word = sys.intern(word)
        line = sys.intern(line)
        targets = (
            (self._word_index, word),
            (self._line_index, line),
        )
        for index, key in targets:
            index[key].add(word, line, pos, certainty)

    @staticmethod
    def _sorting_key(*, reverse=False):
        '''
        return a key function for (string, occurrences) pairs

        By default the key orders by descending certainty, then by
        ascending count(), then by the string.  With reverse=True the
        certainty and count components are negated (ascending certainty,
        descending count); the string component is left as it is.
        '''
        direction = -1 if reverse else 1

        def key(pair):
            text, occurrences = pair
            by_certainty = direction * -occurrences.certainty
            by_count = direction * occurrences.count()
            return (by_certainty, by_count, text)

        return key

    def __bool__(self):
        '''
        true if the word index is non-empty
        '''
        return len(self._word_index) != 0

    def sorted_words(self, *, reverse=False):
        '''
        return the list of (word, occurrences) pairs,
        ordered by _sorting_key(reverse=reverse)
        '''
        key = self._sorting_key(reverse=reverse)
        return sorted(self._word_index.items(), key=key)

    def sorted_lines(self, *, reverse=False):
        '''
        return the list of (line, occurrences) pairs,
        ordered by _sorting_key(reverse=reverse)
        '''
        key = self._sorting_key(reverse=reverse)
        return sorted(self._line_index.items(), key=key)

# names exported by "from <this module> import *"
__all__ = [
    'Misspellings',
    'Occurrences',
]

# vim:ts=4 sts=4 sw=4 et