File: textprocessors.py

# -*- coding: utf8 -*-

###############################################################
#  PyNLPl - Text Processors
#   by Maarten van Gompel
#   Centre for Language Studies
#   Radboud University Nijmegen
#   http://www.github.com/proycon/pynlpl
#   proycon AT anaproy DOT nl
#
#       Licensed under GPLv3
#
# This is a Python library containing text processors
#
###############################################################


from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from pynlpl.common import isstring
import sys
if sys.version < '3':
    from codecs import getwriter
    stderr = getwriter('utf-8')(sys.stderr)
    stdout = getwriter('utf-8')(sys.stdout)
else:
    stderr = sys.stderr
    stdout = sys.stdout

import unicodedata
import string
import io
import array
import re
from itertools import permutations
from pynlpl.statistics import FrequencyList
from pynlpl.formats import folia
from pynlpl.algorithms import bytesize

WHITESPACE = [" ", "\t", "\n", "\r","\v","\f"]
EOSMARKERS = ('.','?','!','。',';','؟','?','!','।','։','՞','።','᙮','។','៕')
REGEXP_URL = re.compile(r"^(?:(?:https?):(?:(?://)|(?:\\\\))|www\.)(?:[\w\d:#@%/;$()~_?\+-=\\\.&](?:#!)?)*")
REGEXP_MAIL = re.compile(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+(?:\.[a-zA-Z]+)+") #email
TOKENIZERRULES = (REGEXP_URL, REGEXP_MAIL)


class Windower(object):
    """Moves a sliding window over a list of tokens, upon iteration in yields all n-grams of specified size in a tuple.

    Example without markers:

    >>> for ngram in Windower("This is a test .",3, None, None):
    ...     print(" ".join(ngram))
    This is a
    is a test
    a test .

    Example with default markers:

    >>> for ngram in Windower("This is a test .",3):
    ...     print(" ".join(ngram))
    <begin> <begin> This
    <begin> This is
    This is a
    is a test
    a test .
    test . <end>
    . <end> <end>
    """

    def __init__(self, tokens, n=1, beginmarker = "<begin>", endmarker = "<end>"):
        """
        Constructor for Windower

        :param tokens: The tokens to iterate over. Should be an iterable. Strings will be split on spaces automatically.
        :type tokens: iterable
        :param n: The size of the n-grams to extract
        :type n: integer
        :param beginmarker: The marker for the beginning of the sentence, defaults to "<begin>". Set to None if no markers are desired.
        :type beginmarker: string or None
        :param endmarker: The marker for the end of the sentence, defaults to "<end>". Set to None if no markers are desired.
        :type endmarker: string or None
        """


        if isinstance(tokens, str) or (sys.version < '3' and isinstance(tokens, unicode)):
            self.tokens = tuple(tokens.split())
        else:
            self.tokens = tuple(tokens)
        assert isinstance(n, int)
        self.n = n
        self.beginmarker = beginmarker
        self.endmarker = endmarker

    def __len__(self):
        """Returns the number of n-grams in the data (quick computation without iteration)

        Without markers:

        >>> len(Windower("This is a test .",3, None, None))
        3

        >>> len(Windower("This is a test .",2, None, None))
        4

        >>> len(Windower("This is a test .",1, None, None))
        5

        With default markers:

        >>> len(Windower("This is a test .",3))
        7

        """

        c = (len(self.tokens) - self.n) + 1
        if self.beginmarker: c += self.n-1
        if self.endmarker: c += self.n-1
        return c


    def __iter__(self):
        """Yields an n-gram (tuple) at each iteration"""
        l = len(self.tokens)

        if self.beginmarker:
            beginmarker = (self.beginmarker,) #tuple
        if self.endmarker:
            endmarker = (self.endmarker,) #tuple

        for i in range(-(self.n - 1), l):
            begin = i
            end = i + self.n
            if begin >= 0 and end <= l:
                yield tuple(self.tokens[begin:end])
            elif begin < 0 and end > l:
                if not self.beginmarker or not self.endmarker:
                    continue
                else:
                    yield tuple((-begin * beginmarker) + self.tokens + ((end - l) * endmarker))
            elif begin < 0:
                if not self.beginmarker:
                    continue
                else:
                    yield tuple((-begin * beginmarker) + self.tokens[0:end])
            elif end > l:
                if not self.endmarker:
                    continue
                else:
                    yield tuple(self.tokens[begin:] + ((end - l) * endmarker))

class MultiWindower(object):
    "Extract n-grams of various configurations from a sequence"

    def __init__(self,tokens, min_n = 1, max_n = 9, beginmarker=None, endmarker=None):
        if isinstance(tokens, str) or (sys.version < '3' and isinstance(tokens, unicode)):
            self.tokens = tuple(tokens.split())
        else:
            self.tokens = tuple(tokens)
        assert isinstance(min_n, int)
        assert isinstance(max_n, int)
        self.min_n = min_n
        self.max_n = max_n
        self.beginmarker = beginmarker
        self.endmarker = endmarker

    def __iter__(self):
        for n in range(self.min_n, self.max_n + 1):
            for ngram in Windower(self.tokens,n, self.beginmarker, self.endmarker):
                yield ngram
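
# A brief usage sketch, added for illustration (not part of the original
# library): MultiWindower simply chains Windower over every n-gram size in
# [min_n, max_n]:
#
#   >>> for ngram in MultiWindower("a b c", 1, 2):
#   ...     print(" ".join(ngram))
#   a
#   b
#   c
#   a b
#   b c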


class ReflowText(object):
    """Attempts to re-flow a text that has arbitrary line endings in it. Also undoes hyphenisation"""

    def __init__(self, stream, filternontext=True):
        self.stream = stream
        self.filternontext = filternontext

    def __iter__(self):
        eosmarkers = ('.',':','?','!','"',"'","„","”","’")
        emptyline = 0
        joinnext = False #set when a hyphen was stripped, so the next line joins without a space
        buffer = ""
        for line in self.stream:
            line = line.strip()
            if line:
                if emptyline:
                    if buffer:
                        yield buffer
                        yield ""
                        emptyline = 0
                        buffer = ""

                if buffer and not joinnext: buffer += ' '
                joinnext = False
                if line[-1] in eosmarkers:
                    buffer += line
                    yield buffer
                    buffer = ""
                    emptyline = 0
                elif len(line) > 2 and line[-1] == '-' and line[-2].isalpha():
                    #undo hyphenisation: drop the hyphen and join the next line directly, without a space
                    buffer += line[:-1]
                    joinnext = True
                else:
                    if self.filternontext:
                        hastext = any(c.isalpha() for c in line)
                    else:
                        hastext = True

                    if hastext:
                        buffer += line
            else:
                emptyline += 1

        if buffer:
            yield buffer

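# A brief usage sketch, added for illustration (not part of the original
# library): with the hyphen handling above, a hard-wrapped, hyphenated
# paragraph is re-joined into one line:
#
#   >>> import io
#   >>> for para in ReflowText(io.StringIO("This para-\ngraph is re-\nflowed.\n")):
#   ...     print(para)
#   This paragraph is reflowed.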


def calculate_overlap(haystack, needle, allowpartial=True):
    """Calculate the overlap between two sequences. Returns a list of (overlap, placement) tuples (a list, because there may be multiple overlaps!). The former is the part of the sequence that overlaps; the latter is -1 if the overlap is on the left side, 0 if it is a subset, 1 if it overlaps on the right side, and 2 if it is an identical match."""
    needle = tuple(needle)
    haystack = tuple(haystack)
    solutions = []

    #equality check
    if needle == haystack:
        return [(needle, 2)]

    if allowpartial:
        minl = 1
    else:
        minl = len(needle)

    for l in range(minl, min(len(needle), len(haystack)) + 1):
        #Search for overlap left (including partial overlap!)
        if needle[-l:] == haystack[:l]:
            solutions.append( (needle[-l:], -1) )
        #Search for overlap right (including partial overlap!)
        if needle[:l] == haystack[-l:]:
            solutions.append( (needle[:l], 1) )

    if len(needle) <= len(haystack):
        options = list(iter(Windower(haystack,len(needle),beginmarker=None,endmarker=None)))
        for option in options[1:-1]:
            if option == needle:
                solutions.append( (needle, 0) )

    return solutions
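
# A brief usage sketch, added for illustration (not part of the original
# library): here the needle's head ('c','d') overlaps the right edge of the
# haystack, so the placement value is 1:
#
#   >>> calculate_overlap(('a','b','c','d'), ('c','d','e'))
#   [(('c', 'd'), 1)]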




class Tokenizer(object):
    """A tokenizer and sentence splitter, which acts on a file/stream-like object and when iterating over the object it yields
    a lists of tokens (in case the sentence splitter is active (default)), or a token (if the sentence splitter is deactivated).
    """

    def __init__(self, stream, splitsentences=True, onesentenceperline=False, regexps=TOKENIZERRULES):
        """
        Constructor for Tokenizer

        :param stream: An iterable or file-object containing the data to tokenize
        :type stream: iterable or file-like object
        :param splitsentences: Enable sentence splitter? (default=_True_)
        :type splitsentences: bool
        :param onesentenceperline: Assume input has one sentence per line? (default=_False_)
        :type onesentenceperline: bool
        :param regexps: Regular expressions to use as tokeniser rules in tokenisation (default=_pynlpl.textprocessors.TOKENIZERRULES_)
        :type regexps:  Tuple/list of regular expressions to use in tokenisation
        """

        self.stream = stream
        self.regexps = regexps
        self.splitsentences=splitsentences
        self.onesentenceperline = onesentenceperline

    def __iter__(self):
        buffer = ""
        for line in self.stream:
            line = line.strip()
            if line:
                if buffer: buffer += "\n"
                buffer += line

            if (self.onesentenceperline or not line) and buffer:
                if self.splitsentences:
                    yield split_sentences(tokenize(buffer, self.regexps))
                else:
                    for token in tokenize(buffer, self.regexps):
                        yield token
                buffer = ""

        if buffer:
            if self.splitsentences:
                yield split_sentences(tokenize(buffer, self.regexps))
            else:
                for token in tokenize(buffer, self.regexps):
                    yield token
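
# A brief usage sketch, added for illustration (not part of the original
# library): with the sentence splitter disabled, the Tokenizer yields one
# token at a time:
#
#   >>> import io
#   >>> for token in Tokenizer(io.StringIO("Hello world.\n"), splitsentences=False):
#   ...     print(token)
#   Hello
#   world
#   .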




def tokenize(text, regexps=TOKENIZERRULES):
    """Tokenizes a string and returns a list of tokens

    :param text: The text to tokenise
    :type text: string
    :param regexps: Regular expressions to use as tokeniser rules in tokenisation (default=_pynlpl.textprocessors.TOKENIZERRULES_)
    :type regexps:  Tuple/list of regular expressions to use in tokenisation
    :rtype: Returns a list of tokens

    Examples:

    >>> for token in tokenize("This is a test."):
    ...    print(token)
    This
    is
    a
    test
    .

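    URLs and e-mail addresses matched by the default rules are kept intact as single tokens:

    >>> for token in tokenize("Visit http://example.com for more info."):
    ...    print(token)
    Visit
    http://example.com
    for
    more
    info
    .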

    """

    #compile any rules that were passed as plain strings; build a new list, since assigning into the default (immutable) tuple would fail
    regexps = [re.compile(regexp) if isstring(regexp) else regexp for regexp in regexps]

    tokens = []
    begin = 0
    for i, c in enumerate(text):
        if begin > i:
            continue
        elif i == begin:
            m = False
            for regexp in regexps:
                m = regexp.findall(text[i:i+300])
                if m:
                    tokens.append(m[0])
                    begin = i + len(m[0])
                    break
            if m: continue

        if c in string.punctuation or c in WHITESPACE:
            prev = text[i-1] if i > 0 else ""
            next = text[i+1] if i < len(text)-1 else ""

            if (c == '.' or c == ',') and prev.isdigit() and next.isdigit():
                #punctuation in between numbers, keep as one token
                pass
            elif (c == "'" or c == "`") and prev.isalpha() and next.isalpha():
                #quote in between chars, keep...
                pass
            elif c not in WHITESPACE and next == c: #group clusters of identical punctuation together
                continue
            elif c == '\r' and prev == '\n':
                #ignore
                begin = i+1
                continue
            else:
                token = text[begin:i]
                if token: tokens.append(token)

                if c not in WHITESPACE:
                    tokens.append(c) #anything but spaces and newlines (i.e. punctuation) counts as a token too
                begin = i + 1 #set the begin cursor

    if begin <= len(text) - 1:
        token = text[begin:]
        tokens.append(token)

    return tokens


def crude_tokenizer(text):
    """Alias for tokenize(), kept for backwards compatibility (it is not so crude anymore)"""
    return tokenize(text)

def tokenise(text, regexps=TOKENIZERRULES): #for the British
    """Alias for the British"""
    return tokenize(text, regexps)

def is_end_of_sentence(tokens, i):
    # is this an end-of-sentence marker? ... and is this either
    # the last token or the next token is NOT an end of sentence
    # marker as well? (to deal with ellipsis etc)
    return tokens[i] in EOSMARKERS and (i == len(tokens) - 1 or not tokens[i+1] in EOSMARKERS)

def split_sentences(tokens):
    """Split sentences (based on tokenised data); yields each sentence as a list of tokens"""
    begin = 0
    for i, token in enumerate(tokens):
        if is_end_of_sentence(tokens, i):
            yield tokens[begin:i+1]
            begin = i+1
    if begin <= len(tokens)-1:
        yield tokens[begin:]
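
# A brief usage sketch, added for illustration (not part of the original
# library): a run of end-of-sentence markers (e.g. an ellipsis) only triggers
# a split at the final marker:
#
#   >>> for sentence in split_sentences(['Wait', '.', '.', '.', 'Go', '!']):
#   ...     print(" ".join(sentence))
#   Wait . . .
#   Go !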



def strip_accents(s, encoding= 'utf-8'):
    """Strip diacritics from characters and return a flat ASCII representation"""
    if sys.version < '3':
        if isinstance(s,unicode):
           return unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore')
        else:
           return unicodedata.normalize('NFKD', unicode(s,encoding)).encode('ASCII', 'ignore')
    else:
        if isinstance(s,bytes): s = str(s,encoding)
        return str(unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore'),'ascii')
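
# A brief usage sketch, added for illustration (not part of the original
# library):
#
#   >>> print(strip_accents("café naïve"))
#   cafe naive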

def swap(tokens, maxdist=2):
    """Perform a swap operation on a sequence of tokens, exhaustively swapping all tokens up to the maximum specified distance. This is a subset of all permutations."""
    assert maxdist >= 2
    tokens = list(tokens)
    if maxdist > len(tokens):
        maxdist = len(tokens)
    l = len(tokens)
    for i in range(0,l - 1):
        for permutation in permutations(tokens[i:i+maxdist]):
            if permutation != tuple(tokens[i:i+maxdist]):
                newtokens = tokens[:i]
                newtokens += permutation
                newtokens += tokens[i+maxdist:]
                yield newtokens
        if maxdist == len(tokens):
            break
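
# A brief usage sketch, added for illustration (not part of the original
# library): with maxdist=2, each window of two adjacent tokens is permuted:
#
#   >>> for variant in swap(['a', 'b', 'c'], 2):
#   ...     print(" ".join(variant))
#   b a c
#   a c b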


def find_keyword_in_context(tokens, keyword, contextsize=1):
    """Find a keyword in a particular sequence of tokens, and yield the local context. Contextsize is the number of words to the left and right. The keyword may consist of multiple words, in which case it should be passed as a tuple or list"""
    if isinstance(keyword, tuple) or isinstance(keyword, list):
        keyword = tuple(keyword)
        l = len(keyword)
    else:
        keyword = (keyword,)
        l = 1
    n = l + contextsize * 2
    focuspos = contextsize #the keyword starts right after the left context
    for ngram in Windower(tokens, n, None, None):
        if ngram[focuspos:focuspos+l] == keyword:
            yield ngram[:focuspos], ngram[focuspos:focuspos+l], ngram[focuspos+l:]

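# A brief usage sketch, added for illustration (not part of the original
# library): with one token of context on each side, the keyword 'c' is
# reported together with its neighbours:
#
#   >>> for left, kw, right in find_keyword_in_context(['a', 'b', 'c', 'd'], 'c', 1):
#   ...     print(left, kw, right)
#   ('b',) ('c',) ('d',)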


if __name__ == "__main__":
    import doctest
    doctest.testmod()