File: __init__.py

# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Functions for X{tokenizing}, i.e., dividing text strings into
substrings.
"""

from simple import *
from regexp import *
from punkt import *
from sexpr import *
from treebank import *
from nltk.internals import deprecated
import nltk

__all__ = ['WhitespaceTokenizer', 'SpaceTokenizer', 'TabTokenizer',
           'LineTokenizer', 'RegexpTokenizer', 'BlanklineTokenizer',
           'WordPunctTokenizer', 'WordTokenizer', 'blankline_tokenize',
           'wordpunct_tokenize', 'regexp_tokenize', 'word_tokenize',
           'SExprTokenizer', 'sexpr_tokenize', 'line_tokenize',
           'PunktWordTokenizer', 'PunktSentenceTokenizer',
           'TreebankWordTokenizer', 'sent_tokenize',
           ]

# Standard sentence tokenizer.
def sent_tokenize(text):
    """
    Use NLTK's currently recommended sentence tokenizer to tokenize
    sentences in the given text.  Currently, this uses
    L{PunktSentenceTokenizer}.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(text)
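# Illustrative usage (a sketch, assuming the Punkt English model has been
# installed locally, e.g. via nltk.download(); the exact sentence splits
# depend on the trained model, so the output below is indicative only):
#
#   >>> sent_tokenize("Hello world.  It's good to see you.  Thanks for buying this book.")
#   ['Hello world.', "It's good to see you.", 'Thanks for buying this book.']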
    
# Standard word tokenizer.
_word_tokenize = TreebankWordTokenizer().tokenize
def word_tokenize(text):
    """
    Use NLTK's currently recommended word tokenizer to tokenize words
    in the given sentence.  Currently, this uses
    L{TreebankWordTokenizer}.  This tokenizer should be fed a single
    sentence at a time.
    """
    return _word_tokenize(text)
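# Illustrative usage (a sketch; the Treebank tokenizer splits standard
# contractions and treats most punctuation as separate tokens, so the
# output below is indicative rather than guaranteed):
#
#   >>> word_tokenize("They'll save and invest more.")
#   ['They', "'ll", 'save', 'and', 'invest', 'more', '.']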

######################################################################
#{ Deprecated since 0.8
######################################################################

@deprecated("Use nltk.blankline_tokenize() or "
            "nltk.BlanklineTokenizer instead.")
def blankline(text):
    return BlanklineTokenizer().tokenize(text)

@deprecated("Use nltk.wordpunct_tokenize() or "
            "nltk.WordPunctTokenizer instead.")
def wordpunct(text):
    return WordPunctTokenizer().tokenize(text)

@deprecated("Use str.split() or nltk.WhitespaceTokenizer instead.")
def whitespace(text):
    return WhitespaceTokenizer().tokenize(text)

@deprecated("Use nltk.word_tokenize() or "
            "nltk.WordTokenizer instead.")
def word(text):
    return WordTokenizer().tokenize(text)

@deprecated("Use nltk.line_tokenize() or "
            "nltk.LineTokenizer instead.")
def line(text):
    return LineTokenizer().tokenize(text)

@deprecated("Use method of nltk.tokenize.PunktWordTokenizer "
            "instead.")
def punkt_word_tokenize(text):
    return PunktWordTokenizer().tokenize(text)

#}
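# Migration sketch for the deprecated helpers above (illustrative; each
# wrapper simply delegates to the tokenizer class named in its
# @deprecated message, so both calls below return the same tokens, but
# the first also emits a deprecation warning):
#
#   >>> wordpunct("Good muffins cost $3.88.")                      # deprecated
#   >>> WordPunctTokenizer().tokenize("Good muffins cost $3.88.")  # preferred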