File: tokenize.R

Package: r-cran-nlp 0.2-0-1

## Tokenizers break text up into words, phrases, symbols, or other
## meaningful elements called tokens, see e.g.
## <https://en.wikipedia.org/wiki/Tokenization_%28lexical_analysis%29>.
## This can be accomplished by returning the sequence of tokens, or the
## corresponding spans (character start and end positions).
## Apache OpenNLP provides a Tokenizer interface, with methods
## String[] tokenize() and Span[] tokenizePos() for the two variants.
## See e.g.
## <http://opennlp.apache.org/docs/1.5.3/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html>.
## NLTK provides an interface class nltk.tokenize.api.TokenizerI, for
## which subclasses must define a tokenize() method, and can define a
## span_tokenize() method.
## See e.g. <http://www.nltk.org/api/nltk.tokenize.html>.
## In R, this could be mimicked by having two generics for getting the
## tokens or spans, and a virtual Tokenizer class for which extension
## classes must provide methods for at least one of the generics.
## However, it seems more natural to have tokenizers be *functions*
## (instead of interface classes) which can be called directly (instead
## of calling the respective generics), and to have two "kinds" of such
## functions: token tokenizers and span tokenizers.  We use the class
## information to indicate the kind, which in turn makes it possible to
## provide a generic mechanism for mapping between the two kinds
## (straightforward when going from spans to tokens, doable for the
## opposite direction).  This also makes it possible to "extract" both
## kinds of tokenizers from suitable annotators or annotator pipelines.
## For now, there is no underlying virtual Tokenizer class.
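##
## A minimal usage sketch of this design (not run; 'simple_tokenizer'
## and the sample text are made up for illustration):
##
##   simple_tokenizer <- Token_Tokenizer(function(s)
##       regmatches(s, gregexpr("\\S+", s))[[1L]],
##       meta = list(description = "Splits on whitespace."))
##   simple_tokenizer("Hello world.")
##   ## => c("Hello", "world.")
##   st <- as.Span_Tokenizer(simple_tokenizer)
##   st("Hello world.")
##   ## => spans (1, 5) and (7, 12)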

### * Span tokenizers

Span_Tokenizer <-
function(f, meta = list())
{
    attr(f, "meta") <- meta
    class(f) <- "Span_Tokenizer"
    f
}

as.Span_Tokenizer <-
function(x, ...)
    UseMethod("as.Span_Tokenizer")

as.Span_Tokenizer.Span_Tokenizer <-
function(x, ...)
    x

## For now, pass metadata as is.
as.Span_Tokenizer.Token_Tokenizer <-
function(x, ...)
{
    f <- function(s) {
        s <- as.String(s)
        spans_from_tokens(s, x(s))
    }
    Span_Tokenizer(f, meta(x))
}

## For now, do not pass metadata.
as.Span_Tokenizer.Annotator <-
as.Span_Tokenizer.Annotator_Pipeline <-
function(x, type = "word", ...)
{
    f <- function(s) {
        a <- x(as.String(s))
        as.Span(a[a$type == type, ])
    }
    Span_Tokenizer(f)
}
        
is.Span_Tokenizer <-
function(x)
    inherits(x, "Span_Tokenizer")

format.Span_Tokenizer <-
function(x, ...)
{
    d <- meta(x, "description")
    if(is.null(d)) {
        "A span tokenizer."
    } else {
        c("A span tokenizer, with description",
          strwrap(d, indent = 2L, exdent = 2L))
    }
}

### * Token tokenizers

Token_Tokenizer <-
function(f, meta = list())    
{
    attr(f, "meta") <- meta
    class(f) <- "Token_Tokenizer"
    f
}

as.Token_Tokenizer <-
function(x, ...)
    UseMethod("as.Token_Tokenizer")

as.Token_Tokenizer.Token_Tokenizer <-
function(x, ...)
    x

## For now, pass metadata as is.
as.Token_Tokenizer.Span_Tokenizer <-
function(x, ...)
{
    f <- function(s) {
        s <- as.String(s)
        s[x(s)]
    }
    Token_Tokenizer(f, meta(x))
}
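
## e.g., going the other way (not run; uses whitespace_tokenizer as
## defined further below):
##
##   tt <- as.Token_Tokenizer(whitespace_tokenizer)
##   tt("A first example.")
##   ## => c("A", "first", "example.")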
    
## For now, do not pass metadata.
as.Token_Tokenizer.Annotator <-
as.Token_Tokenizer.Annotator_Pipeline <-
function(x, type = "word", ...)
{
    f <- function(s) {
        s <- as.String(s)
        a <- x(s)
        s[a[a$type == type, ]]
    }
    Token_Tokenizer(f)
}

is.Token_Tokenizer <-
function(x)
    inherits(x, "Token_Tokenizer")

format.Token_Tokenizer <-
function(x, ...)
{
    d <- meta(x, "description")
    if(is.null(d)) {
        "A token tokenizer."
    } else {
        c("A token tokenizer, with description",
          strwrap(d, indent = 2L, exdent = 2L))
    }
}   

### * Regexp span tokenizers a la NLTK.

Regexp_Tokenizer <-
function(pattern, invert = FALSE, ..., meta = list())
{
    force(pattern)
    args <- list(...)
    
    f <- if(invert) {
        ## Pattern gives the separators.
        function(s) {
            s <- as.String(s)
            if(is.na(s) || !nchar(s))
                stop("Need a non-empty string.")
            m <- do.call(gregexpr,
                         c(list(pattern = pattern, text = s), args))[[1L]]
            if((length(m) == 1L) && (m == -1L))
                return(Span(1L, nchar(s)))
            start <- c(1L, m + attr(m, "match.length"))
            end <- c(m - 1L, nchar(s))
            ind <- start <= end
            Span(start[ind], end[ind])
        }
    } else {
        ## Pattern gives the tokens.
        function(s) {
            s <- as.String(s)
            if(is.na(s) || !nchar(s))
                stop("Need a non-empty string.")
            m <- do.call(gregexpr,
                         c(list(pattern = pattern, text = s), args))[[1L]]
            Span(m, m + attr(m, "match.length") - 1L)
        }
    }

    Span_Tokenizer(f, meta)
}
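
## A hedged usage sketch (not run; 'csv_field_tokenizer' and the sample
## string are made up for illustration).  With invert = TRUE the pattern
## describes the separators rather than the tokens:
##
##   csv_field_tokenizer <- Regexp_Tokenizer(",\\s*", invert = TRUE)
##   csv_field_tokenizer("a, bb, c")
##   ## => spans (1, 1), (4, 5) and (8, 8), i.e., "a", "bb" and "c"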

whitespace_tokenizer <-
    Regexp_Tokenizer("\\s+",
                     invert = TRUE,
                     meta = list(description = "Divides strings into substrings by treating any sequence of whitespace characters as a separator."))

blankline_tokenizer <-
    Regexp_Tokenizer("\\s*\n\\s*\\n\\s*",
                     invert = TRUE,
                     meta = list(description = "Divides strings into substrings by treating any sequence of blank lines as a separator."))

wordpunct_tokenizer <-
    Regexp_Tokenizer("\\w+|[^\\w\\s]+",
                     perl = TRUE,
                     meta = list(description = "Divides strings into substrings of alphabetic and (non-whitespace) non-alphabetic characters."))

### * Utilities
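
## Compute the character spans of the given tokens within the string x
## by matching each token in turn from left to right (the tokens are
## assumed to occur in x in that order).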

spans_from_tokens <-
function(x, tokens)
{
    start <- end <- integer(length(tokens))
    off <- 0L
    for(i in seq_along(tokens)) {
        m <- regexpr(tokens[i], x, fixed = TRUE)
        pos <- m + attr(m, "match.length")
        x <- substring(x, pos)
        start[i] <- off + m
        end[i] <- off <- off + pos - 1L
    }
    Span(start, end)
}
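
## e.g. (not run; the strings are made up for illustration):
##
##   spans_from_tokens(as.String("a bb a"), c("a", "bb", "a"))
##   ## => spans (1, 1), (3, 4) and (6, 6)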