File: conll.R

package info (click to toggle)
r-cran-nlp 0.1-9-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 376 kB
  • sloc: makefile: 1
file content (112 lines) | stat: -rw-r--r-- 2,743 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
## Read a three-column, whitespace-separated CoNLL-style file (word, POS
## tag, chunk tag; sentences separated by blank lines) into a
## CoNLLTextDocument object.
##
## Arguments:
##   con       passed to scan() as its input (e.g., a connection or a
##             file name).
##   encoding  passed to scan() as its 'encoding' argument.
##   meta      a list stored unchanged as the 'meta' element.
##
## Value:
##   A list with elements 'content' and 'meta' and class
##   c("CoNLLTextDocument", "TextDocument").  'content' is a data frame
##   with one row per token and columns 'sent' (integer sentence id,
##   numbered consecutively from 1), 'word', 'POS' and 'chunk_tag'.
CoNLLTextDocument <-
function(con, encoding = "unknown", meta = list())
{
    ## <NOTE>
    ## Could make the fields controllable, e.g.
    ##   CoNLLTextDocument(con, fields = c("word", "POS", "chunk_tag"))
    ## and use "" for something to be skipped.
    ## </NOTE>

    ## fill = TRUE together with blank.lines.skip = FALSE turns every
    ## blank input line into a record of empty strings, which is what
    ## marks the sentence boundaries below.
    ## na.strings is emptied so that a literal "NA" token is kept as
    ## text instead of being read as a missing value.
    records <- scan(con, what = list("", "", ""), encoding = encoding,
                    quote = NULL, quiet = TRUE, fill = TRUE,
                    blank.lines.skip = FALSE,
                    na.strings = character())
    words <- records[[1L]]
    is_blank <- words == ""

    ## A token opens a new sentence if it is the very first record or
    ## directly follows a blank record.  Counting these openers (rather
    ## than the blank records themselves) keeps the sentence ids
    ## consecutive even with leading or repeated blank lines.
    after_blank <- c(TRUE, is_blank)[seq_along(is_blank)]
    opens_sentence <- !is_blank & after_blank

    tokens <- data.frame(sent = cumsum(opens_sentence),
                         word = words,
                         POS = records[[2L]],
                         chunk_tag = records[[3L]],
                         stringsAsFactors = FALSE)

    ## Drop the separator records; only real tokens remain.
    doc <- list(content = tokens[!is_blank, ],
                meta = meta)
    class(doc) <- c("CoNLLTextDocument", "TextDocument")
    doc
}

## Format method for CoNLLTextDocument objects.
##
## Arguments:
##   x    an object whose 'content' element is a data frame with a
##        'sent' column (one row per word).
##   ...  ignored.
##
## Value:
##   The result of .format_TextDocument(x) (defined elsewhere in the
##   package) followed by one line giving the number of words and the
##   number of sentences.
format.CoNLLTextDocument <-
function(x, ...)
{
    content <- x$content
    n_words <- NROW(content)
    ## Count distinct sentence ids instead of reading the id in the last
    ## row: this also works for a document without any words (where
    ## there is no last row to index) and when the ids are not
    ## consecutive from 1.
    n_sents <- length(unique(content[["sent"]]))
    c(.format_TextDocument(x),
      sprintf("Content:  words: %d, sents: %d",
              n_words,
              n_sents))
}

## print.CoNLLTextDocument <-
## function(x, ...)
## {
##     content <- x$content
##     nr <- NROW(content)
##     writeLines(sprintf("<<CoNLLTextDocument (words: %d, sents: %d)>>",
##                        nr, content[[nr, "sent"]]))
##     invisible(x)
## }

## Content accessor: returns whatever is stored in the 'content' element
## of the document object 'x'.
content.CoNLLTextDocument <-
function(x)
{
    stored <- x$content
    stored
}

## meta.CoNLLTextDocument <-
## function(x, tag = NULL, ...)
##     if(is.null(tag)) x$meta else x$meta[[tag]]

## `meta<-.CoNLLTextDocument` <-
## function(x, tag = NULL, ..., value)
## {
##     if(is.null(tag))
##         x$meta <- value
##     else
##         x$meta[[tag]] <- value
##     x
## }

## Both the as.character() and the words() method return the 'word'
## column of the content table, i.e. all words of the document in
## order.  The '...' arguments are ignored.
as.character.CoNLLTextDocument <-
words.CoNLLTextDocument <-
function(x, ...)
{
    tokens <- x$content
    tokens[["word"]]
}

## Sentences of the document: the 'word' column of the content table
## split by the 'sent' column, giving a list with one vector of words
## per sentence id.  The '...' arguments are ignored.
sents.CoNLLTextDocument <-
function(x, ...)
{
    tokens <- x$content
    sentence_id <- tokens[["sent"]]
    split(tokens[["word"]], sentence_id)
}

## Words of the document paired with their POS tags.
##
## If 'map' is not NULL, the document is first passed through
## .map_POS_tags_CoNLLTextDocument(x, map).  The 'word' and 'POS'
## columns are then handed to Tagged_Token() (defined elsewhere in the
## package), whose result is returned.  The '...' arguments are ignored.
tagged_words.CoNLLTextDocument <-
function(x, map = NULL, ...)
{
    doc <- if(is.null(map)) x else .map_POS_tags_CoNLLTextDocument(x, map)
    tokens <- doc$content
    Tagged_Token(tokens[["word"]], tokens[["POS"]])
}

## Sentences of the document as tagged words.
##
## If 'map' is not NULL, the document is first passed through
## .map_POS_tags_CoNLLTextDocument(x, map).  The 'word' and 'POS'
## columns are combined by Tagged_Token() (defined elsewhere in the
## package) and the result is split by the 'sent' column.  The '...'
## arguments are ignored.
tagged_sents.CoNLLTextDocument <-
function(x, map = NULL, ...)
{
    doc <- if(is.null(map)) x else .map_POS_tags_CoNLLTextDocument(x, map)
    tokens <- doc$content
    tagged <- Tagged_Token(tokens[["word"]], tokens[["POS"]])
    split(tagged, tokens[["sent"]])
}

## Sentences of the document as chunk trees.
##
## The 'word', 'POS' and 'chunk_tag' columns of the content table are
## each split by the 'sent' column, and chunk_tree_from_chunk_info()
## (defined elsewhere in the package) is called once per sentence with
## that sentence's words, POS tags and chunk tags, in this order.
## Returns the list of these results.  The '...' arguments are ignored.
chunked_sents.CoNLLTextDocument <-
function(x, ...)
{
    tokens <- x$content
    ## Split all three columns by the same sentence ids in one go.
    per_sentence <- lapply(tokens[c("word", "POS", "chunk_tag")],
                           split, tokens[["sent"]])
    Map(chunk_tree_from_chunk_info,
        per_sentence[[1L]],
        per_sentence[[2L]],
        per_sentence[[3L]])
}

## Internal helper: replace the POS column of a document by mapped tags.
##
## POS_tag_mapper() (defined elsewhere in the package) is called with
## 'map' and the "POS_tagset" metadata of 'x'; the function it returns
## is applied to the 'POS' column of the content table.  Returns 'x'
## with that column replaced.
.map_POS_tags_CoNLLTextDocument <-
function(x, map)
{
    tagset <- meta(x, "POS_tagset")
    mapper <- POS_tag_mapper(map, tagset)
    original_tags <- x$content$POS
    x$content$POS <- mapper(original_tags)
    x
}