1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
|
## Read a three-column CoNLL-style token file into a CoNLLTextDocument.
##
## Arguments:
##   con       what to read from; handed to scan() as its 'file' argument.
##             Each non-empty line is expected to carry three
##             whitespace-separated fields: word, POS tag and chunk tag.
##             Empty lines act as sentence separators.
##   encoding  handed to scan() as its 'encoding' argument.
##   meta      stored unchanged as the 'meta' element of the result.
##
## Value:
##   A list with elements 'content' (a data frame with columns 'sent',
##   'word', 'POS' and 'chunk_tag', one row per token) and 'meta', with
##   class c("CoNLLTextDocument", "TextDocument").
CoNLLTextDocument <-
function(con, encoding = "unknown", meta = list())
{
    ## <NOTE>
    ## Could make the fields controllable, e.g.
    ##   CoNLLTextDocument(con, fields = c("word", "POS", "chunk_tag"))
    ## and use "" for something to be skipped.
    ## </NOTE>

    ## na.strings = character() is essential: with scan()'s default of
    ## "NA", a literal token "NA" would be read as a missing value, which
    ## then poisons the sentence numbering (cumsum() propagates NA) and
    ## the row subsetting below.
    records <- scan(con, what = list("", "", ""), encoding = encoding,
                    quote = NULL, quiet = TRUE, fill = TRUE,
                    blank.lines.skip = FALSE,
                    na.strings = character())
    words <- records[[1L]]

    ## Empty input lines are kept (blank.lines.skip = FALSE) and padded
    ## with empty strings (fill = TRUE), so an empty word marks a
    ## sentence break.
    is_break <- words == ""

    ## Number the sentences by counting the breaks seen so far, then drop
    ## the separator rows themselves.
    tokens <- data.frame(sent = cumsum(is_break) + 1L,
                         word = words,
                         POS = records[[2L]],
                         chunk_tag = records[[3L]],
                         stringsAsFactors = FALSE)
    doc <- list(content = tokens[!is_break, ],
                meta = meta)
    class(doc) <- c("CoNLLTextDocument", "TextDocument")
    doc
}
## format() method: the lines produced by .format_TextDocument(x),
## followed by one line giving the number of words (rows of the content
## table) and the number of sentences.
##
## Sentences are counted as the number of distinct sentence ids rather
## than read off the last row: this also works for a document without
## any tokens (where there is no last row to index), and agrees with
## what splitting the words by sentence id yields when the ids do not
## run contiguously from 1 (e.g., leading or repeated empty lines in
## the input).
format.CoNLLTextDocument <-
function(x, ...)
{
    content <- x$content
    n_words <- NROW(content)
    n_sents <- length(unique(content$sent))
    c(.format_TextDocument(x),
      sprintf("Content: words: %d, sents: %d",
              n_words,
              n_sents))
}
## print.CoNLLTextDocument <-
## function(x, ...)
## {
## content <- x$content
## nr <- NROW(content)
## writeLines(sprintf("<<CoNLLTextDocument (words: %d, sents: %d)>>",
## nr, content[[nr, "sent"]]))
## invisible(x)
## }
## content() method: hand back the 'content' element of the document
## as is.
content.CoNLLTextDocument <-
function(x)
{
    x$content
}
## meta.CoNLLTextDocument <-
## function(x, tag = NULL, ...)
## if(is.null(tag)) x$meta else x$meta[[tag]]
## `meta<-.CoNLLTextDocument` <-
## function(x, tag = NULL, ..., value)
## {
## if(is.null(tag))
## x$meta <- value
## else
## x$meta[[tag]] <- value
## x
## }
## as.character() and words() methods (one shared definition): return
## the 'word' column of the document's content table.  Arguments in
## '...' are accepted but not used.
as.character.CoNLLTextDocument <-
words.CoNLLTextDocument <-
function(x, ...)
{
    tokens <- x$content
    tokens$word
}
## sents() method: the words of the document grouped by sentence id,
## i.e. a list (named by sentence id) with one vector of words per
## sentence.  Arguments in '...' are accepted but not used.
sents.CoNLLTextDocument <-
function(x, ...)
{
    tokens <- x$content
    split(tokens$word, tokens$sent)
}
## tagged_words() method: combine the 'word' and 'POS' columns of the
## content table via Tagged_Token().
##
## If 'map' is not NULL, the POS tags are first rewritten by
## .map_POS_tags_CoNLLTextDocument(x, map).  Arguments in '...' are
## accepted but not used.
tagged_words.CoNLLTextDocument <-
function(x, map = NULL, ...)
{
    tokens <- if(is.null(map))
        x$content
    else
        .map_POS_tags_CoNLLTextDocument(x, map)$content
    Tagged_Token(tokens$word, tokens$POS)
}
## tagged_sents() method: build Tagged_Token(word, POS) for all tokens
## and split the result by sentence id.
##
## If 'map' is not NULL, the POS tags are first rewritten by
## .map_POS_tags_CoNLLTextDocument(x, map).  Arguments in '...' are
## accepted but not used.
tagged_sents.CoNLLTextDocument <-
function(x, map = NULL, ...)
{
    tokens <- if(is.null(map))
        x$content
    else
        .map_POS_tags_CoNLLTextDocument(x, map)$content
    tagged <- Tagged_Token(tokens$word, tokens$POS)
    split(tagged, tokens$sent)
}
## chunked_sents() method: for each sentence, call
## chunk_tree_from_chunk_info() with that sentence's words, POS tags and
## chunk tags (in this order), and collect the results in a list.
## Arguments in '...' are accepted but not used.
chunked_sents.CoNLLTextDocument <-
function(x, ...)
{
    tokens <- x$content
    ids <- tokens$sent
    ## All three columns are split by the same sentence ids, so the
    ## pieces line up sentence by sentence.
    by_sent <- function(column) split(column, ids)
    Map(chunk_tree_from_chunk_info,
        by_sent(tokens$word),
        by_sent(tokens$POS),
        by_sent(tokens$chunk_tag))
}
## Internal helper: return 'x' with the POS column of its content table
## replaced by mapped tags.
##
## The mapping function is obtained from POS_tag_mapper(), which is
## given 'map' together with the document's "POS_tagset" metadata entry
## (presumably naming the tagset of the stored tags -- confirm against
## POS_tag_mapper()), and is then applied to the whole POS column.
.map_POS_tags_CoNLLTextDocument <-
function(x, map)
{
    tagset <- meta(x, "POS_tagset")
    mapper <- POS_tag_mapper(map, tagset)
    x$content$POS <- mapper(x$content$POS)
    x
}
|