File: lexRank.Rd

package info (click to toggle)
r-cran-lexrankr 0.5.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 340 kB
  • sloc: cpp: 30; sh: 17; ansic: 14; makefile: 2
file content (57 lines) | stat: -rw-r--r-- 4,621 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lexRank.R
\name{lexRank}
\alias{lexRank}
\title{Extractive text summarization with LexRank}
\usage{
lexRank(text, docId = "create", threshold = 0.2, n = 3,
  returnTies = TRUE, usePageRank = TRUE, damping = 0.85,
  continuous = FALSE, sentencesAsDocs = FALSE, removePunc = TRUE,
  removeNum = TRUE, toLower = TRUE, stemWords = TRUE,
  rmStopWords = TRUE, Verbose = TRUE)
}
\arguments{
\item{text}{A character vector of documents to be cleaned and processed by the LexRank algorithm}

\item{docId}{A vector of document IDs with length equal to the length of \code{text}.  If \code{docId == "create"} then doc IDs will be created as an index from 1 to \code{n}, where \code{n} is the length of \code{text}.}

\item{threshold}{The minimum similarity value a sentence pair must have to be represented in the graph where lexRank is calculated.}

\item{n}{The number of sentences to return as the extractive summary.  The function will return the top \code{n} lexRanked sentences.  See \code{returnTies} for handling ties in lexRank.}

\item{returnTies}{\code{TRUE} or \code{FALSE} indicating whether or not to return more than \code{n} sentence IDs if there is a tie in lexRank.  If \code{TRUE}, the returned number of sentences will not be limited to \code{n}, but rather will return every sentence with a top \code{n} score.  If \code{FALSE}, the returned number of sentences will be \code{<=n}. Defaults to \code{TRUE}.}

\item{usePageRank}{\code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.}

\item{damping}{The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.}

\item{continuous}{\code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.}

\item{sentencesAsDocs}{\code{TRUE} or \code{FALSE}, indicating whether or not to treat sentences as documents when calculating tfidf scores for similarity. If \code{TRUE}, inverse document frequency will be calculated as inverse sentence frequency (useful for single document extractive summarization).}

\item{removePunc}{\code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from text while tokenizing.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.}

\item{removeNum}{\code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from text while tokenizing.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.}

\item{toLower}{\code{TRUE} or \code{FALSE} indicating whether or not to coerce all of text to lowercase while tokenizing.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.}

\item{stemWords}{\code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the resulting tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.}

\item{rmStopWords}{\code{TRUE}, \code{FALSE}, or character vector of stopwords to remove from tokens. If \code{TRUE}, words in \code{lexRankr::smart_stopwords} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.}

\item{Verbose}{\code{TRUE} or \code{FALSE} indicating whether or not to \code{cat} progress messages to the console while running.  Defaults to \code{TRUE}.}
}
\value{
A 2 column dataframe with columns \code{sentenceId} and \code{value}. \code{sentenceId} contains the ids of the top \code{n} sentences in descending order by \code{value}. \code{value} contains the page rank score (if \code{usePageRank==TRUE}) or degree centrality (if \code{usePageRank==FALSE}).
}
\description{
Compute LexRanks from a vector of documents using the page rank algorithm or degree centrality. The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
}
\examples{
lexRank(c("This is a test.","Tests are fun.",
"Do you think the exam will be hard?","Is an exam the same as a test?",
"How many questions are going to be on the exam?"))
}
\references{
\url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
}