1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
|
\name{duplicated}
\alias{duplicated}
\alias{duplicated.data.table}
\alias{unique}
\alias{unique.data.table}
\alias{anyDuplicated}
\alias{anyDuplicated.data.table}
\alias{uniqueN}
\title{ Determine Duplicate Rows }
\description{
\code{duplicated} returns a logical vector indicating which rows of a
\code{data.table} are duplicates of a row with smaller subscripts.
\code{unique} returns a \code{data.table} with duplicated rows removed, where
duplication is determined by the columns specified in the \code{by} argument.
When \code{by} is not supplied, rows duplicated across all columns are removed.
\code{anyDuplicated} returns the \emph{index} \code{i} of the first duplicated
entry if there is one, and 0 otherwise.
\code{uniqueN} is equivalent to \code{length(unique(x))} when \code{x} is an
\code{atomic vector}, and \code{nrow(unique(x))} when \code{x} is a \code{data.frame}
or \code{data.table}. The number of unique rows is computed directly without
materialising the intermediate unique data.table and is therefore faster and
more memory efficient.
}
\usage{
\method{duplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots)
\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots)
\method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots)
uniqueN(x, by=if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE)
}
\arguments{
\item{x}{ A data.table. \code{uniqueN} accepts atomic vectors and data.frames
as well.}
\item{\dots}{ Not used at this time. }
\item{incomparables}{ Not used. Here for S3 method consistency. }
\item{fromLast}{ logical indicating if duplication should be considered from
the reverse side, i.e., the last (or rightmost) of identical elements would
correspond to \code{duplicated = FALSE}.}
\item{by}{\code{character} or \code{integer} vector indicating which combinations
of columns from \code{x} to use for uniqueness checks. By default all columns
are used. This default was changed in version \code{1.9.8} for consistency with
the data.frame methods; in earlier versions the default was \code{key(x)}.}
\item{na.rm}{Logical (default is \code{FALSE}). Should missing values (including
\code{NaN}) be removed?}
}
\details{
Because data.tables are usually sorted by key, tests for duplication are
especially quick when only the keyed columns are considered. Unlike
\code{\link[base:unique]{unique.data.frame}}, \code{paste} is not used to ensure
equality of floating point data. It is instead accomplished directly and is
therefore quite fast. data.table provides \code{\link{setNumericRounding}} to
handle cases where limitations in floating point representation are undesirable.
\code{v1.9.4} introduced the \code{anyDuplicated} method for data.tables, which is
similar to the base function in functionality. It also implemented the logical
argument \code{fromLast} for all three functions, with default value \code{FALSE}.
}
\value{
\code{duplicated} returns a logical vector of length \code{nrow(x)}
indicating which rows are duplicates.
\code{unique} returns a data table with duplicated rows removed.
\code{anyDuplicated} returns an integer value with the index of the first duplicate.
If none exists, \code{0L} is returned.
\code{uniqueN} returns the number of unique elements in the vector,
\code{data.frame} or \code{data.table}.
}
\seealso{ \code{\link{setNumericRounding}}, \code{\link{data.table}},
\code{\link[base]{duplicated}}, \code{\link[base]{unique}}, \code{\link{all.equal}},
\code{\link{fsetdiff}}, \code{\link{funion}}, \code{\link{fintersect}},
\code{\link{fsetequal}}
}
\examples{
# Keyed example table: 12 rows, already ordered by the key columns A and B
DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3),
                 C = rep(1:2, 6), key = "A,B")
# by defaults to all columns
duplicated(DT)
unique(DT)
# restrict the uniqueness check to a single column
duplicated(DT, by="B")
unique(DT, by="B")
# ... or to a combination of columns
duplicated(DT, by=c("A", "C"))
unique(DT, by=c("A", "C"))
DT = data.table(a=c(2L,1L,2L), b=c(1L,2L,1L)) # no key
unique(DT) # rows 1 and 2 (row 3 is a duplicate of row 1)
DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6))
unique(DT) # rows 1,2 and 5
DT = data.table(a=tan(pi*(1/4 + 1:10)), b=rep(1,10)) # example from ?all.equal
length(unique(DT$a)) # 10 strictly unique floating point values
all.equal(DT$a,rep(1,10)) # TRUE, all within tolerance of 1.0
DT[,which.min(a)] # row 10, the strictly smallest floating point value
identical(unique(DT),DT[1]) # TRUE, stable within tolerance
identical(unique(DT),DT[10]) # FALSE
# fromLast=TRUE
DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3),
                 C = rep(1:2, 6), key = "A,B")
duplicated(DT, by="B", fromLast=TRUE)
unique(DT, by="B", fromLast=TRUE)
# anyDuplicated
anyDuplicated(DT, by=c("A", "B")) # 2L: row 2 repeats row 1 (A=1, B=1)
any(duplicated(DT, by=c("A", "B"))) # TRUE
# uniqueN, unique rows on key columns
uniqueN(DT, by = key(DT))
# uniqueN, unique rows on all columns
uniqueN(DT)
# uniqueN while grouped by "A"
DT[, .(uN=uniqueN(.SD)), by=A]
# uniqueN's na.rm=TRUE
# fixed input (rather than a random sample) so the counts below are reproducible
x = rep(c(NA, NaN, 0.25, 0.5, 0.75), 2)
uniqueN(x, na.rm = FALSE) # 5, default
uniqueN(x, na.rm=TRUE) # 3
}
\keyword{ data }
|