File: maxDissim.Rd

package info (click to toggle)
r-cran-caret 6.0-81-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 7,268 kB
  • sloc: ansic: 208; sh: 10; makefile: 2
file content (121 lines) | stat: -rw-r--r-- 3,228 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/maxDissim.R
\name{maxDissim}
\alias{maxDissim}
\alias{minDiss}
\alias{sumDiss}
\title{Maximum Dissimilarity Sampling}
\usage{
maxDissim(a, b, n = 2, obj = minDiss, useNames = FALSE,
  randomFrac = 1, verbose = FALSE, ...)

minDiss(u)

sumDiss(u)
}
\arguments{
\item{a}{a matrix or data frame of samples to start}

\item{b}{a matrix or data frame of samples to sample from}

\item{n}{the size of the sub-sample}

\item{obj}{an objective function to measure overall dissimilarity}

\item{useNames}{a logical: should the function return the row names (as
opposed to the row index)}

\item{randomFrac}{a number in (0, 1] that can be used to sub-sample from the
remaining candidate values}

\item{verbose}{a logical; should each step be printed?}

\item{\dots}{optional arguments to pass to dist}

\item{u}{a vector of dissimilarities}
}
\value{
a vector of integers or row names (depending on \code{useNames})
corresponding to the rows of \code{b} that comprise the sub-sample.
}
\description{
Functions to create a sub-sample by maximizing the dissimilarity between new
samples and the existing subset.
}
\details{
Given an initial set of m samples and a larger pool of n samples, this
function iteratively adds points to the smaller set by finding which of the n
samples is most dissimilar to the initial set. The argument \code{obj}
measures the overall dissimilarity between the initial set and a candidate
point. For example, maximizing the minimum or the sum of the m
dissimilarities are two common approaches.

This algorithm tends to select points on the edge of the data mainstream and
will reliably select outliers. To select more samples towards the interior
of the data set, set \code{randomFrac} to be small (see the examples below).
}
\examples{


## Simulate 200 bivariate normal points, start from a random subset of 15,
## add 9 more from the remaining pool with maxDissim() and plot the result.
##
##   pct  value passed to maxDissim() as randomFrac
##   obj  objective function passed to maxDissim() as obj
##   ...  further arguments forwarded to maxDissim()
##
## Called for its side effect (a scatter plot on the active device):
## unselected pool points are grey, the starting points are solid black and
## the added points are drawn as dark red digits giving their selection order.
example <- function(pct = 1, obj = minDiss, ...) {
  tmp <- matrix(rnorm(200 * 2), nrow = 200)

  ## start with 15 data points
  start <- sample(seq_len(nrow(tmp)), 15)
  base <- tmp[start, ]
  pool <- tmp[-start, ]

  ## select 9 for addition; the returned indices refer to rows of pool,
  ## not to rows of tmp
  newSamp <- maxDissim(
    base, pool,
    n = 9,
    randomFrac = pct, obj = obj, ...
  )

  ## background: pool rows that were not selected (indexed within pool, the
  ## same index space as newSamp); axis limits still span the full data set
  unselected <- setdiff(seq_len(nrow(pool)), newSamp)
  plot(
    pool[unselected, , drop = FALSE],
    xlim = extendrange(tmp[, 1]), ylim = extendrange(tmp[, 2]),
    col = "darkgrey",
    xlab = "variable 1", ylab = "variable 2"
  )
  points(base, pch = 16, cex = .7)

  ## label each added point with its selection order
  for (i in seq_along(newSamp)) {
    points(
      pool[newSamp[i], 1],
      pool[newSamp[i], 2],
      pch = paste(i), col = "darkred"
    )
  }
}

## Draw the four scenarios on a 2 x 2 grid. par() returns the previous
## settings, which are kept so the layout can be restored afterwards.
oldPar <- par(mfrow = c(2, 2))

## The same seed is used for every panel so that each one starts from the
## same simulated data and the same initial subset.
set.seed(414)
example(1, minDiss)
title("No Random Sampling, Min Score")

set.seed(414)
example(.1, minDiss)
title("10 Pct Random Sampling, Min Score")

set.seed(414)
example(1, sumDiss)
title("No Random Sampling, Sum Score")

set.seed(414)
example(.1, sumDiss)
title("10 Pct Random Sampling, Sum Score")

## put the plotting layout back the way it was
par(oldPar)

}
\references{
Willett, P. (1999), "Dissimilarity-Based Algorithms for
Selecting Structurally Diverse Sets of Compounds," \emph{Journal of
Computational Biology}, 6, 447-457.
}
\seealso{
\code{\link{dist}}
}
\author{
Max Kuhn \email{max.kuhn@pfizer.com}
}
\keyword{utilities}