1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
|
# FIXME: generateDesign will NOT work if there are dependencies
# over multiple levels of params and one states the dependency only
# w.r.t. the "last" param. Also see Daniel's unit test.
# It works as long as all dependencies are stated; we need to at least document this.
# FIXME: it really makes no sense to calculate the distance for params that are NA
# when we do the design and augment it, right? Think about what happens here.
#' @title Generates a statistical design for a parameter set.
#'
#' @description
#' The following types of columns are created:
#' \tabular{ll}{
#' numeric(vector) \tab `numeric` \cr
#' integer(vector) \tab `integer` \cr
#' discrete(vector) \tab `factor` (names of values = levels) \cr
#' logical(vector) \tab `logical`
#' }
#' If you want to convert these, look at [BBmisc::convertDataFrameCols()].
#' Dependent parameters whose constraints are unsatisfied generate `NA` entries in their
#' respective columns.
#' For discrete vectors the levels and their order will be preserved, even if not all levels are present.
#'
#' Currently only lhs designs are supported.
#'
#' The algorithm currently iterates the following steps:
#' \enumerate{
#' \item{We create a space filling design for all parameters, disregarding `requires`,
#' a `trafo` or the forbidden region.}
#' \item{Forbidden points are removed.}
#' \item{Parameters are trafoed (potentially, depending on the setting of argument `trafo`);
#' dependent parameters whose constraints are unsatisfied are set to `NA` entries.}
#' \item{Duplicated design points are removed. Duplicated points are not generated in a
#' reasonable space-filling design, but the way discrete parameters and also parameter dependencies
#' are handled makes this possible.}
#' \item{If we removed some points, we now try to augment the design in a space-filling way
#' and iterate.}
#' }
#'
#' Note that augmenting currently is somewhat experimental as we simply generate
#' missing points via new calls to [lhs::randomLHS()], but do not add points so
#' they are maximally far away from the already present ones. The reason is that
#' the latter is quite hard to achieve with complicated dependencies and
#' forbidden regions, if one wants to ensure that points actually get added...
#' But we are working on it.
#'
#' Note that if you have trafos attached to your params, the complete creation
#' of the design (except for the detection of invalid parameters w.r.t. their
#' `requires` setting) takes place on the UNTRANSFORMED scale. So this function
#' creates, e.g., a maximin LHS design on the UNTRANSFORMED scale, but not
#' necessarily the transformed scale.
#'
#' `generateDesign` will NOT work if there are dependencies over multiple levels
#' of parameters and the dependency is only given with respect to the
#' \dQuote{previous} parameter. A current workaround is to state all
#' dependencies on all parameters involved. (We are working on it.)
#'
#' @template arg_gendes_n
#' @template arg_parset
#' @param fun (`function`)\cr
#' Function from package lhs.
#' Possible are: [lhs::maximinLHS()], [lhs::randomLHS()],
#' [lhs::geneticLHS()], [lhs::improvedLHS()], [lhs::optAugmentLHS()],
#' [lhs::optimumLHS()]
#' Default is [lhs::randomLHS()].
#' @param fun.args (`list`)\cr
#' List of further arguments passed to `fun`.
#' @template arg_trafo
#' @param augment (`integer(1)`)\cr
#' Duplicated values and forbidden regions in the parameter space can lead to
#' the design becoming smaller than `n`. With this option it is possible to
#' augment the design again to size `n`. It is not guaranteed that this always
#' works (to full size) and `augment` specifies the number of tries to
#' augment. If the design is of size less than `n` after all tries, a
#' warning is issued and the smaller design is returned. Default is 20.
#' @template ret_gendes_df
#' @export
#' @examples
#' ps = makeParamSet(
#' makeNumericParam("x1", lower = -2, upper = 1),
#' makeIntegerParam("x2", lower = 10, upper = 20)
#' )
#' # random latin hypercube design with 5 samples:
#' generateDesign(5, ps)
#'
#' # with trafo
#' ps = makeParamSet(
#' makeNumericParam("x", lower = -2, upper = 1),
#' makeNumericVectorParam("y", len = 2, lower = 0, upper = 1, trafo = function(x) x / sum(x))
#' )
#' generateDesign(10, ps, trafo = TRUE)
generateDesign = function(n = 10L, par.set, fun, fun.args = list(), trafo = FALSE, augment = 20L) {
  ### argument checks
  n = asInt(n)
  # the basic checks hand back the bounds that are used for scaling below
  z = doBasicGenDesignChecks(par.set)
  lower = z$lower
  upper = z$upper

  requirePackages("lhs", why = "generateDesign", default.method = "load")
  if (missing(fun)) {
    fun = lhs::randomLHS
  } else {
    assertFunction(fun)
  }
  assertList(fun.args)
  assertFlag(trafo)
  augment = asInt(augment, lower = 0L)

  ### precompute some useful stuff
  pars = par.set$pars
  lens = getParamLengths(par.set)
  # k = number of design columns (vector params contribute one column per component)
  k = sum(lens)
  pids = getParamIds(par.set, repeated = TRUE, with.nr = TRUE)
  # computed once here and handed to setRequiresToNA() in every iteration
  par.ids.each = lapply(pars, getParamIds, repeated = TRUE, with.nr = TRUE)
  par.nas.each = lapply(pars, getParamNA, repeated = FALSE)
  req.vectorized = determineReqVectorized(pars)
  # one slot per design column, pre-filled with NA, then filled from lower / upper
  lower2 = setNames(rep(NA_real_, k), pids)
  lower2 = insert(lower2, lower)
  upper2 = setNames(rep(NA_real_, k), pids)
  upper2 = insert(upper2, upper)
  values = getParamSetValues(par.set)
  types.df = getParamTypes(par.set, df.cols = TRUE)
  # columns reported as "factor" are generated as character vectors in the loop;
  # fixDesignFactors() is applied to the final result (presumably restoring factors)
  types.df[types.df == "factor"] = "character"

  nmissing = n

  # result objects
  res = data.frame()
  des = matrix(nrow = 0, ncol = k)

  # Always run at least one pass: augment = 0L is accepted by the check above,
  # but an empty loop would leave `res` without columns and the colnames
  # assignment after the loop would fail. For augment >= 1 the number of
  # passes is unchanged.
  for (iter in seq_len(max(augment, 1L))) {
    ### get design, types converted, trafos, conditionals set to NA
    # create new design or augment if we already have some points
    newdes = if (nmissing == n) {
      do.call(fun, insert(list(n = nmissing, k = k), fun.args))
    } else {
      lhs::randomLHS(nmissing, k = k)
    }

    # taken and adapted from individual Param Objects in mlr-org/paradox
    # maps column i of the unit-cube design `newdes` onto the value range of
    # the i-th design column, depending on its type
    getMapping = function(i) {
      if (types.df[i] == "numeric") {
        # Numeric: linear rescaling to [lower, upper]
        newdes[, i] * (upper2[i] - lower2[i]) + lower2[i]
      } else if (types.df[i] == "integer") {
        # Integer: split the unit interval into (upper - lower + 1) bins;
        # the factor (1 - 1e-16) keeps a value of exactly 1 inside the last bin
        as.integer(floor(newdes[, i] * ((upper2[i] - lower2[i]) + 1L) * (1 - 1e-16)) + lower2[i])
      } else if (types.df[i] == "logical") {
        # Logic
        newdes[, i] < 0.5
      } else if (types.df[i] == "character") {
        # Discrete: pick one of the values by bin index
        values[[i]][floor(newdes[, i] * length(values[[i]]) * (1 - 1e-16)) + 1]
      } else {
        stopf("%s for Param %s is an unsupported type.", types.df[i], pids[i])
      }
    }
    newres = mapDfc(seq_along(pids), getMapping)
    colnames(newres) = pids

    # check each row if forbidden, then remove
    if (hasForbidden(par.set)) {
      # FIXME: this is pretty slow, but correct
      fb = unlist(lapply(dfRowsToList(newres, par.set = par.set), function(x) {
        isForbidden(x, par.set = par.set)
      }))
      # keep the data.frame and the raw design matrix row-aligned
      newres = newres[!fb, , drop = FALSE]
      newdes = newdes[!fb, , drop = FALSE]
    }

    if (trafo) {
      newres = applyTrafos(newres, pars)
    }
    newres = setRequiresToNA(newres, pars, par.ids.each, par.nas.each, req.vectorized)

    # add to result (design matrix and data.frame)
    des = rbind(des, newdes)
    res = rbind(res, newres)

    # remove duplicates (detected on the data.frame, dropped from both objects)
    to.remove = duplicated(res)
    des = des[!to.remove, , drop = FALSE]
    res = res[!to.remove, , drop = FALSE]
    nmissing = n - nrow(res)

    # Enough points? We are done!
    if (nmissing == 0L) {
      break
    }
  }

  if (nrow(res) < n) {
    warningf("generateDesign could only produce %i points instead of %i!", nrow(res), n)
  }

  colnames(res) = pids
  res = fixDesignFactors(res, par.set)
  # record on the result whether the trafos were applied
  attr(res, "trafo") = trafo
  return(res)
}
# Applies the trafo of every parameter that has one to its design columns.
# @param res data.frame()
#   with columns named according to getParamIds(par, repeated = TRUE, with.nr = TRUE)
#   (so multiple columns for vector params)
# @param pars list()
#   the ps$pars part of a param set
# @value data.frame()
#   `res` with the columns of all params that carry a trafo replaced
applyTrafos = function(res, pars) {
  row.idx = seq_len(nrow(res))
  for (p in pars) {
    trafo.fun = p$trafo
    # params without a trafo keep their columns untouched
    if (is.null(trafo.fun)) {
      next
    }
    cols = getParamIds(p, repeated = TRUE, with.nr = TRUE)
    if (p$len == 1) {
      # scalar param: the trafo is expected to work vectorized on the whole column
      res[, cols] = trafo.fun(res[, cols])
      next
    }
    # vector param: the trafo gets one design point (row) at a time
    for (r in row.idx) {
      res[r, cols] = trafo.fun(res[r, cols])
    }
  }
  res
}
# Determines, according to a simple heuristic, whether the requirement of each
# parameter can be evaluated vectorized (once for the whole design) instead of
# row by row.
# Heuristic: the deparsed requirement must not contain "&&", "||" or "(".
# @param pars list()
#   the ps$pars part of a param set
# @value logical named
#   TRUE for each param whose requirement can be evaluated vectorized
#   (params without a requirement yield TRUE)
determineReqVectorized = function(pars) {
  vapply(X = lapply(pars, function(p) p$requires), function(req) {
    # deparse() splits long expressions into several strings, so the matches
    # are collapsed with any() to always return exactly one logical
    !any(grepl(x = deparse(req), pattern = "\\|\\||&&|\\("))
  }, FUN.VALUE = logical(1))
}
# Sets values of params to NA if requirements are not evaluated to TRUE (rowwise).
# A requirement that evaluates to NA (e.g. because it refers to a column that
# is NA in that row) is treated as not met.
# NOTE(review): params are processed in the order of `pars`, so a requirement
# only sees NAs of params that were handled before it.
# @param res data.frame(n,m)
#   The design
# @param pars list()
#   the ps$pars part of a param set
# @param par.ids.each list()
#   the colnames that are used by each parameter (especially important for
#   vector params, otherwise it is just list(paramA = "paramA"))
# @param par.nas.each list()
#   the na type (e.g. NA_character_) that should be filled in if req is not met
#   (important so that we do not destroy the right column type)
# @param req.vectorized named logical()
#   TRUE for each param whose req can be evaluated vectorized
# @value data.frame()
setRequiresToNA = function(res, pars, par.ids.each = NULL, par.nas.each = NULL, req.vectorized = NULL) {
  # these values can be passed manually to make this function faster if it is
  # called multiple times because the single S3 function calls can sum up to
  # some seconds!
  if (is.null(par.ids.each)) {
    par.ids.each = lapply(pars, getParamIds, repeated = TRUE, with.nr = TRUE)
  }
  if (is.null(par.nas.each)) {
    par.nas.each = lapply(pars, getParamNA, repeated = FALSE)
  }
  if (is.null(req.vectorized)) {
    req.vectorized = determineReqVectorized(pars)
  }
  for (par in pars) {
    req = par$requires
    if (!is.null(req)) {
      # evaluate the requirement for every row; no additional local names are
      # introduced here because this frame is the enclosure of the evaluation
      if (req.vectorized[par$id]) {
        set.to.na = eval(req, res)
      } else {
        # unfortunately we allowed requirements to be not vectorized
        set.to.na = vapply(seq_len(nrow(res)), function(i) {
          eval(req, res[i, ])
        }, FUN.VALUE = logical(1))
      }
      # set rows to NA 1) where req does not evaluate to TRUE (NA counts as not
      # met; an NA in the row index would make the assignment below fail)
      # AND 2) where the row is not already NA
      set.to.na = is.na(set.to.na) | !set.to.na
      set.to.na = set.to.na & !is.na(res[[par.ids.each[[par$id]][1]]])
      res[set.to.na, par.ids.each[[par$id]]] = par.nas.each[[par$id]]
    }
  }
  res
}
|