File: Problem.R

package info (click to toggle)
r-cran-batchtools 0.9.15%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,416 kB
  • sloc: ansic: 172; sh: 156; makefile: 2
file content (133 lines) | stat: -rw-r--r-- 5,751 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#' @title Define Problems for Experiments
#'
#' @description
#' Problems may consist of up to two parts: A static, immutable part (\code{data} in \code{addProblem})
#' and a dynamic, stochastic part (\code{fun} in \code{addProblem}).
#' For example, for statistical learning problems a data frame would be the static problem part while
#' a resampling function would be the stochastic part which creates a problem instance.
#' This instance is then typically passed to a learning algorithm like a wrapper around a statistical model
#' (\code{fun} in \code{\link{addAlgorithm}}).
#'
#' This function serializes all components to the file system and registers the problem in the \code{\link{ExperimentRegistry}}.
#'
#' \code{removeProblems} removes the specified problems together with all jobs in the registry which depend on them.
#' \code{reg$problems} holds the IDs of already defined problems.
#'
#' @param name [\code{character(1)}]\cr
#'   Unique identifier for the problem.
#'   \code{removeProblems} also accepts a character vector of several problem identifiers.
#' @param data [\code{ANY}]\cr
#'   Static problem part. Default is \code{NULL}.
#' @param fun [\code{function}]\cr
#'   The function defining the stochastic problem part.
#'   The static part is passed to this function with name \dQuote{data} and the \code{\link{Job}}/\code{\link{Experiment}}
#'   is passed as \dQuote{job}.
#'   Therefore, your function must have the formal arguments \dQuote{job} and \dQuote{data} (or dots \code{...}).
#'   If you do not provide a function, it defaults to a function which just returns the data part.
#' @param seed [\code{integer(1)}]\cr
#'   Start seed for this problem. This allows the \dQuote{synchronization} of a stochastic
#'   problem across algorithms, so that different algorithms are evaluated on the same stochastic instance.
#'   If the problem seed is defined, the seeding mechanism works as follows:
#'   (1) Before the dynamic part of a problem is instantiated,
#'   the seed of the problem + [replication number] - 1 is set, i.e. the first
#'   replication uses the problem seed. (2) The stochastic part of the problem is
#'   instantiated. (3) From now on the usual experiment seed of the registry is used,
#'   see \code{\link{ExperimentRegistry}}.
#'   If \code{seed} is set to \code{NULL} (default), the job seed is used to instantiate the problem and
#'   different algorithms see different stochastic instances of the same problem.
#' @param cache [\code{logical(1)}]\cr
#'   If \code{TRUE} and \code{seed} is set, problem instances will be cached on the file system.
#'   This assumes that each problem instance is deterministic for each combination of hyperparameter setting
#'   and each replication number.
#'   This feature is experimental.
#' @template expreg
#' @return [\code{Problem}]. Object of class \dQuote{Problem} (invisibly).
#' @aliases Problem
#' @seealso \code{\link{Algorithm}}, \code{\link{addExperiments}}
#' @export
#' @examples
#' \dontshow{ batchtools:::example_push_temp(1) }
#' tmp = makeExperimentRegistry(file.dir = NA, make.default = FALSE)
#' addProblem("p1", fun = function(job, data) data, reg = tmp)
#' addProblem("p2", fun = function(job, data) job, reg = tmp)
#' addAlgorithm("a1", fun = function(job, data, instance) instance, reg = tmp)
#' addExperiments(repls = 2, reg = tmp)
#'
#' # List problems, algorithms and job parameters:
#' tmp$problems
#' tmp$algorithms
#' getJobPars(reg = tmp)
#'
#' # Remove one problem
#' removeProblems("p1", reg = tmp)
#'
#' # List problems and algorithms:
#' tmp$problems
#' tmp$algorithms
#' getJobPars(reg = tmp)
addProblem = function(name, data = NULL, fun = NULL, seed = NULL, cache = FALSE, reg = getDefaultRegistry()) {
  # Validates the arguments, writes the problem (name, seed, cache flag, data
  # and fun) to the registry's problem file, records the name in
  # `reg$problems`, resets the problem's cache directory and returns the
  # "Problem" object invisibly.
  assertRegistry(reg, class = "ExperimentRegistry", writeable = TRUE)
  assertString(name, min.chars = 1L)
  # Only alphanumeric characters, underscore, dot and hyphen are accepted.
  if (!stri_detect_regex(name, "^[[:alnum:]_.-]+$"))
    stopf("Illegal characters in problem name: %s", name)

  if (is.null(fun)) {
    # Default stochastic part: hand the static data through unchanged.
    fun = function(job, data, ...) data
    # A closure is serialized together with its enclosing environment. Left
    # as is, that environment would be this call frame, which holds `data`,
    # `reg` and (further down) `prob`, so all of them would end up in the
    # problem file a second time. The body only reads its own argument, so
    # the base environment is a sufficient enclosure.
    environment(fun) = baseenv()
  } else {
    # A user function needs either both named arguments or dots.
    assert(checkFunction(fun, args = c("job", "data")), checkFunction(fun, args = "..."))
  }

  if (is.null(seed)) {
    # Caching is only supported for seeded problems; ignore the flag otherwise.
    cache = FALSE
  } else {
    seed = asCount(seed, positive = TRUE)
    cache = assertFlag(cache)
  }

  info("Adding problem '%s'", name)
  prob = setClasses(list(name = name, seed = seed, cache = cache, data = data, fun = fun), "Problem")
  writeRDS(prob, file = getProblemURI(reg, name), compress = reg$compress)
  # union() keeps the name list free of duplicates when a problem is redefined.
  reg$problems = union(reg$problems, name)

  # Drop any cache directory left over from an earlier definition of this
  # problem, then recreate it empty if caching is enabled.
  cache.dir = getProblemCacheDir(reg, name)
  if (fs::dir_exists(cache.dir))
    fs::dir_delete(cache.dir)
  if (cache)
    fs::dir_create(cache.dir)

  saveRegistry(reg)
  invisible(prob)
}

#' @export
#' @rdname addProblem
removeProblems = function(name, reg = getDefaultRegistry()) {
  # For every problem in `name`: delete its problem file, drop its rows from
  # `reg$defs` and the matching rows from `reg$status`, remove the name from
  # `reg$problems` and delete its cache directory. Returns TRUE invisibly.
  assertRegistry(reg, class = "ExperimentRegistry", writeable = TRUE, running.ok = FALSE)
  assertCharacter(name, any.missing = FALSE)
  assertSubset(name, reg$problems)

  # Local placeholder for the symbol used unquoted in the subset of `reg$defs`
  # below (presumably resolved there as a column via non-standard evaluation).
  problem = NULL
  for (this.prob in name) {
    # Definitions of this problem and the jobs that refer to them.
    hit.defs = reg$defs[problem == this.prob, "def.id"]
    hit.jobs = filter(hit.defs, reg$status, "job.id")
    n.jobs = nrow(hit.jobs)

    info("Removing Problem '%s' and %i corresponding jobs ...", this.prob, n.jobs)
    file_remove(getProblemURI(reg, this.prob))
    reg$defs = reg$defs[!hit.defs]
    reg$status = reg$status[!hit.jobs]
    reg$problems = chsetdiff(reg$problems, this.prob)

    cache.dir = getProblemCacheDir(reg, this.prob)
    if (fs::dir_exists(cache.dir)) {
      fs::dir_delete(cache.dir)
    }
  }

  sweepRegistry(reg)
  invisible(TRUE)
}

getProblemURI = function(reg, name) {
  # Path of the file holding problem `name`: the mangled name below the
  # registry's "problems" directory. `dir` is called as dir(reg, what), so it
  # is presumably a package helper rather than base::dir.
  problems.dir = dir(reg, "problems")
  fs::path(problems.dir, mangle(name))
}

getProblemCacheDir = function(reg, name) {
  # Cache directory of problem `name`: <cache dir>/problems/<name encoded in
  # base32 without padding>.
  cache.root = dir(reg, "cache")
  encoded.name = base32_encode(name, use.padding = FALSE)
  fs::path(cache.root, "problems", encoded.name)
}

getProblemCacheURI = function(job) {
  # File for the cached instance of the job's problem. The file name is the
  # digest of problem name, problem parameters and replication number, placed
  # in the problem's cache directory (`job` is passed where a registry is
  # expected by getProblemCacheDir).
  cache.dir = getProblemCacheDir(job, job$prob.name)
  instance.key = digest(list(job$prob.name, job$prob.pars, job$repl))
  fs::path(cache.dir, sprintf("%s.rds", instance.key))
}