File: chunk.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/chunkIds.R
\name{chunk}
\alias{chunk}
\alias{lpt}
\alias{binpack}
\title{Chunk Jobs for Sequential Execution}
\usage{
chunk(x, n.chunks = NULL, chunk.size = NULL, shuffle = TRUE)

lpt(x, n.chunks = 1L)

binpack(x, chunk.size = max(x))
}
\arguments{
\item{x}{[\code{numeric}]\cr
For \code{chunk} an atomic vector (usually the \code{job.id}).
For \code{binpack} and \code{lpt}, the weights to group.}

\item{n.chunks}{[\code{integer(1)}]\cr
Requested number of chunks.
The function \code{chunk} distributes the elements of \code{x} evenly across the chunks, while
\code{lpt} tries to even out the sum of the elements in each chunk.
If more chunks than necessary are requested, empty chunks are ignored.
Mutually exclusive with \code{chunk.size}.}

\item{chunk.size}{[\code{integer(1)}]\cr
Requested size of each single chunk.
For \code{chunk} this is the number of elements of \code{x} per chunk; for \code{binpack} it is
the maximum sum of the values of \code{x} allowed per chunk.
Mutually exclusive with \code{n.chunks}.}

\item{shuffle}{[\code{logical(1)}]\cr
Shuffles the groups. Default is \code{TRUE}.}
}
\value{
[\code{integer}] giving the chunk number for each element of \code{x}.
}
\description{
Jobs can be partitioned into \dQuote{chunks} to be executed sequentially on the computational nodes.
Chunks are defined by providing a data frame with columns \dQuote{job.id} and \dQuote{chunk} (integer)
to \code{\link{submitJobs}}.
All jobs with the same chunk number will be grouped together on one node to form a single
computational job.

The function \code{chunk} simply splits \code{x} into either a fixed number of groups, or
into a variable number of groups with a fixed maximum number of elements.

The function \code{lpt} also groups \code{x} into a fixed number of chunks,
but uses the actual values of \code{x} in a greedy \dQuote{Longest Processing Time} algorithm:
elements are assigned in decreasing order to the chunk with the currently smallest sum.
As a result, the maximum sum of elements per chunk is approximately minimized.

\code{binpack} splits \code{x} into a variable number of groups whose sums of elements do
not exceed the upper limit provided by \code{chunk.size}.

See examples of \code{\link{estimateRuntimes}} for an application of \code{binpack} and \code{lpt}.
}
\examples{
\dontshow{ batchtools:::example_push_temp(2) }
ch = chunk(1:10, n.chunks = 2)
table(ch)
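
# Added illustration (not in the original example): if the length of x is not
# divisible by n.chunks, chunk() should still balance the counts, with chunk
# sizes differing by at most one element
table(chunk(1:10, n.chunks = 3))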

ch = chunk(rep(1, 10), chunk.size = 2)
table(ch)

set.seed(1)
x = runif(10)
ch = lpt(x, n.chunks = 2)
sapply(split(x, ch), sum)
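
# Added comparison (not in the original example): plain chunk() on the same
# weights balances element counts only, so the per-chunk sums are usually
# less even than with lpt()
ch = chunk(seq_along(x), n.chunks = 2, shuffle = FALSE)
sapply(split(x, ch), sum)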

set.seed(1)
x = runif(10)
ch = binpack(x, 1)
sapply(split(x, ch), sum)
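
# Added check (not in the original example): no chunk's sum may exceed the
# chunk.size limit of 1 used above
all(sapply(split(x, ch), sum) <= 1)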

# Job chunking
tmp = makeRegistry(file.dir = NA, make.default = FALSE)
ids = batchMap(identity, 1:25, reg = tmp)

### Group into chunks with 10 jobs each
library(data.table)
ids[, chunk := chunk(job.id, chunk.size = 10)]
print(ids[, .N, by = chunk])

### Group into 4 chunks
ids[, chunk := chunk(job.id, n.chunks = 4)]
print(ids[, .N, by = chunk])

### Submit to batch system
submitJobs(ids = ids, reg = tmp)

# Grouped chunking
tmp = makeExperimentRegistry(file.dir = NA, make.default = FALSE)
prob = addProblem(reg = tmp, "prob1", data = iris, fun = function(job, data) nrow(data))
prob = addProblem(reg = tmp, "prob2", data = Titanic, fun = function(job, data) nrow(data))
algo = addAlgorithm(reg = tmp, "algo", fun = function(job, data, instance, i, ...) instance)
prob.designs = list(prob1 = data.table(), prob2 = data.table(x = 1:2))
algo.designs = list(algo = data.table(i = 1:3))
addExperiments(prob.designs, algo.designs, repls = 3, reg = tmp)

### Group into chunks of 5 jobs, but do not put multiple problems into the same chunk
# -> only one problem has to be loaded per chunk, and only once because it is cached
ids = getJobTable(reg = tmp)[, .(job.id, problem, algorithm)]
ids[, chunk := chunk(job.id, chunk.size = 5), by = "problem"]
ids[, chunk := .GRP, by = c("problem", "chunk")]
dcast(ids, chunk ~ problem)
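
# Added sanity check (not in the original example): every chunk should contain
# jobs of exactly one problem
ids[, list(n.problems = uniqueN(problem)), by = chunk]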
}
\seealso{
\code{\link{estimateRuntimes}}
}