# autoencoder
# Enable auto-encoder for model building.
# use_all_factor_levels
# Logical. Use all factor levels of categorical variables. Otherwise the first factor level is omitted (without loss of accuracy). Useful for variable importances and auto-enabled for autoencoder.
# train_samples_per_iteration
# Number of training samples (globally) per MapReduce iteration. Special values are: 0 (one epoch), -1 (all available data, e.g. replicated training data), or -2 (auto-tuning, default)
# seed
# Seed for random numbers (affects sampling). Note: only reproducible when running single-threaded
# adaptive_rate
# Logical. Adaptive learning rate (ADADELTA)
# rho
# Adaptive learning rate time decay factor (similarity to prior updates)
# epsilon
# Adaptive learning rate parameter, similar to learning rate annealing during the initial training phase. Typical values are between 1.0e-10 and 1.0e-4
# rate
# Learning rate (higher => less stable, lower => slower convergence)
# rate_annealing
# Learning rate annealing: (rate)/(1 + rate_annealing*samples)
# rate_decay
# Learning rate decay factor between layers (N-th layer: rate*α^(N-1))
# momentum_start
# Initial momentum at the beginning of training (try 0.5)
# momentum_ramp
# Number of training samples for which momentum increases
# momentum_stable
# Final momentum after the ramp is over (try 0.99)
# nesterov_accelerated_gradient
# Logical. Use Nesterov accelerated gradient (recommended)
# input_dropout_ratio
# A fraction of the features for each training row to be omitted from training in order to improve generalization (dimension sampling).
# hidden_dropout_ratios
# Hidden layer dropout ratios (can improve generalization). Specify one value per hidden layer; defaults to 0.5
# l1
# L1 regularization (can add stability and improve generalization, cause many weights to become 0)
# l2
# L2 regularization (can add stability and improve generalization, causes many weights to be small)
# max_w2
# Constraint for squared sum of incoming weights per unit (e.g. Rectifier)
# initial_weight_distribution
# Can be "Uniform", "UniformAdaptive", or "Normal"
# initial_weight_scale
# Uniform: -value ... value, Normal: stddev
# loss
# Loss function: Automatic, CrossEntropy (for classification only), Quadratic, Absolute (experimental) or Huber (experimental)
# score_interval
# Shortest time interval (in secs) between model scoring
# score_training_samples
# Number of training set samples for scoring (0 for all)
# score_validation_samples
# Number of validation set samples for scoring (0 for all)
# score_duty_cycle
# Maximum duty cycle fraction for scoring (lower: more training, higher: more scoring)
# classification_stop
# Stopping criterion for classification error fraction on training data (-1 to disable)
# regression_stop
# Stopping criterion for regression error (MSE) on training data (-1 to disable)
# quiet_mode
# Enable quiet mode for less output to standard output
# max_confusion_matrix_size
# Max. size (number of classes) for confusion matrices to be shown
# max_hit_ratio_k
# Max number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)
# balance_classes
# Balance training data class counts via over/under-sampling (for imbalanced data)
# class_sampling_factors
# Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes.
# max_after_balance_size
# Maximum relative size of the training data after balancing class counts (can be less than 1.0)
# score_validation_sampling
# Method used to sample validation dataset for scoring
# diagnostics
# Enable diagnostics for hidden layers
# variable_importances
# Compute variable importances for input features (Gedeon method) - can be slow for large networks
# fast_mode
# Enable fast mode (minor approximations in back-propagation)
# ignore_const_cols
# Ignore constant columns (no information can be gained anyway)
# force_load_balance
# Force extra load balancing to increase training speed for small datasets (to keep all cores busy)
# replicate_training_data
# Replicate the entire training dataset onto every node for faster training
# single_node_mode
# Run on a single node for fine-tuning of model parameters
# shuffle_training_data
# Enable shuffling of training data (recommended if training data is replicated and train_samples_per_iteration is close to numRows*numNodes)
# col_major
# Use a column major weight matrix for the input layer. Can speed up forward propagation, but might slow down backpropagation (Experimental)
# average_activation
# Average activation for sparse auto-encoder (Experimental)
# sparsity_beta
# Sparsity regularization (Experimental)
# max_categorical_features
# Max. number of categorical features, enforced via hashing (Experimental)
# reproducible
# Force reproducibility on small data (will be slow - only uses 1 thread)
# export_weights_and_biases
# Whether to export Neural Network weights and biases to H2O Frames
# ...
# extra parameters to pass on to functions (not implemented)
# Details: https://leanpub.com/deeplearning/read
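#
# A minimal usage sketch (kept as a comment so nothing runs at package load):
# once the learner is registered, it can be constructed via mlr with any of the
# hyperparameters documented above. The parameter values below are illustrative
# assumptions, not recommended settings.
#   library(mlr)
#   lrn = makeLearner("regr.h2o.deeplearning",
#     hidden = c(100L, 100L), epochs = 20, input_dropout_ratio = 0.1, l2 = 1e-4)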
#' @export
makeRLearner.regr.h2o.deeplearning = function() {
  makeRLearnerRegr(
    cl = "regr.h2o.deeplearning",
    package = "h2o",
    par.set = makeParamSet(
      makeLogicalLearnerParam("autoencoder", default = FALSE),
      makeLogicalLearnerParam("use_all_factor_levels", default = TRUE),
      makeDiscreteLearnerParam("activation", values = c("Rectifier", "Tanh",
        "TanhWithDropout", "RectifierWithDropout", "Maxout", "MaxoutWithDropout"),
        default = "Rectifier"),
      # FIXME: hidden can also be a list of integer vectors for grid search
      makeIntegerVectorLearnerParam("hidden", default = c(200L, 200L),
        len = NA_integer_, lower = 1L),
      makeNumericLearnerParam("epochs", default = 10L, lower = 1), # doc says can be fractional
      makeNumericLearnerParam("train_samples_per_iteration", default = -2, lower = -2),
      makeIntegerLearnerParam("seed", tunable = FALSE),
      makeLogicalLearnerParam("adaptive_rate", default = TRUE),
      makeNumericLearnerParam("rho", default = 0.99, lower = 0), # is there an upper limit for this?
      makeNumericLearnerParam("epsilon", default = 1e-08, lower = 1e-10, upper = 1e-4),
      makeNumericLearnerParam("rate", default = 0.005, lower = 0, upper = 1),
      makeNumericLearnerParam("rate_annealing", default = 1e-06, lower = 0),
      makeNumericLearnerParam("rate_decay", default = 1, lower = 0),
      makeNumericLearnerParam("momentum_start", default = 0),
      makeNumericLearnerParam("momentum_ramp", default = 1e+06),
      makeNumericLearnerParam("momentum_stable", default = 0),
      makeLogicalLearnerParam("nesterov_accelerated_gradient", default = TRUE),
      makeNumericLearnerParam("input_dropout_ratio", default = 0),
      makeNumericVectorLearnerParam("hidden_dropout_ratios", default = 0.5),
      makeNumericLearnerParam("l1", default = 0),
      makeNumericLearnerParam("l2", default = 0),
      makeNumericLearnerParam("max_w2", default = Inf, allow.inf = TRUE),
      makeDiscreteLearnerParam("initial_weight_distribution",
        values = c("UniformAdaptive", "Uniform", "Normal"), default = "UniformAdaptive"),
      makeNumericLearnerParam("initial_weight_scale", default = 1),
      makeDiscreteLearnerParam("loss", values = c("Automatic", "Quadratic",
        "Absolute", "Huber")),
      makeDiscreteLearnerParam("distribution", values = c("AUTO", "gaussian",
        "poisson", "gamma", "tweedie", "laplace",
        "huber", "quantile"), default = "AUTO"),
      makeNumericLearnerParam("quantile_alpha", default = 0.5, lower = 0, upper = 1,
        requires = quote(distribution == "quantile")),
      makeNumericLearnerParam("tweedie_power", default = 1.5, lower = 1, upper = 2,
        requires = quote(distribution == "tweedie")),
      makeNumericLearnerParam("score_interval", default = 5),
      makeIntegerLearnerParam("score_training_samples", default = 10000),
      makeIntegerLearnerParam("score_validation_samples", default = 0),
      makeNumericLearnerParam("score_duty_cycle", default = 0.1),
      makeNumericLearnerParam("regression_stop", default = 1e-6, lower = -1),
      makeIntegerLearnerParam("stopping_rounds", default = 5L, lower = 0L),
      makeDiscreteLearnerParam("stopping_metric", values = c("AUTO", "deviance",
        "mse", "rmse", "mae", "rmsle", "r2"), default = "AUTO",
        requires = quote(stopping_rounds > 0L)),
      makeNumericLearnerParam("stopping_tolerance", default = 0, lower = 0),
      makeNumericLearnerParam("max_runtime_secs", default = 0, lower = 0),
      makeLogicalLearnerParam("quiet_mode", tunable = FALSE),
      makeLogicalLearnerParam("balance_classes", default = FALSE),
      makeNumericVectorLearnerParam("class_sampling_factors", requires = quote(balance_classes == TRUE)),
      makeNumericLearnerParam("max_after_balance_size", default = 5),
      makeDiscreteLearnerParam("score_validation_sampling", values = c("Uniform",
        "Stratified"), default = "Uniform"),
      makeLogicalLearnerParam("diagnostics", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam("variable_importances", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam("fast_mode", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam("ignore_const_cols", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam("force_load_balance", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam("replicate_training_data", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam("single_node_mode", default = FALSE, tunable = FALSE),
      makeLogicalLearnerParam("shuffle_training_data", tunable = FALSE),
      makeLogicalLearnerParam("sparse", default = FALSE, tunable = FALSE),
      makeLogicalLearnerParam("col_major", default = FALSE, tunable = FALSE),
      makeNumericLearnerParam("average_activation", default = 0, tunable = FALSE),
      # makeNumericLearnerParam("sparsity_beta", tunable = FALSE),
      makeLogicalLearnerParam("reproducible", default = FALSE, tunable = FALSE),
      makeLogicalLearnerParam("export_weights_and_biases", default = FALSE, tunable = FALSE)
    ),
    properties = c("numerics", "factors", "weights", "missings"),
    name = "h2o.deeplearning",
    short.name = "h2o.dl",
    note = 'The default value of `missing_values_handling` is `"MeanImputation"`, so missing values are automatically mean-imputed.',
    callees = "h2o.deeplearning"
  )
}
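# The registered hyperparameters can be inspected with mlr's usual tooling,
# e.g. (a commented sketch, assuming mlr is attached and this learner is loaded):
#   getParamSet(makeLearner("regr.h2o.deeplearning"))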
#' @export
trainLearner.regr.h2o.deeplearning = function(.learner, .task, .subset, .weights = NULL, ...) {
  # check if an h2o connection already exists, otherwise start one
  conn.up = tryCatch(h2o::h2o.getConnection(), error = function(err) {
    return(FALSE)
  })
  if (!inherits(conn.up, "H2OConnection")) {
    h2o::h2o.init()
  }
  y = getTaskTargetNames(.task)
  x = getTaskFeatureNames(.task)
  d = getTaskData(.task, subset = .subset)
  wcol = NULL
  if (!is.null(.weights)) {
    d$.mlr.weights = .weights
    wcol = ".mlr.weights"
  }
  h2of = h2o::as.h2o(d)
  h2o::h2o.deeplearning(y = y, x = x, training_frame = h2of, weights_column = wcol, ...)
}
#' @export
predictLearner.regr.h2o.deeplearning = function(.learner, .model, .newdata, ...) {
  m = .model$learner.model
  h2of = h2o::as.h2o(.newdata)
  p = h2o::h2o.predict(m, newdata = h2of, ...)
  p.df = as.data.frame(p)
  return(p.df$predict)
}
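# End-to-end sketch (commented out; assumes a reachable or locally startable
# H2O instance, and uses mlr's bundled Boston Housing task bh.task purely for
# illustration):
#   h2o::h2o.init()
#   lrn = makeLearner("regr.h2o.deeplearning", hidden = c(50L, 50L), epochs = 20)
#   mod = train(lrn, bh.task)
#   pred = predict(mod, task = bh.task)
#   performance(pred, measures = mse)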