File: generate_models.R

package info (click to toggle)
xgboost 3.0.4-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 13,848 kB
  • sloc: cpp: 67,603; python: 35,537; java: 4,676; ansic: 1,426; sh: 1,352; xml: 1,226; makefile: 204; javascript: 19
file content (105 lines) | stat: -rw-r--r-- 3,846 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Script to generate reference models. The reference models are used to test backward compatibility
# of saved model files from XGBoost version 0.90 and 1.0.x.
library(xgboost)
library(Matrix)

# Fixed seed so the generated reference models are reproducible across runs.
set.seed(0)
# Shared dimensions and hyperparameters consumed by every generator below.
metadata <- list(
  kRounds = 2,     # boosting rounds per model
  kRows = 1000,    # training rows
  kCols = 4,       # feature columns
  kForests = 2,    # trees per round (num_parallel_tree)
  kMaxDepth = 2,   # maximum tree depth
  kClasses = 3     # class count for the multi-class model
)
# Sparse random feature matrix shared by all generators.
X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows,
            ncol = metadata$kCols, sparse = TRUE)
# Per-row instance weights (used by the logistic and multi-class generators).
w <- runif(metadata$kRows)

# Installed xgboost version; embedded in every output file name.
version <- packageVersion('xgboost')
# All reference models are written under this directory.
target_dir <- 'models'

save_booster <- function(booster, model_name) {
  # Persist `booster` under `target_dir` in every supported serialization
  # format, embedding the xgboost version in the file name:
  #   <target_dir>/xgboost-<version>.<model_name>.<ext>
  # Formats written: native binary (.bin), R serialization (.rds), and JSON
  # (.json) — JSON only when the installed xgboost is >= 1.0.0, which is the
  # first release that supports it.
  model_path <- function(ext) {
    file.path(target_dir, paste0('xgboost-', version, '.', model_name, '.', ext))
  }
  xgb.save(booster, model_path('bin'))
  saveRDS(booster, model_path('rds'))
  if (version >= '1.0.0') {
    xgb.save(booster, model_path('json'))
  }
}

generate_regression_model <- function() {
  # Train and save a reference regression model (default squared-error
  # objective) on Gaussian random labels.
  print('Regression')
  labels <- rnorm(metadata$kRows)

  dtrain <- xgb.DMatrix(X, label = labels, nthread = 1)
  train_params <- list(
    tree_method = 'hist',
    num_parallel_tree = metadata$kForests,
    max_depth = metadata$kMaxDepth
  )
  save_booster(xgb.train(train_params, dtrain, nrounds = metadata$kRounds), 'reg')
}

generate_logistic_model <- function() {
  # Train and save reference binary classifiers, one per logistic-family
  # objective ('binary:logistic' and 'binary:logitraw'), with per-row
  # instance weights.
  print('Binary classification with logistic loss')
  y <- sample(0:1, size = metadata$kRows, replace = TRUE)
  stopifnot(max(y) == 1, min(y) == 0)

  objective <- c('binary:logistic', 'binary:logitraw')
  name <- c('logit', 'logitraw')

  # The training data is identical for every objective; build the DMatrix
  # once instead of re-creating it on each loop iteration.
  data <- xgb.DMatrix(X, label = y, weight = w, nthread = 1)
  for (i in seq_along(objective)) {
    params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
                   max_depth = metadata$kMaxDepth, objective = objective[i])
    booster <- xgb.train(params, data, nrounds = metadata$kRounds)
    save_booster(booster, name[i])
  }
}

generate_classification_model <- function() {
  # Train and save a reference multi-class (softmax) model on uniformly
  # sampled class labels, with per-row instance weights.
  print('Multi-class classification')
  class_labels <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE)
  stopifnot(min(class_labels) == 0, max(class_labels) == metadata$kClasses - 1)

  dtrain <- xgb.DMatrix(X, label = class_labels, weight = w, nthread = 1)
  train_params <- list(
    objective = 'multi:softmax',
    num_class = metadata$kClasses,
    tree_method = 'hist',
    num_parallel_tree = metadata$kForests,
    max_depth = metadata$kMaxDepth
  )
  save_booster(xgb.train(train_params, dtrain, nrounds = metadata$kRounds), 'cls')
}

generate_ranking_model <- function() {
  # Train and save a reference learning-to-rank (rank:ndcg) model:
  # 20 query groups of 50 rows each, with one random weight per group.
  print('Learning to rank')
  relevance <- sample(0:4, size = metadata$kRows, replace = TRUE)
  stopifnot(min(relevance) == 0, max(relevance) == 4)
  n_groups <- 20
  group_weights <- runif(n_groups)
  group_sizes <- rep(50, times = n_groups)

  dtrain <- xgb.DMatrix(X, label = relevance, group = group_sizes, nthread = 1)
  # setinfo(dtrain, 'weight', group_weights)
  # ^^^ does not work in version <= 1.1.0; see https://github.com/dmlc/xgboost/issues/5942
  # So call low-level function XGDMatrixSetInfo_R directly. Since this function is not an exported
  # symbol, use the triple-colon operator.
  .Call(xgboost:::XGDMatrixSetInfo_R, dtrain, 'weight', as.numeric(group_weights))
  train_params <- list(objective = 'rank:ndcg', tree_method = 'hist',
                       num_parallel_tree = metadata$kForests,
                       max_depth = metadata$kMaxDepth)
  save_booster(xgb.train(train_params, dtrain, nrounds = metadata$kRounds), 'ltr')
}

# Create the output directory only if it is missing, so re-running the
# script does not emit a "'models' already exists" warning, then generate
# every reference model.
if (!dir.exists(target_dir)) {
  dir.create(target_dir)
}

invisible(generate_regression_model())
invisible(generate_logistic_model())
invisible(generate_classification_model())
invisible(generate_ranking_model())