File: geodist.R

package info (click to toggle)
r-cran-recipes 0.1.15%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,496 kB
  • sloc: sh: 37; makefile: 2
file content (186 lines) | stat: -rw-r--r-- 5,443 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#' Distance between two locations
#'
#' `step_geodist` creates a *specification* of a
#'  recipe step that will calculate the distance between
#'  points on a map to a reference location.
#'
#' @inheritParams step_center
#' @param lon,lat Selector functions to choose which variables are
#'  affected by the step. See selections() for more details.
#' @param ref_lon,ref_lat Single numeric values for the location
#'  of the reference point.
#' @param role or model term created by this step, what analysis
#'  role should be assigned?. By default, the function assumes
#'  that resulting distance will be used as a predictor in a model.
#' @param log A logical: should the distance be transformed by
#'  the natural log function?
#' @param columns A character string of variable names that will
#'  be populated (eventually) by the `terms` argument.
#' @param name A single character value to use for the new
#'  predictor column. If a column exists with this name, an error is
#'  issued.
#' @return An updated version of `recipe` with the new step added
#'  to the sequence of existing steps (if any). For the `tidy`
#'  method, a tibble with columns echoing the values of `lat`,
#'  `lon`, `ref_lat`, `ref_lon`, `name`, and `id`.
#' @keywords datagen
#' @concept preprocessing
#' @export
#' @details `step_geodist` will create a
#'
#' @examples
#'
#' library(modeldata)
#' data(Smithsonian)
#'
#' # How close are the museums to Union Station?
#' near_station <- recipe( ~ ., data = Smithsonian) %>%
#'   update_role(name, new_role = "location") %>%
#'   step_geodist(lat = latitude, lon = longitude, log = FALSE,
#'                ref_lat = 38.8986312, ref_lon = -77.0062457) %>%
#'   prep(training = Smithsonian)
#'
#' bake(near_station, new_data = NULL) %>%
#'   arrange(geo_dist)
#'
#' tidy(near_station, number = 1)
step_geodist <- function(recipe,
                         lat = NULL,
                         lon = NULL,
                         role = "predictor",
                         trained = FALSE,
                         ref_lat = NULL,
                         ref_lon = NULL,
                         log = FALSE,
                         name = "geo_dist",
                         columns = NULL,
                         skip = FALSE,
                         id = rand_id("geodist")) {
  if (length(ref_lon) != 1 || !is.numeric(ref_lon))
    rlang::abort("`ref_lon` should be a single numeric value.")
  if (length(ref_lat) != 1 || !is.numeric(ref_lat))
    rlang::abort("`ref_lat` should be a single numeric value.")
  if (length(log) != 1 || !is.logical(log))
    rlang::abort("`log` should be a single logical value.")
  if (length(name) != 1 || !is.character(name))
    rlang::abort("`name` should be a single character value.")

  add_step(
    recipe,
    step_geodist_new(
      lon = enquos(lon),
      lat = enquos(lat),
      role = role,
      trained = trained,
      ref_lon = ref_lon,
      ref_lat = ref_lat,
      log = log,
      name = name,
      columns = columns,
      skip = skip,
      id = id
    )
  )
}

step_geodist_new <-
  function(lon, lat, role, trained, ref_lon, ref_lat,
           log, name, columns, skip, id) {
    step(
      subclass = "geodist",
      lon = lon,
      lat = lat,
      role = role,
      trained = trained,
      ref_lon = ref_lon,
      ref_lat = ref_lat,
      log = log,
      name = name,
      columns = columns,
      skip = skip,
      id = id
    )
  }


#' @export
prep.step_geodist <- function(x, training, info = NULL, ...) {
  lon_name <- eval_select_recipes(x$lon, training, info)
  lat_name <- eval_select_recipes(x$lat, training, info)

  if (length(lon_name) > 1)
    rlang::abort("`lon` should resolve to a single column name.")
  check_type(training[, lon_name])

  if (length(lat_name) > 1)
    rlang::abort("`lat` should resolve to a single column name.")
  check_type(training[, lat_name])

  if (any(names(training) == x$name))
    rlang::abort("'", x$name, "' is already used in the data.")

  step_geodist_new(
    lon = x$lon,
    lat = x$lat,
    role = x$role,
    trained = TRUE,
    ref_lon = x$ref_lon,
    ref_lat = x$ref_lat,
    log = x$log,
    name = x$name,
    columns = c(lat_name, lon_name),
    skip = x$skip,
    id = x$id
  )
}

geo_dist_calc <- function(x, a, b)
  apply(x, 1, function(x, a, b) sqrt((x[1] - a) ^ 2 + (x[2] - b) ^ 2),
        a = a, b = b)

#' @export
bake.step_geodist <- function(object, new_data, ...) {
  dist_vals <-
    geo_dist_calc(new_data[, object$columns], object$ref_lat, object$ref_lon)
  if (object$log) {
    new_data[, object$name] <- log(dist_vals)
  } else {
    new_data[, object$name] <- dist_vals
  }
  new_data
}

print.step_geodist <-
  function(x, width = max(20, options()$width - 30), ...) {
    cat("Geographical distances from",
        format(x$ref_lat, digits = 10), "x",
        format(x$ref_lon, digits = 10), "\n")
    invisible(x)
  }



#' @rdname step_geodist
#' @param x A `step_geodist` object.
#' @export
tidy.step_geodist <- function(x, ...) {
  if (is_trained(x)) {
    res <- tibble(
      latitude = x$columns[1],
      longitude = x$columns[2],
      ref_latitude = x$ref_lat,
      ref_longitude = x$ref_lon,
      name = x$name
    )
  } else {
    res <- tibble(
      latitude = sel2char(x$lat),
      longitude = sel2char(x$lon),
      ref_latitude = x$ref_lat,
      ref_longitude = x$ref_lon,
      name = x$name
    )
  }
  res$id <- x$id
  res
}