File: evaluation.R

package info (click to toggle)
r-cran-popepi 0.4.13%2Bdfsg-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 1,656 kB
sloc: sh: 13; makefile: 2
file content (712 lines) | stat: -rw-r--r-- 22,636 bytes
evalPopArg <- function(
  data,
  arg,
  n = 1L,
  DT = TRUE,
  enclos = NULL,
  recursive = TRUE,
  types = c("NULL","character", "list", "expression"),
  naming = c("DT", "model")
) {
  ## arg: an unevaluated AND substitute()'d argument within a function, which may be
  ## * an expression
  ## * a list of expressions
  ## * a character vector of variable names (in a given data set)
  ## n: steps upstream as in parent.frame(n); 0L refers to calling environment
  ## of evalPopArg, 1L to calling environment of e.g. sir which uses evalPopArg, etc.
  ## hence n = 1L should be almost always the right way to go.
  ## ALTERNATIVELY supply an environment by hand via enclos.
  ## enclos will override n.
  ## recursive: if TRUE, evals arg as many times as it is of type language.
  ## output:
  ## * vector as a result of an expression
  ## * list as a result of a list
  ## * character vector of names
  ## OR with DT = TRUE, a data.table based on aforementioned results.
  ## intention: output to be used in by argument of data.table.
  ## a data.table output is directly usable in by.
  ## if column names cannot be easily found, BV1, BV2, ... are imputed
  ## for missing names (unrobustly: such names may already exist, resulting in duplicates)

  ## naming: DT style uses first element of all.names() where
  ## a name has to be created; model style keeps the whole deparsed
  ## expression. Only applied when DT = TRUE
  naming <- match.arg(naming[1L], c("DT", "model"))

  ## types: allowed popArg types of arguments.
  types <- match.arg(types, c("NULL","character", "list", "expression", "formula"), several.ok = TRUE)

  if (!is.null(enclos) && !is.environment(enclos)) {
    stop("enclos must be NULL or an environment")
  }
  if (!is.environment(enclos)) enclos <- parent.frame(n + 1L)

  ## used data may change if expression uses dollar operator, hence
  ## arg should not be evaluated within data but only its surroundings.
  use_data <- data
  use_enc <- enclos
  dataNames <- names(data)

  if (uses_dollar(arg, data.names = dataNames)) {
    use_data <- enclos
    use_enc <- baseenv()
  }
  e <- eval(arg, envir = use_data, enclos = use_enc)
  if (is.language(e) && !inherits(e, "formula")) {
    if (!recursive) stop("arg is of type language after evaluating, and recursive = FALSE")

    tick <- 1L
    while (is.language(e) && !inherits(e, "formula") && tick < 100L) {
      arg <- e
      use_data <- data
      use_enc <- enclos
      if (uses_dollar(arg, data.names = dataNames)) {
        use_data <- enclos
        use_enc <- baseenv()
      }
      e <- eval(arg, envir = use_data, enclos = use_enc)
      tick <- tick + 1L
    }
    if (tick == 100L) stop("arg was of type language even after 100 evaluations. Something went wrong here...")



  }
  argType <- "NULL"
  if (is.list(e)) argType <- "list" else
    if (is.character(e)) argType <- "character" else
      if (mode(e) == "numeric" || is.vector(e) || is.factor(e)) argType <- "expression" else
        if (inherits(e, "formula")) argType <- "formula"

  if (!argType %in% types) {
    stop(
      "popArg type of evaluated arg not one of the allowed types ",
      "(set via argument types). Detected type: '", deparse(argType),
      "'. Allowed types: ", deparse(types))
  }
  if (argType == "NULL") return(NULL)

  av <- all.vars(arg)
  if (argType == "character") av <- e

  ## byNames: names of columns resulting from aggre argument, by which
  ## pyrs and such are aggregated. same functionality
  ## as in results seen in e.g.
  ## dt[, .N, by = list(factor(x), y, z = w)] ## factor, y, z
  ## note: first object in ags with list or expression aggre is "list"
  byNames <- NULL
  arg_substr5 <- substr(paste0(deparse(arg), collapse = " "), 1, 5)
  if (is.character(e)) {
    byNames <- e
  } else if (argType == "list" && arg_substr5 == "list(") {
    byNames <- sapply(arg[-1], function(x) all.names(x)[1])
  } else if (argType == "expression") {
    byNames <- all.names(arg)[1]
  }

  badNames <- c("$", ":")

  byNames[byNames %in% badNames] <- paste0("BV", seq_len(length(byNames)))[byNames %in% badNames]

  if (argType == "formula") {
    arg <- e
    use_data <- data
    use_enc <- enclos
    e <- RHS2DT(formula = e, data = use_data, enclos = use_enc)
    if (ncol(e) == 0L || nrow(e) == 0L) e <- data.table() ## e.g. y ~ 1

  } else if (is.character(e)) {
    all_names_present(data, e)
    if (DT) {
      ## note: e contains variable names in character strings,
      ## ergo fully named list & DT created
      l <- lapply(e, function(x) data[[x]])
      setattr(l, "names", e)
      setDT(l)
      e <- l; rm(l)
    }
  } else if (is.list(e)) {
    ## note: fully unnamed list has NULL names()
    ## partially named list has some "" names
    ne <- names(e)

    if (DT && any(sapply(e, is.null))) stop("at least one object in list arg is NULL; cannot form data.table with such list")

    if (is.null(ne)) ne <- rep("", length(e))


    wh_bad <- which(ne == "")
    if (length(wh_bad) > 0) {
      if (is.null(byNames)) {
        byNames <- paste0("BV", seq_len(length(e)))
      }

      ne[wh_bad] <- byNames[wh_bad]
      setattr(e, "names", ne)
    }

    if (DT) {
      ## NOTE: used to be setDT, but length of different elements
      ## in list may differ, which as.data.table handles correctly
      e <- as.data.table(e)
    }
  } else if (mode(e) == "numeric" || is.vector(e) || is.factor(e)) {
    ## is e.g. a numeric vector or a factor
    if (DT) {
      e <- data.table(V1 = e)
      setnames(e, 1, byNames)
    }
  }

  ## NOTE: e may be of type language at this point if arg was double-quoted
  ## and recursive = FALSE

  if (DT) {
    setDT(e)
    setattr(e, "all.vars", av)
    setattr(e, "quoted.arg", arg)
    setattr(e, "arg.type", argType)
    if (naming == "model" && ncol(e) > 0L) setnames(e, seq_len(ncol(e)), popArg2ModelNames(arg, type = argType))
  }
  e
}

popArgType <- function(
  arg,
  data = NULL,
  n = 1L,
  enclos = NULL,
  recursive = TRUE
) {
  ## input: a substitute()'d expression / argument
  ## NOTE: recursive useful when arg might be quoted twice and want the eventual
  ## result; need to supply data for it though
  ## output: type of thingie that was substitute()'d
  ##  * list (of expressions)
  ##  * character string vector
  ##  * an expression (includes symbol)
  av <- all.vars(arg, unique = TRUE) ## all variables
  av <- setdiff(av, c("$", "T", "F"))
  an <- all.names(arg, unique = TRUE) ## all variables and functions
  af <- setdiff(an, av) ## all functions used

  a <- deparse(arg)
  a <- paste0(a, collapse = "") ## lists may somehow produce length > 1 here
  if (substr(a, 1, 5) == "list(") return("list")
  if (a == "NULL") return("NULL")
  ## detection of character arguments is not easy and should not be considered
  ## fool proof since user may pass e.g. a vector of character strings as a
  ## symbol, which can only really be interpreted as an expression
  if (sum(grep('\\"', a)) && length(setdiff(af, "c")) == 0) return("character")

  if (is.data.frame(data)) {
    if (is.symbol(arg) && a %in% names(data)) return("expression")
    if (length(av) == 1L && av %in% names(data)) return("expression")
    e <- eval(arg, envir = data[seq_len(min(nrow(data), 20L)), ],
              enclos = if (is.environment(enclos)) enclos else parent.frame(n + 1L))
    if (inherits(e, "formula")) return("formula")
    if (is.null(e)) return("NULL")
    if (is.list(e)) return("list")
    if (is.character(e) && all(e %in% names(data))) return("character")
    if (is.vector(e) || is.factor(e)) return("expression")

    if (recursive && is.language(e)) return(popArgType(e, data = data, n = n + 1L, enclos = enclos))
  }

  "expression"

}
popArg2ModelNames <- function(arg, type) {
  ## INTENTION: given a quoted/substituted expression,
  ## digs out the expression(s) creating a/multiple column(s)
  ## and returns the deparsed expression(s) to be used as names
  ## of columns the same way that models such as lm() display
  ## the names of expressions used within formula

  ## some exceptions
  if (is.data.frame(arg)) return(names(arg))
  if (is.character(arg)) return(arg)

  type <- match.arg(type[1L], c("NULL", "character", "list", "expression", "formula"))

  lang <- NULL
  lang <- try(is.language(arg) || inherits(arg, "formula"), silent = TRUE)


  if (inherits(lang, "try-error") || !lang) stop("arg must be a quoted or substituted expression or a formula. Error message: ", lang, ". type of arg: ", typeof(arg), ". Class: ", class(arg), ". Mode: ", mode(arg), ".")

  d <- oneWhitespace(paste0(deparse(arg)))

  if (type == "expression") return(d) else
    if (type == "NULL") return(NULL) else
      if (type == "character") return(eval(arg)) else
        if (type == "list") {
          d <- substr(d, 6, nchar(d)-1L) ## removes "list(" and ")"
          d <- strsplit(d, ", ")
          return(unlist(d))
        } else if (type == "formula") {
          arg <- eval(arg)
          d <- names(RHS2list(arg))
          if (length(d) == 0L) return(NULL) ## e.g. y ~ 1
          return(d)
        }
  stop("could not determine deparsed-expression-names")
}





uses_dollar <- function(q, data.names) {
  ## INTENTION: determine whether q is an expressions that is evaluated
  ## outside a data.frame, i.e. one that uses the dollar operator.
  ## e.g. TF$V1 should not be evaluated in a data.frame even if it has
  ## the variables TF and V1 since it wont work and was not intended.
  if (!is.language(q) || inherits(q, "formula")) {
    return(FALSE)
  }

  d <- deparse(q)
  ## sometimes d is of length > 1 for some reason...
  d <- paste0(d, collapse = "")
  d <- oneWhitespace(d)

  if (substr(d, 1, 4) == "list") {
    ## lists are not allowed to work in this manner for now.
    return(FALSE)
  }

  if (!grepl(x = d, pattern = "\\$")) {
    ## does not use dollar operator.
    return(FALSE)
  }

  ## detect if word$word is used in d
  t <- regexec(pattern = "\\w+\\$\\w+", text = d)
  if (t != -1) {
    ## ok, used word$word
    ## is there are variable with that name in data.names?
    m <- unlist(regmatches(d, t))
    if (m %in% data.names) {
      return(FALSE)
    }
    ## if not, it should be evaluated outside the data.
    return(TRUE)
  }

  return(FALSE)
}




`%.:%` <- function(x, y) {
  ## INTENTION: hacking formula calls using `:`
  ## which is apparently normally evaluated in C... (in model.matrix.default)
  ## USAGE: e.g. c(1,2) %.:% c(3,4) = c(3, 8)
  ## (instead of getting warning)
  if (length(x) > 1L && length(y) > 1L && is.numeric(x) && is.numeric(y)) {
    return(x*y)
  } else if (length(x) == 1L && length(y) == 1L && is.numeric(x) && is.numeric(y)) {
    return(x:y)
  }
  as.factor(x):as.factor(y)

}



RHS2list <- function(formula, handle.adjust=TRUE) {
  ## INTENTION: turns the right-hand side of a formula
  ## into a list of substituted expressions;
  ## each element in list is an expressions separated
  ## by a '+' in the formula. needs to be eval()'d,
  ## preferably using the appropriate data set.
  if (!inherits(formula, "formula")) stop("not a formula")

  ## no response
  formula <- formula[c(1, length(formula))]

  te <- terms(formula)
  tl <- attr(te, "term.labels")

  ## handle adjusting variables (e.g. adjust(V1, V2) -> c("V1", "V2"))
  adj <- tl[substr(tl, 1, 7) == "adjust("]
  if (length(adj) == 0L) adj <- NULL
  if (handle.adjust && !is.null(adj)) {
    tl <- setdiff(tl, adj)
    adj <- unlist(lapply(adj, function(stri) {
      e <- parse(text = stri)[[1]]
      e <- as.list(e)[-1]
      unlist(lapply(e, deparse))
    }))

    tl <- c(tl, adj)
  }

  ## to avoid e.g. c(1,2):c(3,4) NOT evaluating as c(3, 8)
  l <- lapply(tl, function(x) gsub(pattern = ":", x = x, replacement = "%.:%"))
  l <- lapply(l, function(x) parse(text = x)[[1L]])

  names(l) <- tl

  setattr(l, "adjust", adj)

  l
}

uses_colon <- function(e) {
  if (is.character(e)) e <- parse(text = e)[[1]]
  stopifnot(is.call(e) || is.name(e))
  stopifnot(!inherits(e, "formula"))

  l <- as.list(e)
  if (deparse(l[[1]]) %in% c(":", "%.:%") && length(l) == 3) {
    return(TRUE)
  }
  FALSE
}

replace_colon <- function(e) {
  stopifnot(is.call(e) || is.name(e))
  stopifnot(!inherits(e, "formula"))

  if (!uses_colon(e)) return(e)

  l <- as.list(e)
  l[[1]] <- parse(text = "popEpi:::`%.:%`")[[1]]
  as.call(l)
}

RHS2DT <- function(formula, data = data.frame(), enclos = parent.frame(1L)) {
  l <- RHS2list(formula)
  if (length(l) == 0L) return(data.table())
  adj <- attr(l, "adjust")

  ## foolproofing in case data contains column named e.g. "a:b" and that is
  ## intended to be used
  dana <- names(data)
  dana <- gsub(x=dana, pattern=" %.:% ", replacement = ":")
  dana <- gsub(x=dana, pattern="%.:%", replacement = ":")


  ld <- lapply(l, deparse)
  ld <- lapply(ld, function(ch) {
    ch <- gsub(x=ch, pattern=" %.:% ", replacement = ":")
    ch <- gsub(x=ch, pattern="%.:%", replacement = ":")
    int <- if (ch %in% dana) which(dana %in% ch) else ch
    if (is.integer(int)) data[[names(data)[int]]] else NULL
  })
  ld[which(sapply(ld, is.null))] <- NULL
  l[names(ld)] <- ld

  ## foolproofs use of function %.:% by explicit referral, i.e. :::
  ## (this avoids scoping problem)
  l <- lapply(l, function(elem) {
    if (!is.call(elem)) return(elem)
    if (!uses_colon(elem)) return(elem)

    replace_colon(elem)

  })

  l <- lapply(l, function(elem) {
    eval(expr = elem, envir = data, enclos = enclos)
  })

  l <- as.data.table(l)
  setattr(l, "adjust", adj)
  l
}





Surv2DT <- function(Surv) {
  sa <- attributes(Surv)
  type <- sa$type

  dt <- as.data.table(lapply(sa$dimnames[[2]], function(col_nm) {
    Surv[, col_nm]
  }))
  setnames(dt, names(dt), sa$dimnames[[2]])

  statNA <- sum(is.na(dt$status))
  if (statNA) {
    stop("Some status indicators (", statNA  ," values in total) were NA. ",
         "Usual suspects: original status variable ",
         "has NA values, or you have numeric status variable with more than ",
         "two levels and you did not assign e.g. type = 'mstate' (e.g. ",
         "Surv(time = c(1,1,1), event = c(0,1,2), type = 'mstate') works).")

  }


  setattr(dt, "type", type)

  label_sources <- c("time2", "event")
  lapply(label_sources, function(lbl_src) {
    if (identical(sa$inputAttributes[[lbl_src]]$class, "factor")) {
      set(
        dt, j = "status",
        value = factor(dt$status, labels = sa$inputAttributes[[lbl_src]]$levels)
      )
    }
    NULL
  })

  dt[]
}





evalPopFormula <- function(formula, data = data.frame(), enclos = parent.frame(2L), subset = NULL, Surv.response = TRUE) {

  ## INTENTION: given a formula object, returns a DT where each column
  ## is an evaluated expression from formula (separated by  + )

  either <- FALSE
  if (is.character(Surv.response)) {
    Surv.response <- match.arg(Surv.response, "either")
    Surv.response <- TRUE
    either <- TRUE
  } else if (!is.logical(Surv.response)) {
    stop("Surv.response must be either logical or 'either'")
  }

  ## subset if needed ----------------------------------------------------------
  if (!is.null(subset) && !is.logical(subset)) {
    stop("subset must be NULL or a logical vector and not an expression at ",
         "this point. If you see this, complain to the package maintainer.")
  }
  if (!is.null(subset)) {
    keepVars <- c(all.vars(formula), "lex.Xst")
    data <- subsetDTorDF(data, subset = subset, select = keepVars)
  }


  ## formula -------------------------------------------------------------------
  if (!inherits(formula, "formula")) {
    stop("formula is not of class 'formula'; supply it as e.g. y ~ x")
  }
  if (length(formula) < 3L) {
    stop("formula appears to be one-sided, which is not supported; ",
         "supply it as e.g. y ~ x")
  }

  ## response
  y <- eval(formula[[2L]], envir = data, enclos = enclos)
  if (inherits(y, "Surv") && !either && !Surv.response) {
    stop("Response is a result of using Surv(), which is not allowed in ",
         "this context.")
  }

  if (!inherits(y, "Surv") && !either && Surv.response) {
    stop("The response of the formula must be a Surv object; ",
         "see ?Surv (in package survival).")
  }

  if (inherits(y, "Surv")) {
    y <- Surv2DT(y)
    setcolsnull(y, keep = c("time", "start", "status"), colorder = TRUE)
    if (!any(c("time", "start") %in% names(y))) {
      stop("You must supply function Surv a value to the 'time' ",
           "argument. See ?Surv")
    }
    setnames(y, names(y), c("time", "status")[seq_len(ncol(y))])
  } else {
    y <- data.table(y)
    setnames(y, 1, deparse(formula[[2L]]))
    if (either && inherits(data, "Lexis")) {
      ## we assume the unmodified lex.Xst to be a useful status variable.
      if (!"lex.Xst" %in% names(data)) {
        stop("Supplied a formula without using Surv(), and data was a Lexis ",
             "object, so assumed you intended to use 'lex.Xst' in data as the ",
             "status variable in this context, but that column was missing ",
             "from data.")
      }
      setnames(y, 1, "time")
      y[, "status"] <- data$lex.Xst

    }
  }


  ## RHS
  l <- RHS2DT(formula, data = data, enclos = enclos)
  adj <- attr(l, "adjust")

  ## combine
  l <- if (length(l) > 0L) cbind(y, l) else y

  setattr(l, "adjust.names", adj)
  setattr(l, "print.names", setdiff(names(l), c(adj, names(y))))
  setattr(l, "Surv.names", names(y))
  setattr(l, "formula", formula)

  l
}


evalRecursive <- function(arg, env, enc, max.n = 100L) {
  ## INTENTION: digs out actual evaluatable value and expression

  if (missing(env)) env <- environment()
  if (missing(enc)) enc <- parent.frame(1L)

  if (is.data.frame(env)) {
    na <- names(env)
    env <- env[seq_len(min(10L, nrow(env))), ]

    env <- data.frame(env)
    setattr(env, "names", na)

  }
  argSub <- arg

  tick <- 1L
  while (!inherits(arg, "try-error") && is.language(arg) &&
         !inherits(arg, "formula") && tick < max.n) {

    argSub <- arg
    arg <- try(eval(argSub, envir = env, enclos = enc), silent = TRUE)

    tick <- tick + 1L
  }

  if (tick == max.n) {
    stop("evaluated expression ", max.n,
         " times and still could not find underlying expression")
  }
  if (!is.language(argSub)) argSub <- substitute(arg)
  list(arg = arg, argSub = argSub, all.vars = all.vars(argSub))
}


usePopFormula <- function(form = NULL, adjust = NULL, data = data.frame(),
                          enclos, Surv.response = TRUE) {
  ## INTENTION: evaluates form and combines with adjust appropriately
  ## returns a list of the elements dug out from the formula and adjust
  ## arguments.
  # formSub <- substitute(form)
  al <- evalRecursive(arg = form, env = data, enc = enclos)

  if (!inherits(al$arg, "formula")) stop("'form' is not a formula object")

  dt <- evalPopFormula(formula = al$arg, data = data, enclos = enclos,
                       Surv.response = Surv.response)
  adNames <- attr(dt, "adjust.names")
  prNames <- attr(dt, "print.names")
  suNames <- attr(dt, "Surv.names")

  adjust <- evalPopArg(data, adjust, DT = TRUE, recursive = TRUE,
                       enclos = new.env(), naming = "model",
                       types = c("NULL", "character", "list", "expression"))

  if (is.data.frame(adjust) && (nrow(adjust) == 0L || ncol(adjust) == 0L)) {
    stop("adjust evaluated to an empty data.frame")
  }
  if (!is.null(adjust) && ncol(adjust) > 0L && length(adNames) > 0L) {
    stop("Cannot both use argument 'adjust' AND use an adjust() term within ",
         "the formula argument. Please only use one.")
  }
  if (is.null(adjust) && length(adNames) > 0L) {
    adjust <- dt[, .SD, .SDcols = c(adNames)]
  }


  print <- NULL
  if (length(prNames > 0L)) print <- dt[, .SD, .SDcols = eval(prNames)]

  list(y = dt[, .SD, .SDcols = c(suNames)],
       print = print,
       adjust = adjust, formula = al$arg)
}





#' @title Adjust Estimates by Categorical Variables
#' @description This function is only intended to be used within a formula
#' when supplied to e.g. `[survtab_ag]` and should not be
#' used elsewhere.
#' @param ... variables to adjust by, e.g. `adjust(factor(v1), v2, v3)`
#' @return Returns a list of promises of the variables supplied which can be
#' evaluated.
#' @examples
#'
#' y ~ x + adjust(z)
#' @export
adjust <- function(...) {

  call <- sys.call(1L)
  call <- as.list(call)[1L]

  if (deparse(call) %in% c("adjust", "list(adjust)")) {
    stop("Function adjust() only intended to be used within the formulas of ",
         "certain functions of package popEpi. See e.g. ?survtab_ag for usage.")
  }

  mc <- as.list(match.call())[-1L]
  if (is.list(mc) && length(mc) == 1) mc <- mc[[1]]
  mc
}

parse_adjust_formula <- function(f) {

  env <- environment(f)
  t <- attr(terms(f), "term.labels")

  l <- sapply(t, grepl, pattern = "adjust(", fixed = TRUE)

  a <- t[l]

  if (!any(l)) return(f)

  f <- deparse(f)
  f <- paste0(f, collapse = "")
  f <- oneWhitespace(f)

  for (k in seq_along(a)) {
    f <- sub(x = f, pattern = paste0(" + ", a[k]), replacement = "", fixed = TRUE)
  }

  a <- lapply(a, function(stri) {
    e <- parse(text = stri)[[1]]
    as.list(e)[-1]
  })

  a <- unlist(a, recursive = FALSE)

  a <- sapply(a, function(e) {

    e <- substitute(adjust(e), list(e = e))
    deparse(e)

  })

  f <- paste0(f, " + ", paste0(a, collapse = " + "))
  f <- eval(parse(text = f)[[1]])
  environment(f) <- env
  f
}

dt_robust_by <- function(e, by.var.nms) {
  stopifnot(
    length(e) == 1,
    is.character(e),
    grepl(x = e, pattern = "by\\s{0,}=\\s{0,}%%BY_VAR_NMS%%"),
    length(by.var.nms) == 0 || is.character(by.var.nms)
  )

  le <- paste0("list(", paste0("`", by.var.nms, "`", collapse = ", "), ")")
  if (!length(by.var.nms)) le <- "NULL"

  e <- gsub(x = e, pattern = "%%BY_VAR_NMS%%", fixed = TRUE, replacement = le)
  eval(parse(text = e), envir = parent.frame(1L))
  NULL
}