File: incidence_rates.R

package info (click to toggle)
r-cran-popepi 0.4.13%2Bdfsg-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 1,656 kB
sloc: sh: 13; makefile: 2
file content (347 lines) | stat: -rw-r--r-- 12,771 bytes
#' @title Direct-Standardised Incidence/Mortality Rates
#' @author Matti Rantanen, Joonas Miettinen
#'
#' @description `rate` calculates adjusted rates using
#' preloaded weights data or user specified weights.
#'
#' @param data aggregated data (see e.g. `[lexpand]`,
#' `[aggre]` if you have subject-level data)
#' @param pyrs person-years variable name in data.
#' [Flexible input][flexible_argument], typically e.g.
#' `pyrs = pyrs`.
#' @param obs observations variable name in data.
#' [Flexible input][flexible_argument], typically e.g.
#' `obs = obs`.
#' @param adjust variable for adjusting the rates.
#' [Flexible input][flexible_argument], typically e.g.
#' `adjust = agegroup`.
#' @param print variable name to stratify the rates.
#' [Flexible input][flexible_argument], typically e.g.
#' `print = sex` or `print = list(sex, area)`.
#' @param weights typically a list of weights or a `character` string
#' specifying an age group standardization scheme; see
#' the [dedicated help page][direct_standardization]
#' and examples.
#'
#' @param subset a logical expression to subset data.
#'
#' @details Input data needs to be in aggregated format with observations
#' and person-years. For individual data use `[lexpand]`, or
#' `[ltable]` and merge person-years manually.
#'
#' The confidence intervals are based on the normal approximation of the logarithm of the rate.
#' The variance of the log rate that is used to derive the confidence intervals
#' is derived using the delta method.
#'
#' @return Returns a `data.table` with observations, person-years, rates and
#' adjusted rates, if available. Results are stratified by `print`.
#' Adjusted rates are identified with suffix `.adj` and
#' `.lo` and `.hi` are for confidence intervals lower and upper
#' 95% bounds, respectively.
#' The prefix `SE.` stands for standard error.
#'
#' @seealso `[lexpand]`, `[ltable]`
#'
#' @examples
#' ## Prepare data with lexpand and then reformat agegroup.
#' data(sibr)
#' x <- lexpand(sibr, birth = bi_date, entry = dg_date, exit = ex_date,
#'              breaks = list(per = c(1990,2000,2010,2020), age = c(0:17*5,Inf)),
#'              aggre = list(agegroup = age, year.cat = per),
#'              status =  status != 0)
#'
#' x$agegroup <- cut(x$agegroup,  c(0:17*5,Inf), right = FALSE)
#'
#' ## calculate rates for selected periods with Nordic 2000 weights:
#' r1 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat,
#'             adjust = agegroup, weights = 'nordic')
#' r1
#'
#' ## use total person-years by stratum as weights (some have zero)
#' w <- ltable(x, by.vars = "agegroup", expr = sum(pyrs))
#' w[is.na(w$V1),]$V1 <- 0
#'
#' r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat,
#'             adjust = agegroup,
#'             weights = w$V1)
#' r2
#'
#' ## use data.frame of weights:
#' names(w) <- c("agegroup", "weights")
#' r2 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat,
#'             adjust = agegroup,
#'             weights = w)
#' r2
#'
#' ## internal weights (same result as above)
#' r3 <- rate( data = x, obs = from0to1, pyrs = pyrs, print = year.cat,
#'             adjust = agegroup,
#'             weights = "internal")
#' r3
#'
#' @import data.table
#' @export
#' @family main functions
#' @family rate functions

rate <- function( data,
                  obs = NULL,
                  pyrs = NULL,
                  print = NULL,
                  adjust = NULL,
                  weights = NULL,
                  subset = NULL
) {

  PF <- parent.frame(1L)
  TF <- environment()

  ## subsetting -----------------------------------------------------------
  subset <- substitute(subset)
  subset <- evalLogicalSubset(data = data, substiset = subset, enclos = PF)
  data <- data[subset,]
  setDT(data)

  # evalPopArg
  obs <- substitute(obs)
  inc.obs <- evalPopArg(data = data, arg = obs, enclos = PF)
  if (!length(inc.obs)) {
    stop("No observations given.")
  }
  obsNames <- copy(names(inc.obs))
  tmpObsNames <- makeTempVarName(data = data, pre = "obs")
  setnames(inc.obs, obsNames, tmpObsNames)

  pyrs <- substitute(pyrs)
  inc.pyr <- evalPopArg(data = data, arg = pyrs, enclos = PF)
  if (!length(inc.pyr)) {
    stop("No pyrs given.")
  }
  pyrNames <- copy(names(inc.pyr))
  tmpPyrNames <- makeTempVarName(data = data, pre = "pyr")
  setnames(inc.pyr, pyrNames, tmpPyrNames)

  print <- substitute(print)
  inc.pri <- evalPopArg(data = data, arg = print, enclos = PF)
  prNames <- tmpPrNames <- NULL
  if (length(inc.pri)) {
    prNames <- copy(names(inc.pri))
    tmpPrNames <- makeTempVarName(data = data,
                                  pre = paste0("print", seq_along(prNames)))
    setnames(inc.pri, prNames, tmpPrNames)
  }

  adjust <- substitute(adjust)
  inc.adj <- evalPopArg(data = data, arg = adjust, enclos = PF)
  adNames <- tmpAdNames <- NULL
  if (length(inc.adj)) {
    adNames <- copy(names(inc.adj))
    tmpAdNames <- makeTempVarName(data = data,
                                  pre = paste0("adjust", seq_along(adNames)))
    setnames(inc.adj, adNames, tmpAdNames)
  }

  ## collect data --------------------------------------------------------------
  data <- cbind(inc.obs, inc.pyr)
  if (!is.null(prNames)) data <- cbind(data, inc.pri)
  if (!is.null(adNames)) data <- cbind(data, inc.adj)


  ## handle weights ------------------------------------------------------------
  weights <- substitute(weights)
  weights <- eval(weights, envir = PF)
  weights <- copy(weights)
  if (length(inc.adj)) {
    ## rename adjust variables in inc.adj back to original names
    ## for more human-readable errors in checkWeights if any occur
    setnames(inc.adj, tmpAdNames, adNames)
  }

  checkWeights(weights, inc.adj)
  if (is.list(weights) && !is.data.frame(weights)) {
    ## ensure weights list / DF names match to temp adjust var names
    weights <- weights[adNames]
    names(weights) <- tmpAdNames
  } else if (is.data.frame(weights)) {
    setnames(weights, adNames, tmpAdNames)
  }

  ## form table with weights ---------------------------------------------------
  NA.msg <- "Data contains %%NA_COUNT%% NA values."
  data <- makeWeightsDT(data,
                        values = list(tmpObsNames, tmpPyrNames),
                        print = tmpPrNames,
                        adjust = tmpAdNames,
                        weights = weights,
                        internal.weights.values = tmpPyrNames,
                        NA.text = NA.msg)

  ## estimate standardized rates -----------------------------------------------
  data <- rate_est(data = data,
                   obs = tmpObsNames,
                   pyrs = tmpPyrNames,
                   print = tmpPrNames,
                   weights = "weights")

  ## final touch ---------------------------------------------------------------
  setDT(data)
  setattr(data, "class", c("rate", "data.table", "data.frame"))
  setattr(data, name = 'rate.meta', value = list(obs = obsNames,
                                                 pyrs = pyrNames,
                                                 weights = weights,
                                                 adjust = adNames,
                                                 print = prNames,
                                                 call = match.call(),
                                                 NAs = NA))
  setnames(data, c(tmpObsNames, tmpPyrNames, tmpPrNames),
           c(obsNames, pyrNames, prNames))

  # data.frame output option
  if (!return_DT()) {
    setDFpe(data)
  }

  return(data[])
}

#' @export
getCall.rate <- function (x, ...) {
  attributes(x)$rate.meta$call
}

stdr.weights <- function(wp = 'world00_1') {

  ## This one returns the standard population
  ## output: data.table with colnames: agegroup, reference
  ## standard populations are from datasets: stdpop18 and stdpop101
  allow.pop <- c("world_1966_18of5",
                 "europe_1976_18of5",
                 "nordic_2000_18of5",
                 "world_2000_18of5",
                 "world_2000_20of5",
                 "world_2000_101of1")
  wp <- match.arg(wp, allow.pop)

  if (length(wp) > 1) {
    stop('Standard population name is not a scalar (length != 1).')

  } else if (wp %in% allow.pop[1:3]) {

    # get standard pop
    sr <- data.table(popEpi::stdpop18)
    setnames(sr, 1:4, c("agegroup",allow.pop[1:3]))
    sr[, agegroup := 1:18]
    sr[, setdiff(allow.pop[1:3], wp) := NULL]

    setnames(sr, wp, 'reference')

  } else if (wp %in% allow.pop[4:6]) {

    sr <- data.table(popEpi::stdpop101)
    if (wp == "world_2000_18of5") {
      sr[,agegroup := cut(agegroup, breaks=c(0:17*5,Inf), right=FALSE, labels=FALSE)]
      sr <- sr[,list(world_std = sum(world_std)), by="agegroup"]
    }
    if (wp == 'world_2000_20of5') {
      sr[,agegroup := cut(agegroup, breaks=c(0:19*5,Inf), right=FALSE, labels=FALSE)]
      sr <- sr[,list(world_std = sum(world_std)), by="agegroup"]
    }
    else {
      sr <- sr[,list(world_std = sum(world_std)), by="agegroup"]
    }
    setnames(sr, "world_std", "reference")
  }
  else {
    stop("Invalid standard population name.")
  }
  sr[]
}
globalVariables(c('stdpop18','stdpop101','agegroup','world_std'))


rate_est <- function(data = data,
                     obs = 'obs',
                     pyrs = 'pyrs',
                     print = NULL,
                     weights = NULL
) {
  ## This one estimates the rates and calculates CI's and SE's.

  badVars <- paste0("Internal error: missing following variable names in ",
                    "working data: %%VARS%%. Complain to the pkg maintainer ",
                    "if you see this.")
  all_names_present(data, c(obs, pyrs, print), msg = badVars)

  data <- data.table(data)
  if ( is.null(weights) |  !weights %in% colnames(data)) {
    weights <- NULL
  }

  if (all(!is.null(weights), !is.null(obs), !is.null(pyrs))) {
    # rate.adj

    f2 <- function(list) list[[1]]/list[[2]]*list[[3]]
    funx <- function(n,d,w,fun)  eval(parse(text=fun))


    # variance rate.adj for each strata A
    fun1 <- '(._d_/._n_^2) * ._w_^2'
    fun2 <- '._d_ / ._n_ * ._w_'

    make_fun <- function(n = NA, d = NA, w = NA, fun) {
      fun <- gsub(pattern = "._n_", replacement = n, x = fun)
      fun <- gsub(pattern = "._d_", replacement = d, x = fun)
      fun <- gsub(pattern = "._w_", replacement = w, x = fun)
      parse(text = fun)
    }
    eval.me1 <- make_fun(d = obs, n = pyrs, w=weights, fun = fun1)
    eval.me2 <- make_fun(d = obs, n = pyrs, w=weights, fun = fun2)
    data[, var.temp := eval(eval.me1)]
    data[, lam.temp := eval(eval.me2)]
    # add std weighted rates and variances
    #data[, ':='(var.temp = funx(d=get(obs), n=get(pyrs), w=get(weights), fun = fun1),
    #            lam.temp = funx(d=get(obs), n=get(pyrs), w=get(weights), fun = fun2)) ]
    data[, rate.adj := f2(.SD), .SDcols= c(obs, pyrs, weights)]

    # aggregate data
    ie <- paste0('list(', obs, '=sum(',obs,',na.rm=TRUE), ', pyrs, '=sum(',pyrs,',na.rm=TRUE),',
                 'rate.adj=sum(rate.adj,na.rm=TRUE),' ,'lam.temp=sum(lam.temp,na.rm=TRUE), var.temp=sum(var.temp,na.rm=TRUE))')
    l <- parse(text = ie)

    data <- data[, eval(l), by=print]
    # rate.adj: S.E.
    data[, SE.log.rate.adj := sqrt((1/lam.temp)^2 * var.temp) ] # tämä on log-rate
    data[, SE.rate.adj := sqrt(var.temp)]
    # rate.adj: CI
    data[, ':='(rate.adj.lo = exp( log(rate.adj) - SE.log.rate.adj*1.96 ),
                rate.adj.hi = exp( log(rate.adj) + SE.log.rate.adj*1.96 )) ]
    data[,c('lam.temp','var.temp','SE.log.rate.adj') := NULL]
  }

  else {
    ie <- paste0('list(', obs, '=sum(',obs,'), ', pyrs, '=sum(',pyrs,'))')
    l <- parse(text = ie)
    data <- data[, eval(l), by=print]
  }
  # rate
  ia <- paste0('rate := ',obs,'/', pyrs)
  k <- parse(text = ia)
  data[, eval(k), by = print]

  # var(rate)
  var_r <- paste0('SE.rate := sqrt(',obs,'/(',pyrs,'*',pyrs,'))')
  k <- parse(text = var_r)
  data[, eval(k), by = print]

  # var(log(rate)) and CI
  eval.me3 <- paste('exp(sqrt(1/',obs,'))')
  eval.me3 <- parse(text = eval.me3)
  data[, SE.log.rate := eval(eval.me3)]
  data[, ':='(rate.lo = exp(log(rate)-log(SE.log.rate)*1.96),
              rate.hi = exp(log(rate)+log(SE.log.rate)*1.96)) ]
  data[, SE.log.rate := NULL]
  return(data[])
}

globalVariables(c('var.temp','lam.temp','rate.adj','SE.rate.adj','SE.rate','SE.log.rate','SE.log.rate.adj'))