File: pbinom.R

package info (click to toggle)
r-cran-poissonbinomial 1.2.7-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 736 kB
sloc: cpp: 719; makefile: 3
file content (454 lines) | stat: -rw-r--r-- 17,700 bytes
parent folder | download | duplicates (2)
#'@name PoissonBinomial-Distribution
#'
#'@importFrom stats dbinom pbinom runif
#'
#'@title The Poisson Binomial Distribution
#'
#'@description
#'Density, distribution function, quantile function and random generation for
#'the Poisson binomial distribution with probability vector \code{probs}.
#'
#'@param x           Either a vector of observed numbers of successes or NULL.
#'                   If NULL, probabilities of all possible observations are
#'                   returned.
#'@param p           Vector of probabilities for computation of quantiles.
#'@param n           Number of observations. If \code{length(n) > 1}, the
#'                   length is taken to be the number required.
#'@param probs       Vector of probabilities of success of each Bernoulli
#'                   trial.
#'@param method      Character string that specifies the method of computation
#'                   and must be one of \code{"DivideFFT"}, \code{"Convolve"},
#'                   \code{"Characteristic"}, \code{"Recursive"},
#'                   \code{"Mean"}, \code{"GeoMean"}, \code{"GeoMeanCounter"},
#'                   \code{"Poisson"}, \code{"Normal"} or
#'                   \code{"RefinedNormal"} (abbreviations are allowed).
#'@param wts         Vector of non-negative integer weights for the input
#'                   probabilities.
#'@param log,log.p   Logical value indicating if results are given as
#'                   logarithms.
#'@param lower.tail  Logical value indicating if results are \eqn{P[X \leq x]}
#'                   (if \code{TRUE}; default) or \eqn{P[X > x]} (if 
#'                   \code{FALSE}).
#'@param generator   Character string that specifies the random number
#'                   generator and must either be \code{"Sample"} (default) or
#'                   \code{"Bernoulli"} (abbreviations are allowed). See
#'                   Details for more information.
#'
#'@details
#'See the references for computational details. The \emph{Divide and Conquer}
#'(\code{"DivideFFT"}) and \emph{Direct Convolution} (\code{"Convolve"})
#'algorithms are derived and described in Biscarri, Zhao & Brunner (2018). The
#'\emph{Discrete Fourier Transformation of the Characteristic Function}
#'(\code{"Characteristic"}), the \emph{Recursive Formula} (\code{"Recursive"}),
#'the \emph{Poisson Approximation} (\code{"Poisson"}), the
#'\emph{Normal Approach} (\code{"Normal"}) and the
#'\emph{Refined Normal Approach} (\code{"RefinedNormal"}) are described in Hong
#'(2013). The calculation of the \emph{Recursive Formula} was modified to
#'overcome the excessive memory requirements of Hong's implementation.
#'
#'The \code{"Mean"} method is a naive binomial approach using the arithmetic
#'mean of the probabilities of success. Similarly, the \code{"GeoMean"} and
#'\code{"GeoMeanCounter"} procedures are binomial approximations, too, but
#'they form the geometric mean of the probabilities of success
#'(\code{"GeoMean"}) and their counter probabilities (\code{"GeoMeanCounter"}),
#'respectively.
#'
#'In some special cases regarding the values of \code{probs}, the \code{method}
#'parameter is ignored (see Introduction vignette).
#'
#'Random numbers can be generated in two ways. The \code{"Sample"} method
#'uses \code{R}'s \code{sample} function to draw random values according to
#'their probabilities that are calculated by \code{dgpbinom}. The
#'\code{"Bernoulli"} procedure ignores the \code{method} parameter and
#'simulates Bernoulli-distributed random numbers according to the probabilities
#'in \code{probs} and sums them up. It is a bit slower than the \code{"Sample"}
#'generator, but may yield better results, as it allows to obtain observations
#'that cannot be generated by the \code{"Sample"} procedure, because
#'\code{dgpbinom} may compute 0-probabilities, due to rounding, if the length
#'of \code{probs} is large and/or its values contain a lot of very small
#'values.
#'
#'@return
#'\code{dpbinom} gives the density, \code{ppbinom} computes the distribution
#'function, \code{qpbinom} gives the quantile function and \code{rpbinom}
#'generates random deviates.
#'
#'For \code{rpbinom}, the length of the result is determined by \code{n}, and
#'is the lengths of the numerical arguments for the other functions.
#'
#'@section References:
#'Hong, Y. (2013). On computing the distribution function for the Poisson
#'    binomial distribution. \emph{Computational Statistics & Data Analysis},
#'    \strong{59}, pp. 41-51. \doi{10.1016/j.csda.2012.10.006}
#'
#'Biscarri, W., Zhao, S. D. and Brunner, R. J. (2018) A simple and fast method
#'    for computing the Poisson binomial distribution.
#'    \emph{Computational Statistics and Data Analysis}, \strong{31}, pp.
#'    216–222. \doi{10.1016/j.csda.2018.01.007}
#'    
#'@examples
#'set.seed(1)
#'pp <- c(0, 0, runif(995), 1, 1, 1)
#'qq <- seq(0, 1, 0.01)
#'
#'dpbinom(NULL, pp, method = "DivideFFT")
#'ppbinom(450:550, pp, method = "DivideFFT")
#'qpbinom(qq, pp, method = "DivideFFT")
#'rpbinom(100, pp, method = "DivideFFT")
#'
#'dpbinom(NULL, pp, method = "Convolve")
#'ppbinom(450:550, pp, method = "Convolve")
#'qpbinom(qq, pp, method = "Convolve")
#'rpbinom(100, pp, method = "Convolve")
#'
#'dpbinom(NULL, pp, method = "Characteristic")
#'ppbinom(450:550, pp, method = "Characteristic")
#'qpbinom(qq, pp, method = "Characteristic")
#'rpbinom(100, pp, method = "Characteristic")
#'
#'dpbinom(NULL, pp, method = "Recursive")
#'ppbinom(450:550, pp, method = "Recursive")
#'qpbinom(qq, pp, method = "Recursive")
#'rpbinom(100, pp, method = "Recursive")
#'
#'dpbinom(NULL, pp, method = "Mean")
#'ppbinom(450:550, pp, method = "Mean")
#'qpbinom(qq, pp, method = "Mean")
#'rpbinom(100, pp, method = "Mean")
#'
#'dpbinom(NULL, pp, method = "GeoMean")
#'ppbinom(450:550, pp, method = "GeoMean")
#'qpbinom(qq, pp, method = "GeoMean")
#'rpbinom(100, pp, method = "GeoMean")
#'
#'dpbinom(NULL, pp, method = "GeoMeanCounter")
#'ppbinom(450:550, pp, method = "GeoMeanCounter")
#'qpbinom(qq, pp, method = "GeoMeanCounter")
#'rpbinom(100, pp, method = "GeoMeanCounter")
#'
#'dpbinom(NULL, pp, method = "Poisson")
#'ppbinom(450:550, pp, method = "Poisson")
#'qpbinom(qq, pp, method = "Poisson")
#'rpbinom(100, pp, method = "Poisson")
#'
#'dpbinom(NULL, pp, method = "Normal")
#'ppbinom(450:550, pp, method = "Normal")
#'qpbinom(qq, pp, method = "Normal")
#'rpbinom(100, pp, method = "Normal")
#'
#'dpbinom(NULL, pp, method = "RefinedNormal")
#'ppbinom(450:550, pp, method = "RefinedNormal")
#'qpbinom(qq, pp, method = "RefinedNormal")
#'rpbinom(100, pp, method = "RefinedNormal")
#'
#'@export
dpbinom <- function(x, probs, wts = NULL, method = "DivideFFT", log = FALSE){
  ## preliminary checks
  # number of probabilities
  n <- length(probs)
  
  # check if 'x' contains only integers
  if(!is.null(x) && any(x - round(x) != 0)){
    warning("'x' should contain integers only! Using rounded off values.")
    x <- floor(x)
  }
  
  # check if 'probs' contains only probabilities
  if(is.null(probs) || any(is.na(probs) | probs < 0 | probs > 1))
    stop("'probs' must contain real numbers between 0 and 1!")
  
  # make sure that the value of 'method' matches one of the implemented procedures
  method <- match.arg(method, c("DivideFFT", "Convolve", "Characteristic", "Recursive", "Mean", "GeoMean", "GeoMeanCounter", "Poisson", "Normal", "RefinedNormal"))
  
  # check if 'wts' contains only integers (zeros are allowed)
  if(!is.null(wts) && any(is.na(wts) | wts < 0 | abs(wts - round(wts)) > 1e-07))
    stop("'wts' must contain non-negative integers!")
  
  if(!is.null(wts) && length(wts) != n)
    stop("'probs' and 'wts' (if not NULL) must have the same length!")
  
  ## expand 'probs' according to the counts in 'wts'
  # if 'wts' is NULL, set it to be a vector of ones
  if(is.null(wts))
    wts <- rep(1, n)
  
  # expand 'probs'
  probs <- rep(probs, wts)
  
  # re-compute length of 'probs' (= sum of 'wts')
  n <- sum(wts)
  
  # if x = NULL, return all possible probabilities
  if(is.null(x)) x <- 0:n
  
  # identify valid 'x' values (invalid ones will have 0-probability)
  idx.x <- which(x >= 0 & x <= n)
  
  # select valid observations
  y <- x[idx.x]
  
  ## compute probabilities
  # vector for storing the probabilities
  d <- double(length(x))
  
  # no computation needed, if there are no valid observations in 'x'
  if(length(idx.x)){
    # which probabilities are 0 or 1
    idx0 <- which(probs == 0)
    idx1 <- which(probs == 1)
    probs <- probs[probs > 0 & probs < 1]
    
    # number of zeros and ones
    n0 <- length(idx0)
    n1 <- length(idx1)
    np <- n - n0 - n1
    
    # relevant observations
    idx.y <- which(y %in% n1:(n - n0))
    
    if(length(idx.y)){
      z <- y[idx.y] - n1
    
      if(np == 0){
        # 'probs' contains only zeros and ones, i.e. only one possible observation
        d[idx.x][idx.y] <- 1
      }else if(np == 1){
        # 'probs' contains only one value that is not 0 or 1, i.e. a Bernoulli distribution
        d[idx.x][idx.y] <- c(1 - probs, probs)[z + 1]
      }else{
        if(all(probs == probs[1])){
          # all values of 'probs' are equal, i.e. a standard binomial distribution
          d[idx.x][idx.y] <- dbinom(z, np, probs[1])
        }else{
          # otherwise, compute distribution according to 'method'
          d[idx.x][idx.y] <- switch(method, DivideFFT = dpb_dc(z, probs),
                                            Convolve = dpb_conv(z, probs),
                                            Characteristic = dpb_dftcf(z, probs),
                                            Recursive = dpb_rf(z, probs),
                                            Mean = dpb_mean(z, probs),
                                            GeoMean = dpb_gmba(z, probs, FALSE),
                                            GeoMeanCounter = dpb_gmba(z, probs, TRUE),
                                            Poisson = dpb_pa(z, probs),
                                            Normal = dpb_na(z, probs, FALSE),
                                            RefinedNormal = dpb_na(z, probs, TRUE))
        }
      }
    }
  }
  
  # logarithm, if required
  if(log) d <- log(d)
  
  # return results
  return(d)
}

#'@rdname PoissonBinomial-Distribution
#'@export
ppbinom <- function(x, probs, wts = NULL, method = "DivideFFT", lower.tail = TRUE, log.p = FALSE){
  ## preliminary checks
  # number of probabilities
  n <- length(probs)
  
  # check if 'x' contains only integers
  if(!is.null(x) && any(x - round(x) != 0)){
    warning("'x' should contain integers only! Using rounded off values.")
    x <- floor(x)
  }
  
  # check if 'probs' contains only probabilities
  if(is.null(probs) || any(is.na(probs) | probs < 0 | probs > 1))
    stop("'probs' must contain real numbers between 0 and 1!")
  
  # make sure that the value of 'method' matches one of the implemented procedures
  method <- match.arg(method, c("DivideFFT", "Convolve", "Characteristic", "Recursive", "Mean", "GeoMean", "GeoMeanCounter", "Poisson", "Normal", "RefinedNormal"))
  
  # check if 'wts' contains only integers (zeros are allowed)
  if(!is.null(wts) && any(is.na(wts) | wts < 0 | abs(wts - round(wts)) > 1e-07))
    stop("'wts' must contain non-negative integers!")
  
  if(!is.null(wts) && length(wts) != n)
    stop("'probs' and 'wts' (if not NULL) must have the same length!")
  
  ## expand 'probs' according to the counts in 'wts'
  # if 'wts' is NULL, set it to be a vector of ones
  if(is.null(wts))
    wts <- rep(1, n)
  
  # expand 'probs'
  probs <- rep(probs, wts)
  
  # re-compute length of 'probs' (= sum of 'wts')
  n <- sum(wts)
  
  # if x = NULL, return all possible probabilities
  if(is.null(x)) x <- 0:n
  
  # identify valid 'x' values (invalid ones will have 0-probability)
  idx.x <- which(x >= 0 & x <= n)
  
  # select valid observations
  y <- x[idx.x]
  
  ## compute probabilities
  # vector for storing the probabilities
  d <- rep(as.numeric(!lower.tail), length(x))
  
  # no computation needed, if there are no valid observations in 'x'
  if(length(idx.x)){
    # which probabilities are 0 or 1
    idx0 <- which(probs == 0)
    idx1 <- which(probs == 1)
    probs <- probs[probs > 0 & probs < 1]
    
    # number of zeros and ones
    n0 <- length(idx0)
    n1 <- length(idx1)
    np <- n - n0 - n1
    
    # relevant observations
    idx.y <- which(y %in% n1:(n - n0))
    idx.z <- which(y > n - n0)
    
    if(length(idx.y)){
      z <- y[idx.y] - n1
      
      if(np == 0){
        # 'probs' contains only zeros and ones, i.e. there is only one possible observation
        d[idx.x][idx.y] <- if(lower.tail) 1 else 0
      }else if(np == 1){
        # 'probs' contains only one value that is not 0 or 1, i.e. a Bernoulli distribution
        d[idx.x][idx.y] <- if(lower.tail) c(1 - probs, 1)[z + 1] else c(probs, 0)[z + 1]
      }else{
        if(all(probs == probs[1])){
          # all values of 'probs' are equal, i.e. a standard binomial distribution
          d[idx.x][idx.y] <- pbinom(q = z, size = np, prob = probs[1], lower.tail = lower.tail)
        }else{
          # otherwise, compute distribution according to 'method'
          d[idx.x][idx.y] <- switch(method,
                                    DivideFFT = ppb_dc(z, probs, lower.tail),
                                    Convolve = ppb_conv(z, probs, lower.tail),
                                    Characteristic = ppb_dftcf(z, probs, lower.tail),
                                    Recursive = ppb_rf(z, probs, lower.tail),
                                    Mean = ppb_mean(z, probs, lower.tail),
                                    GeoMean = ppb_gmba(z, probs, FALSE, lower.tail),
                                    GeoMeanCounter = ppb_gmba(z, probs, TRUE, lower.tail),
                                    Poisson = ppb_pa(z, probs, lower.tail),
                                    Normal = ppb_na(z, probs, FALSE, lower.tail),
                                    RefinedNormal = ppb_na(z, probs, TRUE, lower.tail))
          
          # compute counter-probabilities, if necessary
          #if(!lower.tail) d[idx.x][idx.y] <- 1 - d[idx.x][idx.y]
        }
      }
    }
    # fill cumulative probabilities of values above the relevant range
    if(length(idx.z)) d[idx.x][idx.z] <- as.double(lower.tail)
  }
  # fill cumulative probabilities of values above n
  d[x > n] <- as.double(lower.tail)
  
  # logarithm, if required
  if(log.p) d <- log(d)
  
  # return results
  return(d)
}

#'@rdname PoissonBinomial-Distribution
#'@importFrom stats stepfun
#'@export
qpbinom <- function(p, probs, wts = NULL, method = "DivideFFT", lower.tail = TRUE, log.p = FALSE){
  ## preliminary checks
  # check if 'p' contains only probabilities
  if(!log.p){
    if(is.null(p) || any(is.na(p) | p < 0 | p > 1))
      stop("'p' must contain real numbers between 0 and 1!")
  }else{
    if(is.null(p) || any(is.na(p) | p > 0))
      stop("'p' must contain real numbers between -Inf and 0!")
  }
  
  # make sure that the value of 'method' matches one of the implemented procedures
  method <- match.arg(method, c("DivideFFT", "Convolve", "Characteristic", "Recursive", "Mean", "GeoMean", "GeoMeanCounter", "Poisson", "Normal", "RefinedNormal"))
  
  ## compute probabilities (does checking for the other variables)
  cdf <- ppbinom(NULL, probs, wts, method, lower.tail)
  
  # size of distribution
  size <- length(probs)
  
  # length of cdf
  #len <- length(cdf)
  
  # logarithm, if required
  if(log.p) p <- exp(p)
  
  ## compute quantiles
  # observable range and indices
  n0 <- sum(probs == 0)
  n1 <- sum(probs == 1)
  hi <- size - n0
  range <- n1:hi
  #idx <- range + 1
  
  # handle quantiles between 0 and 1
  if(lower.tail) Q <- stepfun(cdf[range + 1], c(range, hi), right = TRUE)
  else Q <- stepfun(rev(cdf[range + 1]), c(hi, rev(range)), right = TRUE)
  
  # vector to store results
  res <- Q(p)
  
  # handle quantiles of 0 or 1
  res[p == lower.tail]  <- hi
  res[p == !lower.tail] <- n1
  
  # return results
  return(res)
}

#'@rdname PoissonBinomial-Distribution
#'@importFrom stats runif rbinom
#'@export
rpbinom <- function(n, probs, wts = NULL, method = "DivideFFT", generator = "Sample"){
  len <- length(n)
  if(len > 1) n <- len
  
  # check if 'n' is NULL
  if(is.null(n)) stop("'n' must not be NULL!")
  
  # number of probabilities
  len <- length(probs)
  
  # check if 'probs' contains only probabilities
  if(is.null(probs) || any(is.na(probs) | probs < 0 | probs > 1))
    stop("'probs' must contain real numbers between 0 and 1!")
  
  # check if 'wts' contains only integers (zeros are allowed)
  if(!is.null(wts) && any(is.na(wts) | wts < 0 | abs(wts - round(wts)) > 1e-07))
    stop("'wts' must contain non-negative integers!")
  
  if(!is.null(wts) && length(wts) != len)
    stop("'probs' and 'wts' (if not NULL) must have the same length!")
  
  ## expand 'probs' according to the counts in 'wts'
  # if 'wts' is NULL, set it to be a vector of ones
  if(is.null(wts))
    wts <- rep(1, len)
  
  # expand 'probs'
  probs <- rep(probs, wts)
  
  # make sure that the value of 'method' matches one of the implemented procedures
  method <- match.arg(method, c("DivideFFT", "Convolve", "Characteristic", "Recursive", "Mean", "GeoMean", "GeoMeanCounter", "Poisson", "Normal", "RefinedNormal"))
  
  # make sure that the value of 'generator' matches one of the implemented procedures
  generator <- match.arg(generator, c("Sample", "Bernoulli"))
  
  # generate random numbers
  res <- switch(generator, Sample    = sample(0:length(probs), n, TRUE, dpbinom(NULL, probs, NULL, method)),
                           Bernoulli = rpb_bernoulli(n, probs))
  
  # return results
  return(res)
}