File: hashCheck.r

package info (click to toggle)
hmisc 5.2-4-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,044 kB
  • sloc: asm: 28,905; f90: 590; ansic: 415; xml: 160; fortran: 75; makefile: 2
file content (147 lines) | stat: -rw-r--r-- 6,536 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
##' Check for Changes in List of Objects
##'
##' Given an RDS file name and a list of objects, does the following:
##' * makes a vector of hashes, one for each object.  Function objects are run through `deparse` so that the environment of the function will not be considered.
##' * see if the file exists; if not, return a list with result=NULL, `hash` = new vector of hashes, `changed='All'`
##' * if the file exists, read the file and its hash attribute as `prevhash`
##' * if `prevhash` is not identical to hash:
##'     if `.print.=TRUE` (default), print to console a summary of what's changed
##'     return a list with result=NULL, `hash` = new hash vector, changed
##' * if `prevhash = hash`, return a list with result=file object, `hash`=new hash,  changed=''
##'
##' Set `options(debughash=TRUE)` to trace results in `/tmp/debughash.txt`
##' @title hashCheck
##' @param ... a list of objects including data frames, vectors, functions, and all other types of R objects that represent dependencies of a certain calculation
##' @param file name of file in which results are stored
##' @param .print. set to `FALSE` to suppress printing information messages about what has changed
##' @param .names. vector of names of original arguments if not calling `hashCheck` directly
##' @return a `list` with elements `result` (the computations), `hash` (the new hash), and `changed` which details what changed to make computations need to be run
##' @author Frank Harrell
##' @md
hashCheck <- function(..., file, .print.=TRUE, .names.=NULL) {
  .d.      <- list(...)
  .nam.    <- if(length(.names.)) .names. else as.character(sys.call())[-1]
  .nam.    <- .nam.[1 : length(.d.)]
  names(.d.) <- .nam.
  
  .debug. <- length(.Options$debughash) && .Options$debughash
  ct <- if(.debug.)
          function(...) cat(..., '\n', file='/tmp/debughash.txt', append=TRUE)
        else
          function(...) {}

  if(! requireNamespace('digest', quietly=TRUE))
    stop('must install digest package to use hashCheck or runifChanged')

  ct(.nam.)
  .g. <- function(x) digest::digest(if(is.function(x)) deparse(x) else x)
  .hash. <- sapply(.d., .g.)
  if(.debug.) prn(.hash., file='/tmp/debughash.txt')
  
  .prevhash. <- NULL
  if(! file.exists(file)) {
    ct('no file', file)
    return(list(result=NULL, hash=.hash., changed='All'))
    }

  R        <- readRDS(file)
  .prevhash. <- attr(R, 'hash')
if(! length(.prevhash.)) {
    if(.print.) cat('\nRe-run because of no previous hash\n\n')
    ct('no previous hash')
    return(list(result=NULL, hash=.hash., changed='No previous hash'))
    }

  samelen <- length(.hash.) == length(.prevhash.)
  if(samelen && all(.hash. == .prevhash.)) {
    ct('no change')
    return(list(result=R, hash=.hash., changed=''))
    }
  .s. <- character(0)

  if(! samelen) {
    .a. <- names(.prevhash.)
    .b. <- names(.hash.)
    .w. <- setdiff(.a., .b.)
    if(length(.w.))
      .s. <- c(.s., paste('objects removed:',
                      paste(.w., collapse=' ')))
    .w. <- setdiff(.b., .a.)
    if(length(.w.))
      .s. <- c(.s., paste('objects added:',
                      paste(.w., collapse=' ')))
  } else 
    .s. <- c(.s., paste('changes in the following objects:',
                    paste(.nam.[.hash. != .prevhash.], collapse=' ')))

  .s. <- paste(.s., collapse=';')
  ct(.s.)
      
  if(.print.) cat('\nRe-run because of', .s., '\n\n')

  list(result=NULL, hash=.hash., changed=.s.)
}


##' Re-run Code if an Input Changed
##'
##' Uses `hashCheck` to run a function and save the results if specified inputs have changed, otherwise to retrieve results from a file.  This makes it easy to see if any objects changed that require re-running a long simulation, and reports on any changes.  The file name is taken as the chunk name appended with `.rds` unless it is given as `file=`.  `fun` has no arguments.  Set `.inclfun.=FALSE` to not include `fun` in the hash check (for legacy uses).  The typical workflow is as follows.
##' ```
##' f <- function(       ) {
##' # . . . do the real work with multiple function calls ...
##' }
##' seed <- 3
##' set.seed(seed)
##' w <- runifChanged(f, seed, obj1, obj2, ....)
##' ```
##' `seed, obj1, obj2`, ... are all the objects that `f()` uses that if changed
##' would give a different result of `f()`.  This can include functions such as
##' those in a package, and `f` will be re-run if any of the function's code
##' changes.  `f` is also re-run if the code inside `f` changes.
##' The result of `f` is stored with `saveRDS` by default in file named `xxx.rds`
##' where `xxx` is the label for the current chunk.  To control this use instead
##' `file=xxx.rds` add the file argument to `runifChanged(...)`.  If nothing has
##' changed and the file already exists, the file is read to create the result
##' object (e.g., `w` above).  If `f()` needs to be run, the hashed input objects
##' are stored as attributes for the result then the enhanced result is written to the file.
##'
##' See [here](https://hbiostat.org/rflow/caching.html) for examples.
##' 
##' @title runifChanged
##' @param fun the (usually slow) function to run
##' @param ... input objects the result of running the function is dependent on
##' @param file file in which to store the result of `fun` augmented by attributes containing hash digests
##' @param .print. set to `TRUE` to list which objects changed that neessitated re-running `f`
##' @param .inclfun. set to `FALSE` to not include `fun` in the hash digest, i.e., to not require re-running `fun` if only `fun` itself has changed 
##' @return the result of running `fun`
##' @author Frank Harrell
##' @md
runifChanged <- function(fun, ..., file=NULL, .print.=TRUE, .inclfun.=TRUE) {
  if(! length(file)) {
    file <- knitr::opts_current$get('label')
    if(! length(file))
      stop('attempt to run runifChanged without file= outside a knitr chunk')
    file <- paste0(file, '.rds')
  }
  w <- list(...)
  .names. <- (as.character(sys.call())[-1])[1 : (length(w) + 1)]

  hashobj <- if(! .inclfun.) hashCheck(..., file=file,
                                       .print.=.print., .names.=.names.[-1])
             else {
               w <- c(list(fun), w)
               w$file    <- file
               w$.print. <- .print.
               w$.names. <- .names.
               do.call(hashCheck, w)
               }

  hash    <- hashobj$hash
  result  <- hashobj$result
  if(! length(result)) {
    result <- fun()
    attr(result, 'hash') <- hash
    saveRDS(result, file, compress='xz')
  }
  result
}