File: simMarkovOrd.r

package info (click to toggle)
hmisc 5.2-4-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,044 kB
  • sloc: asm: 28,905; f90: 590; ansic: 415; xml: 160; fortran: 75; makefile: 2
file content (705 lines) | stat: -rw-r--r-- 39,265 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
##' Simulate Ordinal Markov Process
##'
##' Simulates longitudinal data for subjects following a first-order Markov process under a proportional odds model.  Optionally, response-dependent sampling can be done, e.g., if a subject hits a specified state at time t, measurements are removed for times t+1, t+3, t+5, ...  This is applicable when for example a study of hospitalized patients samples every day, Y=1 denotes patient discharge to home, and sampling is less frequent outside the hospital.  This example assumes that arriving home is not an absorbing state, i.e., a patient could return to the hospital.
##' @title simMarkovOrd
##' @param n number of subjects to simulate
##' @param y vector of possible y values in order (numeric, character, factor)
##' @param times vector of measurement times
##' @param initial initial value of `y` (baseline state; numeric, character, or factor matching `y`).  If length 1 this value is used for all subjects, otherwise it is a vector of length `n`.
##' @param X an optional vector of matrix of baseline covariate values passed to `g`.  If a vector, `X` represents a set of single values for all the covariates and those values are used for every subject.  Otherwise `X` is a matrix with rows corresponding to subjects and columns corresponding to covariates which `g` must know how to handle.  `g` only sees one row of `X` at a time.
##' @param absorb vector of absorbing states, a subset of `y` (numeric, character, or factor matching `y`).  The default is no absorbing states.  Observations are truncated when an absorbing state is simulated.
##' @param intercepts vector of intercepts in the proportional odds model.  There must be one fewer of these than the length of `y`.
##' @param g a user-specified function of three or more arguments which in order are `yprev` - the value of `y` at the previous time, the current time `t`, the `gap` between the previous time and the current time, an optional (usually named) covariate vector `X`, and optional arguments such as a regression coefficient value to simulate from.  The function needs to allow `yprev` to be a vector and `yprev` must not include any absorbing states.  The `g` function returns the linear predictor for the proportional odds model aside from `intercepts`.  The returned value must be a matrix with row names taken from `yprev`.  If the model is a proportional odds model, the returned value must be one column.  If it is a partial proportional odds model, the value must have one column for each distinct value of the response variable Y after the first one, with the levels of Y used as optional column names.  So columns correspond to `intercepts`. The different columns are used for `y`-specific contributions to the linear predictor (aside from `intercepts`) for a partial or constrained partial proportional odds model.  Parameters for partial proportional odds effects may be included in the ... arguments.
##' @param carry set to `TRUE` to carry absorbing state forward after it is first hit; the default is to end records for the subject once the absorbing state is hit
##' @param rdsample an optional function to do response-dependent sampling.  It is a function of these arguments, which are vectors that stop at any absorbing state: `times` (ascending measurement times for one subject), `y` (vector of ordinal outcomes at these times for one subject.  The function returns `NULL` if no observations are to be dropped, returns the vector of new times to sample.
##' @param ... additional arguments to pass to `g` such as a regresson coefficient
##' @return data frame with one row per subject per time, and columns id, time, gap, yprev, y
##' @author Frank Harrell
##' @seealso <https://hbiostat.org/R/Hmisc/markov/>
##' @export
##' @md
simMarkovOrd <- function(n=1, y, times, initial, X=NULL, absorb=NULL,
                         intercepts, g, carry=FALSE, rdsample=NULL, ...) {

  if(is.factor(y))       y       <- as.character(y)
  if(is.factor(initial)) initial <- as.character(initial)
  if(is.factor(absorb))  absorb  <- as.character(absorb)
  ychar <- is.character(y)

  if(length(initial) == 1) initial <- rep(initial, n)
  if(length(initial) != n) stop('initial must have length 1 or n')
  if(any(initial %in% absorb))
    stop('initial state cannot be an absorbing state')

  Xmat <- length(X) && is.matrix(X)
  if(Xmat && ! length(colnames(X)))
    stop('when a matrix, X must have column names')
  if(length(X) && ! Xmat && ! length(names(X)))
    stop('when a vector, elements of X must have names')
  
  nt     <- length(times)
  Y      <- Yp <- if(ychar) character(nt) else numeric(nt)
  gaps   <- numeric(nt)
  ID     <- Time <- Gap <- numeric(n * nt)
  YYprev <- YY <- if(ychar) character(n * nt) else numeric(n * nt)
  is     <- 1
  times.saved <- 0
  
  for(id in 1 : n) {
    tprev <- 0
    i     <- 0
    for(t in times) {
      i       <- i + 1
      gap     <- t - tprev
      gaps[i] <- gap
      yprev   <- if(i == 1) initial[id] else Y[i - 1]
      Yp[i]   <- yprev
      if(carry && (yprev %in% absorb)) Y[i] <- yprev
      else {
        xb    <- g(yprev, t, gap, X=if(Xmat) X[id, ] else X, ...)
        ## If partial PO model xb has 1 row (since yprev is scalar) and
        ## columns corresponding to intercepts.  If PO, is 1x1
        probs <- plogis(intercepts + xb)
        ## Compute cell probabilities from successive differences in
        ## exceedance probs
        probs <- c(1., probs) - c(probs, 0.)
        lo <- probs < 0.
        hi <- probs > 1.
        ## The following is needed for partial proportional odds models
        if(any(c(lo , hi))) {
          warning(paste('Probabilities < 0 or > 1 at time t=', t,
                        'id=', id,
                        ':', paste(probs[c(lo, hi)], collapse=' '),
                        'set to 0 or 1'))
          if(any(lo)) probs[lo] <- 0.
          if(any(hi)) probs[hi] <- 1.
          }
        Y[i]  <- sample(y, 1, prob=probs)
        if(! carry && (Y[i] %in% absorb)) break
      }
      tprev <- t
    }
    s     <- 1 : i
    atimes <- times[s]
    agaps  <- gaps[s]
    aYp    <- Yp[s]
    aY     <- Y[s]

    if(length(rdsample)) {
      stimes <- rdsample(atimes, aY)
      lt     <- length(stimes)
      if(lt) {
        times.saved <- times.saved + i - lt
        tsprev <- c(0, stimes[- lt])
        agaps   <- stimes - tsprev
        aY      <- aY[times %in% stimes]
        if(length(aY) != lt) stop('program logic error in simMarkovOrd')
        aYp     <- c(aYp[1], aY[- lt])
        atimes  <- stimes
      }
    }

    ie              <- is + length(aY) - 1
    ID    [is : ie] <- id
    Time  [is : ie] <- atimes
    Gap   [is : ie] <- agaps
    YYprev[is : ie] <- aYp
    YY    [is : ie] <- aY
    is              <- ie + 1
  }

  yy <- YY[1 : ie]
  if(ychar) yy <- factor(yy, levels=y)
  yyp <- YYprev[1 : ie]
  if(ychar) yyp <- factor(yyp, levels=setdiff(y, absorb))

  ## thanks: MoserGitHub (GH issue #199)
  ## prior to v5.2-4, ... passed to data.frame when primary intention
  ## was to pass ... to `g`; this could result in errors
  res <- data.frame(id=ID[1 : ie], time=Time[1 : ie], gap=Gap[1 : ie],
                    yprev=yyp, y=yy)
  attr(res, 'times.saved.per.subject') <- times.saved / n
    
  ## Handle case where X is a constant vector to distribute to all obs
  if(length(X)) {
    if(Xmat) for(nam in colnames(X)) res[[nam]] <- X[res$id, nam]
    else
      for(nam in names(X)) res[[nam]] <- X[nam]
    }
  res
}    


#' State Occupancy Probabilities for First-Order Markov Ordinal Model
#'
#' @title soprobMarkovOrd
#' @inheritParams simMarkovOrd
#' @param y a vector of possible y values in order (numeric, character, factor)
#' @param times vector of measurement times
#' @param initial initial value of `y` (baseline state; numeric, character, factr)
#' @param absorb vector of absorbing states, a subset of `y`.  The default is no absorbing states. (numeric, character, factor)
#' @param intercepts vector of intercepts in the proportional odds model, with length one less than the length of `y`
#' @param ... additional arguments to pass to `g` such as covariate settings
#'
#' @return matrix with rows corresponding to times and columns corresponding to states, with values equal to exact state occupancy probabilities
#' @export
#' @author Frank Harrell
#' @seealso <https://hbiostat.org/R/Hmisc/markov/>
#' @export 
#' @md
soprobMarkovOrd <- function(y, times, initial, absorb=NULL,
                            intercepts, g, ...) {

  if(initial %in% absorb) stop('initial state cannot be an absorbing state')
  k  <- length(y)
  nt <- length(times)
  P  <- matrix(NA, nrow=nt, ncol=k)
  colnames(P) <- as.character(y)
  rownames(P) <- as.character(times)
  yna         <- setdiff(y, absorb)   # all states except absorbing ones
  yc          <- as.character(y)
  ynac        <- as.character(yna)

  ## Don't uncondition on initial state
  xb <- g(initial, times[1], times[1], ...)  # 3rd arg (gap) assumes time origin 0
  ## Since initial is scalar, xb has one row.  It has multiple columns if
  ## model is partial PO model, with columns exactly corresponding to intercepts
  pp <- plogis(intercepts + xb)
  ## Compute cell probabilities
  pp <- c(1., pp) - c(pp, 0.)
  P[1, ] <- pp

  tprev <- times[1]
  for(it in 2 : nt) {
    t <- times[it]
    gap <- t - tprev
    ## Compute linear predictor at all non-absorbing states
    xb <- g(yna, t, gap, ...)   #non-intercept part of x * beta
    ## g puts non-absorbing states as row names (= ynac)
    ## If partial PO model xb has > 1 column that correspond to intercepts

    ## Matrix of conditional probabilities of Y conditioning on previous Y
    ## Columns = k conditional probabilities conditional on a single previous state
    ## Rows    = all possible previous states
    ## When the row corresponds to an absorbing state, with prob. 1
    ## a subject will remain in that state so give it a prob of 1 and
    ## all other states a prob of 0

    cp <- matrix(NA, nrow=k, ncol=k, dimnames=list(yc, yc))
    for(yval in y) {   # current row
      yvalc <- as.character(yval)
      if(yval %in% absorb) {   # current row is an absorbing state
        cp[yvalc, setdiff(yc, yvalc)]    <- 0.  # P(moving to non-abs state)=0
        cp[yvalc, yvalc]                 <- 1.  # certainty in staying
      }
      else {  # current row is non-absorbing state
        pp <- plogis(intercepts + xb[yvalc, ])
        ## Compute cell probabilities
        pp <- c(1., pp) - c(pp, 0.)
        cp[yvalc, ] <- pp
      }
    }
    P[it, ] <- t(cp) %*% P[it - 1, ]
    tprev <- t
  }
  P
}


##' Simulate Comparisons For Use in Sequential Markov Longitudinal Clinical Trial Simulations
##'
##' Simulates sequential clinical trials of longitudinal ordinal outcomes using a first-order Markov model.  Looks are done sequentially after subject ID numbers given in the vector `looks` with the earliest possible look being after subject 2.  At each look, a subject's repeated records are either all used or all ignored depending on the sequent ID number.  For each true effect parameter value, simulation, and at each look, runs a function to compute the estimate of the parameter of interest along with its variance.  For each simulation, data are first simulated for the last look, and these data are sequentially revealed for earlier looks.  The user provides a function `g` that has extra arguments specifying the true effect of `parameter` the treatment `group` expecting treatments to be coded 1 and 2.  `parameter` is usually on the scale of a regression coefficient, e.g., a log odds ratio.  Fitting is done using the `rms::lrm()` function, unless non-proportional odds is allowed in which case `VGAM::vglm()` is used.  If `timecriterion` is specified, the function also, for the last data look only, computes the first time at which the criterion is satisfied for the subject or use the event time and event/censoring indicator computed by `timecriterion`.  The Cox/logrank chi-square statistic for comparing groups on the derived time variable is saved.  If `coxzph=TRUE`, the `survival` package correlation coefficient `rho` from the scaled partial residuals is also saved so that the user can later determine to what extent the Markov model resulted in the proportional hazards assumption being violated when analyzing on the time scale.  `vglm` is accelerated by saving the first successful fit for the largest sample size and using its coefficients as starting value for further `vglm` fits for any sample size for the same setting of `parameter`.
##' @title estSeqMarkovOrd
##' @inheritParams simMarkovOrd
##' @param y vector of possible y values in order (numeric, character, factor)
##' @param times vector of measurement times
##' @param initial a vector of probabilities summing to 1.0 that specifies the frequency distribution of initial values to be sampled from.  The vector must have names that correspond to values of `y` representing non-absorbing states.
##' @param absorb vector of absorbing states, a subset of `y`.  The default is no absorbing states.  Observations are truncated when an absorbing state is simulated.  May be numeric, character, or factor.
##' @param intercepts vector of intercepts in the proportional odds model.  There must be one fewer of these than the length of `y`.
##' @param parameter vector of true parameter (effects; group differences) values.  These are group 2:1 log odds ratios in the transition model, conditioning on the previous `y`.
##' @param looks integer vector of ID numbers at which maximum likelihood estimates and their estimated variances are computed.  For a single look specify a scalar value for `loops` equal to the number of subjects in the sample.
##' @param formula a formula object given to the `lrm()` function using variables with these name: `y`, `time`, `yprev`, and `group` (factor variable having values '1' and '2').  The `yprev` variable is converted to a factor before fitting the model unless `yprevfactor=FALSE`.
##' @param ppo a formula specifying the part of `formula` for which proportional odds is not to be assumed, i.e., that specifies a partial proportional odds model.  Specifying `ppo` triggers the use of `VGAM::vglm()` instead of `rms::lrm` and will make the simulations run slower.
##' @param yprevfactor see `formula`
##' @param groupContrast omit this argument if `group` has only one regression coefficient in `formula`.  Otherwise if `ppo` is omitted, provide `groupContrast` as a list of two lists that are passed to `rms::contrast.rms()` to compute the contrast of interest and its standard error.  The first list corresponds to group 1, the second to group 2, to get a 2:1 contrast.  If `ppo` is given and the group effect is not just a simple regression coefficient, specify as `groupContrast` a function of a `vglm` fit that computes the contrast of interest and its standard error and returns a list with elements named `Contrast` and `SE`.  For the latter type you can optionally have formal arguments `n1`, `n2`, and `parameter` that are passed to `groupContrast` to compute the standard error of the group contrast, where `n1` and `n2` respectively are the sample sizes for the two groups and `parameter` is the true group effect parameter value.
##' @param cscov applies if `ppo` is not used.  Set to `TRUE` to use the cluster sandwich covariance estimator of the variance of the group comparison.
##' @param timecriterion a function of a time-ordered vector of simulated ordinal responses `y` that returns a vector `FALSE` or `TRUE` values denoting whether the current `y` level met the condition of interest.  For example `estSeqMarkovOrd` will compute the first time at which `y >= 5` if you specify `timecriterion=function(y) y >= 5`.  This function is only called at the last data look for each simulated study.  To have more control, instead of `timecriterion` returning a logical vector have it return a numeric 2-vector containing, in order, the event/censoring time and the 1/0 event/censoring indicator.
##' @param sstat set to a function of the time vector and the corresponding vector of ordinal responses for a single group if you want to compute a Wilcoxon test on a derived quantity such as the number of days in a given state.  
##' @param coxzph set to `TRUE` if `timecriterion` is specified and you want to compute a statistic for testing proportional hazards at the last look of each simulated data
##' @param nsim number of simulations (default is 1)
##' @param maxest maximum acceptable absolute value of the contrast estimate, ignored if `NULL`.  Any values exceeding `maxest` will result in the estimate being set to `NA`.
##' @param maxvest like `maxest` but for the estimated variance of the contrast estimate
##' @param progress set to `TRUE` to send current iteration number to `pfile` every 10 iterations.  Each iteration will really involve multiple simulations, if `parameter` has length greater than 1.
##' @param pfile file to which to write progress information.  Defaults to `''` which is the console.  Ignored if `progress=FALSE`.
##' @return a data frame with number of rows equal to the product of `nsim`, the length of `looks`, and the length of `parameter`, with variables `sim`, `parameter`, `look`, `est` (log odds ratio for group), and `vest` (the variance of the latter).  If `timecriterion` is specified the data frame also contains `loghr` (Cox log hazard ratio for group), `lrchisq` (chi-square from Cox test for group), and if `coxph=TRUE`, `phchisq`, the chi-square for testing proportional hazards.  The attribute `etimefreq` is also present if `timecriterion` is present, and it probvides the frequency distribution of derived event times by group and censoring/event indicator.  If `sstat` is given, the attribute `sstat` is also present, and it contains an array with dimensions corresponding to simulations, parameter values within simulations, `id`, and a two-column subarray with columns `group` and `y`, the latter being the summary measure computed by the `sstat` function.  The returned data frame also has attribute `lrmcoef` which are the last-look logistic regression coefficient estimates over the `nsim` simulations and the parameter settings, and an attribute `failures` which is a data frame containing the variables `reason` and `frequency` cataloging the reasons for unsuccessful model fits.
##' @author Frank Harrell
##' @seealso `gbayesSeqSim()`, `simMarkovOrd()`, <https://hbiostat.org/R/Hmisc/markov/>
##' @export
##' @md

estSeqMarkovOrd <- function(y, times, initial, absorb=NULL, intercepts,
                            parameter, looks, g, formula, ppo=NULL,
                            yprevfactor=TRUE,
                            groupContrast=NULL, cscov=FALSE,
                            timecriterion=NULL, coxzph=FALSE,
                            sstat=NULL, rdsample=NULL,
                            maxest=NULL, maxvest=NULL,
                            nsim=1, progress=FALSE, pfile='') {

  olddd <- getOption('datadist')
  on.exit(options(datadist=olddd))

  isppo <- length(ppo) > 0
  if(isppo) {
    if(! inherits(ppo, 'formula')) stop('ppo must be a formula')
    if(! requireNamespace('VGAM'))
      stop('ppo specified and VGAM package not available')
    # VGAM wants you to declare FALSE to indicate non-PO
    vglm <- VGAM::vglm
    ppo  <- formula(paste('FALSE ~', as.character(ppo)[-1]))
  } else if (!requireNamespace("rms", quietly = TRUE))
    stop('ppo not specified and rms package not available')
  

  if(isppo && cscov) stop('may not specify cscov=TRUE with ppo')
  
  nas <- setdiff(y, absorb)    # non-absorbing states
  if(length(initial) != length(nas))
    stop('length of initial must be number of non-absorbing values of y')
  if(! all(sort(names(initial)) == sort(as.character(nas))))
    stop('names of elements in initial are incorrect')
  if(coxzph && ! length(timecriterion))
    stop('must specify timecriterion when coxzph=TRUE')
  
  looks   <- sort(looks)
  nlook   <- length(looks)
  N       <- max(looks)
  np      <- length(parameter)
  nc      <- nsim * nlook * np
  parm    <- est <- vest <- numeric(nc)
  look    <- sim <- integer(nc)
  ndy     <- length(y)
  
  Etimefreq <- NULL
  if(length(timecriterion)) {
    Etimefreq <-
      array(0, dim=c(nsim, np, 2, 2, length(times)),
            dimnames=list(paste('sim', 1 : nsim),
                          as.character(parameter),
                          c('1', '2'),
                          c('censored', 'event'),
                          as.character(times)))
    loghr <-   lrchisq <- rep(NA, nc) 
    if(coxzph) phchisq <- rep(NA, nc)
  }
  if(length(sstat))
    Sstat <- array(0L, dim=c(nsim, np, N, 2),
                   dimnames=list(paste('sim', 1 : nsim),
                                 as.character(parameter),
                                 paste('id', 1 : N),
                                 c('group', 'y')))

  groupContrastUsesN <-
    length(groupContrast) &&
       all(c('n1', 'n2', 'parameter') %in% names(formals(groupContrast)))

  ## For each simulation and each parameter value, simulate data for the
  ## whole study

  is    <- 0
  pname <- if(isppo) 'group2' else 'group=2'
  h <- function(time, y) {
    u <- timecriterion(y)
    if(! is.logical(u))
      return(list(etime=as.numeric(u[1]), event=as.integer(u[2])))
    # Note that if there are any absorbing events, the time vector
    # would already have been truncated at the first of such events
    if(any(u)) list(etime=as.numeric(min(time[u])), event=1L)
    else
               list(etime=as.numeric(max(time)),   event=0L)
  }

  lrmcoef  <- NULL
  co.na    <- NULL   # template of coef vector to be all NAs
  coefprev <- list() # to hold first working fit at last look for each parameter
  ## coefprev speeds up vglm (last look = maximum sample size)
  failures <- character(0)
  
  for(isim in 1 : nsim) {
    if(progress && (isim %% 10 == 0))
      cat('Simulation', isim, '\n', file=pfile)
    for(param in parameter) {
      cparam <- as.character(param)
      ## Sample N initial states
      initials <- sample(names(initial), N, replace=TRUE, prob=initial)
      if(is.numeric(y)) initials <- as.numeric(initials)
      ## Sample treatment groups 1 and 2
      X   <- matrix(sample(1 : 2, N, replace=TRUE), ncol=1,
                    dimnames=list(NULL, 'group'))
      ## For simMarkovOrd X must be a matrix if it varies
      sdata <- simMarkovOrd(n=N, y, times, initials, X=X, absorb=absorb,
                            intercepts=intercepts, g=g, parameter=param,
                            rdsample=rdsample)
      
      tsps <- attr(sdata, 'time.saved.per.subject')
      if(length(tsps))
        cat('Average number of measurement times saved per subject by response-dependent sampling:', round(tsps, 1), '\n')
      ## sdata is a data frame containing id, time, yprev, y, ...
      sdata$group <- as.factor(sdata$group)
      if(yprevfactor) sdata$yprev <- as.factor(sdata$yprev)
      if(isim == 1 && ! isppo) {
        .dd. <- rms::datadist(sdata)
        options(datadist=.dd.)   # requires rms 6.1-1
      }
      
      ## For each look compute the parameter estimate and its variance
      ## If a contrast is specified (say when treatment interacts with time)
      ## use that instead of a simple treatment effect
      ## For vglm speed up by taking as starting values the estimates
      ## from the last successful run

      for(l in looks) {
        ## Subjects are numbered consecutively with id=1,2,3,... and
        ## these correspond to sequential data looks when accumulated
        dat <- subset(sdata, id <= l)
        luy <- length(unique(dat$y))
        if(luy != ndy) {
          f <- paste('Simulated data for simulation with sample size', l,
                     'has', luy, 'distinct y values instead of the required',
                     ndy)
          fail <- TRUE
          } else {
            if(isppo) {
              cprev <- coefprev[[cparam]]
              ## Could not get system to find cprev when regular call inside try()
              ff <- call('vglm', formula,
                         VGAM::cumulative(parallel=ppo, reverse=TRUE),
                         coefstart=cprev, data=dat)
              f <- try(eval(ff), silent=TRUE)
            } else
              f <- try(rms::lrm(formula, data=dat, x=cscov, y=cscov), silent=TRUE)
            fail <- inherits(f, 'try-error')
          }
        
        if(fail) failures <- c(failures, as.character(f))
        else {
          if(isppo && l == max(looks) && ! length(coefprev[[cparam]]))
            coefprev[[cparam]] <- coef(f) 
          if(! length(co.na)) {   # save template to insert for failures
            co.na <- coef(f)
            co.na[] <- NA
          }
        }
        if(cscov && ! fail) f <- rms::robcov(f, dat$id)
        is         <- is + 1
        sim [is]   <- isim
        parm[is]   <- param
        look[is]   <- l
        if(fail) {
          est [is] <- NA
          vest[is] <- NA
        } else {
          if(length(groupContrast)) {
            fc <- if(isppo) (if(groupContrastUsesN)
                               groupContrast(f, n1=sum(dat$group == '1'),
                                                n2=sum(dat$group == '2'),
                                                parameter=param)
                               else
                                 groupContrast(f))
                  else
                    rms::contrast(f, groupContrast[[2]], groupContrast[[1]])
            est [is] <- fc$Contrast
            vest[is] <- (fc$SE) ^ 2
          }
          else {
            est [is]   <- coef(f)[pname]
            vest[is]   <- vcov(f)[pname, pname]
          }
          if(length(maxest) && abs(est[is]) > maxest) {
            failures <- c(failures, paste0('|contrast|>', maxest))
            est[is] <- vest[is] <- NA
            fail <- TRUE
          } else if(length(maxvest) && vest[is] > maxvest) {
            failures <- c(failures, paste0('variance>', maxvest))
            est[is] <- vest[is] <- NA
            fail <- TRUE
          }
        }    # end else if not fail
      }  # end looks
      co <- if(fail) co.na else coef(f)
      if(! length(lrmcoef))
        lrmcoef <- array(0., dim=c(length(parameter), nsim, length(co)),
                         dimnames=list(as.character(parameter),
                                       paste('sim', 1 : nsim),
                                       names(co)))

      ww <- try(lrmcoef[as.character(param), isim, ] <- co)
      if(inherits(ww, 'try-error')) {
        wf <- 'estSeqMarkovOrd.err'
        prn(dimnames(lrmcoef), file=wf)
        prn(as.character(param), file=wf)
        prn(isim, file=wf)
        prn(co, file=wf)
        stop('non-conformable coefficients in estSeqMarkovOrd.  See file estSeqMarkovOrd.err in current working directory.')
        }
      
      if(length(timecriterion)) {
        # Separately for each subject compute the time until the
        # criterion is satisfied.  Right censor at last observed time if it
        # doesn't occur
        setDT(sdata, key=c('group', 'id', 'time'))
        d <- sdata[, h(time, y), by=.(group, id)]
        
        fit <- survival::coxph(Surv(etime, event) ~ group, data=d)
        loghr  [is] <- fit$coef
        lrchisq[is] <- 2. * diff(fit$loglik)
        if(coxzph)
          phchisq[is] <- survival::cox.zph(fit, transform='identity',
                                           global=FALSE)$table[, 'chisq']

        for(gr in c('1', '2')) {
          for(ev in 0 : 1) {
            utimes <- with(subset(d, group == gr & event == ev),
                           as.character(etime))
            utimes <- factor(utimes, as.character(times))
            tab    <- table(utimes)
            Etimefreq[isim, as.character(param), gr, ev + 1, ] <-
              Etimefreq[isim, as.character(param), gr, ev + 1, ] + tab
          } # end censored vs event
        } # end group
      } # end timecriterion
      if(length(sstat)) {
        ## Separately for each subject compute the summary statistic
        sds <- sdata[, ys := sstat(time, y), by=.(group, id)]
        Sstat[isim, as.character(param), sds$id, ] <-
          cbind(sds$group, sds$ys)
      }  # end sstat
    }  # end param
  } # end sim
  
  res <- data.frame(sim=sim, parameter=parm, look=look,
                    est=est, vest=vest)
  if(length(timecriterion)) {
    res$loghr   <- loghr
    res$lrchisq <- lrchisq
    if(coxzph) res$phchisq <- phchisq
    attr(res, 'etimefreq') <- Etimefreq
  }
  if(length(sstat)) attr(res, 'sstat') <- Sstat
  attr(res, 'lrmcoef') <- lrmcoef
  failures <- if(length(failures))
                as.data.frame(table(failure=failures))
              else
                data.frame(failure='', Freq=0)
  attr(res, 'failures') <- failures
  res
}

#' Compute Parameters for Proportional Odds Markov Model
#'
#' Given a vector `intercepts` of initial guesses at the intercepts in a Markov proportional odds model, and a vector `extra` if there are other parameters, solves for the `intercepts` and `extra` vectors that yields a set of occupancy probabilities at time `t` that equal, as closely as possible, a vector of target values.
#' @title intMarkovOrd
#' @inheritParams simMarkovOrd
#' @param intercepts vector of initial guesses for the intercepts
#' @param extra an optional vector of intial guesses for other parameters passed to `g` such as regression coefficients for previous states and for general time trends.  Name the elements of `extra` for more informative output.
#' @param target vector of target state occupancy probabilities at time `t`.  If `extra` is specified, `target` must be a matrix where row names are character versions of `t` and columns represent occupancy probabilities corresponding to values of `y` at the time given in the row.
#' @param t target times.  Can have more than one element only if `extra` is given.
#' @param ftarget an optional function defining constraints that relate to transition probabilities.  The function returns a penalty which is a sum of absolute differences in probabilities from target probabilities over possibly multiple targets.  The `ftarget` function must have two arguments: `intercepts` and `extra`.
#' @param onlycrit set to `TRUE` to only return the achieved objective criterion and not print anything
#' @param constraints a function of two arguments: the vector of current intercept values and the vector of `extra` parameters, returning `TRUE` if that vector meets the constrains and `FALSE` otherwise
#' @param printsop set to `TRUE` to print solved-for state occupancy probabilities for groups 1 and 2 and log odds ratios corresponding to them
#' @param ... optional arguments to pass to [stats::nlm()].  If this is specified, the arguments that `intMarkovOrd` normally sends to `nlm` are not used.
#'
#' @return list containing two vectors named `intercepts` and `extra` unless `oncrit=TRUE` in which case the best achieved sum of absolute errors is returned
#' @author Frank Harrell
#' @export
#' @md
#' @seealso <https://hbiostat.org/R/Hmisc/markov/>
intMarkovOrd <- function(y, times, initial, absorb=NULL,
                         intercepts, extra=NULL, g, target, t, ftarget=NULL,
                         onlycrit=FALSE, constraints=NULL,
                         printsop=FALSE, ...) {

  if(any(diff(intercepts) > 0)) stop('initial intercepts are out of order')
  
  t <- as.character(t)
  if(length(t) > 1 && (! is.matrix(target) || nrow(target) != length(t)))
    stop('target must be a matrix with # rows = length of t')
  if(length(t) == 1) target <- matrix(target, nrow=1, dimnames=list(t, NULL))
  for(ti in t)
    if(abs(sum(target[ti, ]) - 1.) > 1e-5)
      stop('each row of target must sum to 1')
  
  h <- function(a) {
    ## Compute state occupancy probabilities at time t for current
    ## vector of intercept values and extra
    ints <- a[1 : (length(a) - length(extra))]
    if(any(diff(ints) > 0.)) return(1000.)
    if(length(extra)) extra <- a[-(1 : length(ints))]
    if(length(constraints) && ! constraints(ints, extra)) return(1000.)
    s <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb,
                         intercepts=ints, g=g, X=1, extra=extra)[t,, drop=FALSE ]
    # Objective function to minimize: sum of absolute differences with targets
    # with restriction that intercepts be in descending order
    crit <- 0.  # if(any(diff(ints) > 0.)) 1000. else 0.
    for(tim in rownames(s)) crit <- crit + sum(abs(s[tim, ] - target[tim, ]))
    if(length(ftarget)) crit <- crit + ftarget(intercepts=ints, extra=extra)
    crit
  }

  if(length(list(...)))
    u <- nlm(h, c(intercepts, extra), ...)
  else
    u <- nlm(h, c(intercepts, extra), iterlim=300)

  if(onlycrit) return(u$minimum)
  
  cat('\nIterations:', u$iterations, '\n')
  cat('Sum of absolute errors:', u$minimum, '\n')
  ints <- u$estimate[1 : (length(u$estimate) - length(extra))]
  if(length(extra)) extra <- structure(u$estimate[-(1 : length(ints))],
                                       names=names(extra))
  
  cat('Intercepts:', round(ints, 3), '\n')
  if(length(extra)) {
    cat('\nExtra parameters:\n\n')
    print(round(extra, 4))
    }
  s1 <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb,
                        intercepts=ints, g=g, X=1, extra=extra)
  if(printsop) {
    cat('\nOccupancy probabilities for group 1:\n\n')
    print(round(s1, 3))
    }
  # Show occupancy probabilities for group 2
  s2 <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb,
                        intercepts=ints, g=g, X=2, extra=extra)
  if(printsop) {
    cat('\nOccupancy probabilities for group 2:\n\n')
    print(round(s2, 3))
    }

  ## Compute log odds ratios at day t

  if(printsop) for(ti in t) {
    ## Get cumulative probabilities from right to left except for the first    
    s1t <- rev(cumsum(rev(s1[ti, -1])))
    s2t <- rev(cumsum(rev(s2[ti, -1])))
    lor <- round(qlogis(s2t) - qlogis(s1t), 3)
    cat('\nLog odds ratios at', paste0('t=', ti),
        'from occupancy probabilities:' , lor, '\n')
    }
  list(intercepts=ints, extra=extra)
}


#' State Occupancy Probabilities for First-Order Markov Ordinal Model from a Model Fit
#'
#' Computes state occupancy probabilities for a single setting of baseline covariates.  If the model fit was from `rms::blrm()`, these probabilities are from all the posterior draws of the basic model parameters.  Otherwise they are maximum likelihood point estimates.
#'
#' @title soprobMarkovOrdm
#' @param object a fit object created by `blrm`, `lrm`, `orm`, `VGAM::vglm()`, or `VGAM::vgam()`
#' @param data a single observation list or data frame with covariate settings, including the initial state for Y
#' @param times vector of measurement times
#' @param ylevels a vector of ordered levels of the outcome variable (numeric or character)
#' @param absorb vector of absorbing states, a subset of `ylevels`.  The default is no absorbing states. (numeric, character, factor)
#' @param tvarname name of time variable, defaulting to `time`
#' @param pvarname name of previous state variable, defaulting to `yprev`
#' @param gap name of time gap variable, defaults assuming that gap time is not in the model
#'
#' @return if `object` was not a Bayesian model, a matrix with rows corresponding to times and columns corresponding to states, with values equal to exact state occupancy probabilities.  If `object` was created by `blrm`, the result is a 3-dimensional array with the posterior draws as the first dimension.
#' @export
#' @author Frank Harrell
#' @seealso <https://hbiostat.org/R/Hmisc/markov/>
#' @md
soprobMarkovOrdm <- function(object, data, times, ylevels, absorb=NULL,
                             tvarname='time', pvarname='yprev',
                             gap=NULL) {

  cl <- class(object)[1]
  ftypes <- c(lrm='rms', orm='rms', blrm='rmsb', vglm='vgam', vgam='vgam')
  ftype  <- ftypes[cl]
  if(is.na(ftype)) stop(paste('object must be a fit from one of:',
                              paste(ftypes, collapse=' ')))

  ## For VGAM objects, predict() did not find the right function when
  ## inside the rms package
  
  prd <-
    switch(ftype,
           rms =function(obj, data) predict(obj, data, type='fitted.ind'),
           vgam=function(obj, data) VGAM::predict(obj, data, type='response'),
           rmsb=function(obj, data) predict(obj, data, type='fitted.ind',
                                            posterior.summary='all'))
  
  if(pvarname %nin% names(data))
    stop(paste(pvarname, 'is not in data'))
  if(length(absorb) && (pvarname %in% absorb))
    stop('initial state cannot be an absorbing state')
  
  nd <- if(ftype == 'rmsb' && length(object$draws)) nrow(object$draws) else 0
  if((nd == 0) != (ftype != 'rmsb'))
    stop('model fit inconsistent with having posterior draws')
  
  k  <- length(ylevels)
  s  <- length(times)
  P  <- if(nd == 0)
          array(NA, c(s, k),
  						dimnames=list(as.character(times), 
  													as.character(ylevels))) else
          array(NA, c(nd, s, k),
  						dimnames=list(paste('draw', 1 : nd), as.character(times), 
  													as.character(ylevels)))
  # Never uncondition on initial state
  data[[tvarname]] <- times[1]
  if(length(gap)) data[[gap]] <- times[1]
  data <- as.data.frame(data)
  p <- prd(object, data)
  if(nd == 0) P[1, ] <- p else P[, 1, ] <- p

  # cp: matrix of conditional probabilities of Y conditioning on previous time Y
  # Columns = k conditional probabilities conditional on a single previous state
  # Rows    = all possible previous states
  # This is for a single posterior draw (or for a frequentist fit)
  rnameprev   <- paste('t-1', ylevels)
  rnameprevna <- paste('t-1', setdiff(ylevels, absorb))
  if(length(absorb)) {
    rnamepreva <- paste('t-1', absorb)
    cnamea     <- paste('t',   absorb)
    }
  cp <- matrix(0., nrow=k, ncol=k, 
               dimnames=list(rnameprev, paste('t', ylevels)))
  ## cp is initialized to zero, which will remain the values for
  ## probabilities of moving out of absorbing (row) states
  ## Set probabilities of staying in absorbing states to 1
  if(length(absorb)) cp[cbind(rnamepreva, cnamea)] <- 1.
  
  data <- as.list(data)
  yna  <- setdiff(ylevels, absorb)  # non-absorbing states
  data[[pvarname]] <- yna   # don't request estimates for absorbing states
  edata <- expand.grid(data)
  for(it in 2 : s) {
    edata[[tvarname]] <- times[it]
    if(length(gap)) edata[[gap]] <- times[it] - times[it - 1]
    pp <- prd(object, edata)
    if(nd == 0) {
      ## If there are absorbing states, make a bigger version of
      ## the cell probability matrix that includes them
      ## Rows representing absorbing states have P(stating in that state)=1
      cp[rnameprevna, ] <- pp
      
      ## Compute unconditional probabilities of being in all possible states
      ## at current time t
      P[it, ] <- t(cp) %*% P[it - 1, ]
    }
    else {
      for(idraw in 1 : nd) {
        cp[rnameprevna, ] <- pp[idraw, ,]
        P[idraw, it, ] <- t(cp) %*% P[idraw, it - 1, ]
      }
    }
  }
  P
}

utils::globalVariables(c('id', 'group', 'event', ':=', 'ys'))