File: ordMoments.R

package info (click to toggle)
r-cran-semtools 0.5.7-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,204 kB
  • sloc: makefile: 2
file content (353 lines) | stat: -rw-r--r-- 14,765 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
### Terrence D. Jorgensen & Andrew R. Johnson
### Last updated: 20 January 2025
### function to derive ordinal-scale moments implied by LRV-scale moments


##' Calculate Population Moments for Ordinal Data Treated as Numeric
##'
##' This function calculates ordinal-scale moments implied by LRV-scale moments
##'
##' Binary and ordinal data are frequently accommodated in SEM by incorporating
##' a threshold model that links each observed categorical response variable to
##' a corresponding latent response variable that is typically assumed to be
##' normally distributed (Kamata & Bauer, 2008; Wirth & Edwards, 2007).
##' This function can be useful for real-data analysis or for designing
##' Monte Carlo simulations, as described by Jorgensen and Johnson (2022).
##'
##' @importFrom stats dnorm setNames
##' @importFrom lavaan lavInspect
##' @importFrom pbivnorm pbivnorm
##'
##' @param Sigma Population covariance [matrix()], with variable names
##'   saved in the [dimnames()] attribute.
##' @param Mu Optional `numeric` vector of population means. If missing,
##'   all means will be set to zero.
##' @param thresholds Either a single `numeric` vector of population
##'   thresholds used to discretize each normally distributed variable, or a
##'   named `list` of each discretized variable's vector of thresholds.
##'   The discretized variables may be a subset of all variables in `Sigma`
##'   if the remaining variables are intended to be observed rather than latent
##'   normally distributed variables.
##' @param cWts Optional (default when missing is to use 0 for the lowest
##'   category, followed by successive integers for each higher category).
##'   Either a single `numeric` vector of category weights (if they are
##'   identical across all variables) or a named `list` of each
##'   discretized variable's vector of category weights.
##'
##' @return A `list` including the LRV-scale population moments (means,
##'   covariance matrix, correlation matrix, and thresholds), the category
##'   weights, a `data.frame` of implied univariate moments (means,
##'   *SD*s, skewness, and excess kurtosis (i.e., in excess of 3, which is
##'   the kurtosis of the normal distribution) for discretized data treated as
##'   `numeric`, and the implied covariance and correlation matrix of
##'   discretized data treated as `numeric`.
##'
##' @author
##'   Terrence D. Jorgensen (University of Amsterdam; \email{TJorgensen314@@gmail.com})
##'
##'   Andrew R. Johnson (Curtin University; \email{andrew.johnson@@curtin.edu.au})
##'
##' @references
##'
##' Jorgensen, T. D., & Johnson, A. R. (2022). How to derive expected values of
##'   structural equation model parameters when treating discrete data as
##'   continuous. *Structural Equation Modeling, 29*(4), 639--650.
##'   \doi{10.1080/10705511.2021.1988609}
##'
##' Kamata, A., & Bauer, D. J. (2008). A note on the relation between factor
##'   analytic and item response theory models.
##'   *Structural Equation Modeling, 15*(1), 136--153.
##'   \doi{10.1080/10705510701758406}
##'
##' Wirth, R. J., & Edwards, M. C. (2007). Item factor analysis: Current
##'   approaches and future directions. *Psychological Methods, 12*(1),
##'   58--79. \doi{10.1037/1082-989X.12.1.58}
##'
##' @examples
##'
##' ## SCENARIO 1: DIRECTLY SPECIFY POPULATION PARAMETERS
##'
##' ## specify population model in LISREL matrices
##' Nu <- rep(0, 4)
##' Alpha <- c(1, -0.5)
##' Lambda <- matrix(c(1, 1, 0, 0, 0, 0, 1, 1), nrow = 4, ncol = 2,
##'                  dimnames = list(paste0("y", 1:4), paste0("eta", 1:2)))
##' Psi <- diag(c(1, .75))
##' Theta <- diag(4)
##' Beta <- matrix(c(0, .5, 0, 0), nrow = 2, ncol = 2)
##'
##' ## calculate model-implied population means and covariance matrix
##' ## of latent response variables (LRVs)
##' IB <- solve(diag(2) - Beta) # to save time and space
##' Mu_LRV <- Nu + Lambda %*% IB %*% Alpha
##' Sigma_LRV <- Lambda %*% IB %*% Psi %*% t(IB) %*% t(Lambda) + Theta
##'
##' ## Specify (unstandardized) thresholds to discretize normally distributed data
##' ## generated from Mu_LRV and Sigma_LRV, based on marginal probabilities
##' PiList <- list(y1 = c(.25, .5, .25),
##'                y2 = c(.17, .33, .33, .17),
##'                y3 = c(.1, .2, .4, .2, .1),
##'                ## make final variable highly asymmetric
##'                y4 = c(.33, .25, .17, .12, .08, .05))
##' sapply(PiList, sum) # all sum to 100%
##' CumProbs <- sapply(PiList, cumsum)
##' ## unstandardized thresholds
##' TauList <- mapply(qnorm, p = lapply(CumProbs, function(x) x[-length(x)]),
##'                   m = Mu_LRV, sd = sqrt(diag(Sigma_LRV)))
##' for (i in 1:4) names(TauList[[i]]) <- paste0(names(TauList)[i], "|t",
##'                                              1:length(TauList[[i]]))
##'
##' ## assign numeric weights to each category (optional, see default)
##' NumCodes <- list(y1 = c(-0.5, 0, 0.5), y2 = 0:3, y3 = 1:5, y4 = 1:6)
##'
##'
##' ## Calculate Population Moments for Numerically Coded Ordinal Variables
##' lrv2ord(Sigma = Sigma_LRV, Mu = Mu_LRV, thresholds = TauList, cWts = NumCodes)
##'
##'
##' ## SCENARIO 2: USE ESTIMATED PARAMETERS AS POPULATION
##'
##' data(datCat) # already stored as c("ordered","factor")
##' fit <- cfa(' f =~ 1*u1 + 1*u2 + 1*u3 + 1*u4 ', data = datCat)
##' lrv2ord(Sigma = fit, thresholds = fit) # use same fit for both
##' ## or use estimated thresholds with specified parameters, but note that
##' ## lrv2ord() will only extract standardized thresholds
##' dimnames(Sigma_LRV) <- list(paste0("u", 1:4), paste0("u", 1:4))
##' lrv2ord(Sigma = cov2cor(Sigma_LRV), thresholds = fit)
##'
##' @export
lrv2ord <- function(Sigma, Mu, thresholds, cWts) {
  if (inherits(Sigma, "lavaan")) {
    if (lavInspect(Sigma, "ngroups") > 1L || lavInspect(Sigma, "nlevels") > 1L) {
      stop('Sigma= only accepts single-group/level lavaan models')
    }
    fitSigma <- Sigma
    Sigma <- lavInspect(fitSigma, "cov.ov")
  } else stopifnot(is.matrix(Sigma))
  vn <- rownames(Sigma) # variable names
  SDs <- sqrt(diag(Sigma))

  if (missing(Mu)) {
    Mu <- rep(0, nrow(Sigma))
  } else if (inherits(Mu, "lavaan")) {
    if (lavInspect(Mu, "ngroups") > 1L || lavInspect(Mu, "nlevels") > 1L) {
      stop('Mu= only accepts single-group/level lavaan models')
    }
    fitMu <- Mu
    Mu <- lavInspect(fitMu, "mean.ov")
  }
  names(Mu) <- names(SDs) <- vn


  ## If a single vector of thresholds is passed, broadcast to a list
  if (inherits(thresholds, "lavaan")) {
    if (lavInspect(thresholds, "ngroups") > 1L || lavInspect(thresholds, "nlevels") > 1L) {
      stop('thresholds= only accepts single-group/level lavaan models')
    }
    ## check whether diag(Sigma) == 1
    isSTD <- sapply(SDs, function(x) {
      isTRUE(all.equal(x, current = 1, tolerance = .001))
    })
    if (!all(isSTD)) warning('standardized thresholds= extracted from a ',
                             'lavaan object, but Sigma= is not a ',
                             'correlation matrix.')
    fitThr <- thresholds
    thresholds <- lavInspect(fitThr, "th") # STANDARDIZED thresholds

    thresh <- lapply(unique(lavInspect(fitThr, "th.idx")), function(x) {
      thresholds[lavInspect(fitThr, "th.idx") == x]
    })
    names(thresh) <- sapply(thresh, function(x) {
      strsplit(names(x)[1], "|t", fixed = TRUE)[[1]][1]
    })

  } else if (is.atomic(thresholds)) {
    thresh <- sapply(vn, function(x) {thresholds}, simplify = FALSE)
  } else {
    stopifnot(is.list(thresholds)) # must be a list
    stopifnot(length(thresholds) <= nrow(Sigma)) # no more than 1 per variable
    stopifnot(all(names(thresholds) %in% vn)) # names must match
    thresh <- thresholds
  }
  cn <- names(thresh)
  stopifnot(length(cn) > 0L)

  ## If no category weights are passed, default to 0:nCat
  if (missing(cWts)) {
    cWts <- sapply(thresh, function(x) { 0:length(x) }, simplify = FALSE)
  } else if (is.atomic(cWts)) {
    ## If a single vector of category weights is passed, broadcast to a list
    #FIXME: assumes same number of thresholds across variables
    cWts <- sapply(cn, function(x) { cWts }, simplify = FALSE)
  } else {
    stopifnot(is.list(cWts)) # must be a list
    stopifnot(length(cWts) <= nrow(Sigma)) # no more than 1 per variable
    stopifnot(all(names(cWts) %in% vn)) # names must match
    stopifnot(all(cn %in% names(cWts))) # names must match
    cWts <- cWts[cn] # discard any others
  }
  stopifnot(all((sapply(thresh, length) + 1L) == sapply(cWts, length)))

  ## Calculate marginal probabilities implied by thresholds on moments
  get_marg_probs <- function(threshs, m, sd) {
    thr  <- c(-Inf, threshs, Inf)
    sapply(2:length(thr), function(k) {
      pnorm(thr[k], m, sd) - pnorm(thr[k-1], m, sd)
    })
  }
  marginal_probs <- mapply(get_marg_probs, SIMPLIFY = FALSE, threshs = thresh,
                           m = Mu[cn], sd = SDs[cn])

  ## Marginal means
  Mu_ord <- Mu
  Mu_ord[cn] <- mapply(function(p, w) {
    stopifnot(length(p) == length(w))
    sum(p * w)
  }, p = marginal_probs, w = cWts)

  ## marginal variances (fill in covariances below)
  Sigma_ord <- Sigma
  if (length(cn) == 1) {
    ## drop=FALSE is not a solution (yields different error)
    Sigma_ord[cn,cn] <- mapply(function(p, w, mu) {
      stopifnot(length(p) == length(w))
      sum(p * (w - mu)^2)
    }, p = marginal_probs, w = cWts, mu = Mu_ord[cn])

  } else {
    diag(Sigma_ord[cn,cn]) <- mapply(function(p, w, mu) {
      stopifnot(length(p) == length(w))
      sum(p * (w - mu)^2)
    }, p = marginal_probs, w = cWts, mu = Mu_ord[cn])
  }

  ## marginal (standardized) skew
  skew_ord <- setNames(rep(0, nrow(Sigma)), nm = vn)
  skew_ord[cn] <- mapply(function(p, w, mu) {
    stopifnot(length(p) == length(w))
    numerator <- sum(p * (w - mu)^3)
    Moment2 <- sum(p * (w - mu)^2)
    denominator <- sqrt(Moment2)^3
    numerator / denominator
  }, p = marginal_probs, w = cWts, mu = Mu_ord[cn])

  ## marginal (standardized, excess) kurtosis
  kurt_ord <- setNames(rep(0, nrow(Sigma)), nm = vn)
  kurt_ord[cn] <- mapply(function(p, w, mu) {
    stopifnot(length(p) == length(w))
    numerator <- sum(p * (w - mu)^4)
    Moment2 <- sum(p * (w - mu)^2)
    denominator <- sqrt(Moment2)^4
    numerator / denominator
  }, p = marginal_probs, w = cWts, mu = Mu_ord[cn]) - 3 # excess kurtosis

  ## all marginal descriptives
  (margMoments <- data.frame(Mean = Mu_ord, SD = sqrt(diag(Sigma_ord)),
                             Skew = skew_ord, Kurtosis3 = kurt_ord,
                             row.names = vn))
  class(margMoments) <- c("lavaan.data.frame","data.frame") # for printing

  ## save old copies to return with new
  out <- list(Mu_LRV = Mu, Sigma_LRV = Sigma, R_LRV = stats::cov2cor(Sigma),
              Thresholds = thresh, Category_weights = cWts, Uni_ord = margMoments)
  class(out$Mu_LRV)    <- c("lavaan.vector","numeric")
  class(out$Sigma_LRV) <- c("lavaan.matrix.symmetric","matrix")
  class(out$R_LRV)     <- c("lavaan.matrix.symmetric","matrix")
  out$Thresholds       <- lapply(out$Thresholds, "class<-",
                                 c("lavaan.vector","numeric"))
  out$Category_weights <- lapply(out$Category_weights, "class<-",
                                 c("lavaan.vector","numeric"))
  ## need bivariate moments?
  if (length(vn) == 1L) return(out)

  ## function to apply to any pair of indicators (i and j) in Sigma
  getOrdCov <- function(i, j) {
    ## to use apply(), i= can be 2 values indicating the [row, column]
    if (length(i) > 1L) {
      if (!missing(j)) warning("j ignored when i has multiple values")
      if (length(i) > 2L) stop("i had ", length(i), " values. Only the first 2 were used.")
      j <- i[2]
      i <- i[1]
    }
    ## if i/j are numeric, get names
    if (is.numeric(i)) i <- vn[i]
    if (is.numeric(j)) j <- vn[j]
    ## make sure thresholds are standardized
    # i.thr <-
    # j.thr <-

    ## template for matrices of joint probabilities and cross-products
    JointProbs <- CP <- matrix(0, nrow = length(cWts[[i]]),
                                  ncol = length(cWts[[j]]))
    i.thr <- c(-1e5, (thresh[[i]] - Mu[i]) / SDs[i], 1e5)
    j.thr <- c(-1e5, (thresh[[j]] - Mu[j]) / SDs[j], 1e5)
    tCombos <- cbind(expand.grid(i = i.thr, j = j.thr),
                     expand.grid(cat1 = c(0, seq_along(cWts[[i]])),
                                 cat2 = c(0, seq_along(cWts[[j]]))))
    tCombos$cp <- pbivnorm(x = tCombos$i, y = tCombos$j, rho = out$R_LRV[i,j])
    ## loop over rows & columns
    for (RR in seq_along(cWts[[i]])) for (CC in seq_along(cWts[[j]])) {
      ## calculate joint probabilities
      idx1 <- which(tCombos$cat1 == RR     & tCombos$cat2 == CC    )
      idx2 <- which(tCombos$cat1 == RR - 1 & tCombos$cat2 == CC    )
      idx3 <- which(tCombos$cat1 == RR     & tCombos$cat2 == CC - 1)
      idx4 <- which(tCombos$cat1 == RR - 1 & tCombos$cat2 == CC - 1)
      JointProbs[RR,CC] <- tCombos$cp[idx1] - tCombos$cp[idx2] - tCombos$cp[idx3] + tCombos$cp[idx4]
      ## calculate cross-products
      CP[RR,CC] <- (cWts[[i]][RR] - Mu_ord[i]) * (cWts[[j]][CC] - Mu_ord[j])
    }
    sum(JointProbs * CP) # return covariance
  }

  ## check whether all variables are being discretized
  stayCon <- setdiff(vn, cn)
  if (length(stayCon) == 0) {
    ## all are polychoric
    (ij <- which(lower.tri(Sigma_ord), arr.ind = TRUE))
    Sigma_ord[ij] <- mapply(getOrdCov, i = ij[,1], j = ij[,2])
    Sigma_ord[ ij[,2:1] ] <- Sigma_ord[ij] # copy lower to upper triangle

  } else {
    ## pair by pair, choose polychoric or polyserial
    for (i in vn[-length(vn)]) for (j in vn[(which(vn == i)+1):length(vn)]) {
      if (i %in% stayCon && j %in% stayCon) next
      if (j %in% cn && j %in% cn) {
        ## both discretized, calculate polychoric
        Sigma_ord[i,j] <- Sigma_ord[j,i] <- getOrdCov(i, j)
        next
      }
      ## else, calculate polyserial
      if (i %in% stayCon) {
        CON <- i
        CAT <- j
      } else {
        CAT <- i
        CON <- j
      }
      DENS <- mapply(function(tau, interval, m = 0, sd = 1) {
        dnorm(tau, mean = m, sd = sd) * interval
      }, tau = thresh[[CAT]], interval = diff(cWts[[CAT]]),
         m = Mu[CAT], sd = SDs[CAT])
      ## Note: polyserial correlation divides by sqrt(diag(Sigma_ord)[CAT]),
      ##       but that cancels out when scaling by both SDs to get covariance
      Sigma_ord[CON, CAT] <- Sigma_ord[CAT, CON] <-
        out$R_LRV[CON, CAT] * sum(DENS) * sqrt(diag(out$Sigma_LRV)[CON])

    }
  }

  class(Sigma_ord) <- c("lavaan.matrix.symmetric","matrix")
  if (nrow(Sigma_ord) > 1L) {
    R_ord <- cov2cor(Sigma_ord)
    class(R_ord)     <- c("lavaan.matrix.symmetric","matrix")
  } else R_ord <- NULL

  c(out, list(Sigma_ord = Sigma_ord, R_ord = R_ord))
}