File: clm_article.Rnw

% \documentclass[article]{article}
% \documentclass[article]{jss}
\documentclass[nojss]{jss}

%% -- Latex packages and custom commands ---------------------------------------

%% recommended packages
\usepackage{thumbpdf,lmodern,amsmath,amssymb,bm,url}

\usepackage{textcomp}
\usepackage[utf8]{inputenc}

%% another package (only for this demo article)
\usepackage{framed}

%% new custom commands
\newcommand{\class}[1]{`\code{#1}'}
\newcommand{\fct}[1]{\code{#1()}}

%% For Sweave-based articles about R packages:
%% need no \usepackage{Sweave}
\SweaveOpts{engine=R, eps=FALSE, keep.source = TRUE, prefix.string=clmjss}
<<preliminaries, echo=FALSE, results=hide>>=
options(prompt = "R> ", continue = "+  ", width = 70, useFancyQuotes = FALSE)
library("ordinal")
library("xtable")
@
%%\VignetteIndexEntry{Cumulative Link Models for Ordinal Regression}
%%\VignetteDepends{ordinal, xtable}


%% -- Article metainformation (author, title, ...) -----------------------------

%% - \author{} with primary affiliation
%% - \Plainauthor{} without affiliations
%% - Separate authors by \And or \AND (in \author) or by comma (in \Plainauthor).
%% - \AND starts a new line, \And does not.
\author{Rune Haubo B Christensen\\Technical University of Denmark\\ 
\& \\
Christensen Statistics}
\Plainauthor{Rune Haubo B Christensen}

%% - \title{} in title case
%% - \Plaintitle{} without LaTeX markup (if any)
%% - \Shorttitle{} with LaTeX markup (if any), used as running title
\title{Cumulative Link Models for Ordinal Regression with the \proglang{R} Package \pkg{ordinal}}
\Plaintitle{Cumulative Link Models for Ordinal Regression with the R Package ordinal}
\Shorttitle{Cumulative Link Models with the \proglang{R} package \pkg{ordinal}}

%% - \Abstract{} almost as usual
\Abstract{
This paper introduces the \proglang{R} package \pkg{ordinal} for the analysis of ordinal data using cumulative link models. The model framework implemented in \pkg{ordinal} includes partial proportional odds, structured thresholds, scale effects and flexible link functions. The package also supports cumulative link models with random effects, which will be covered in a future paper. A fast and reliable regularized Newton estimation scheme using analytical derivatives provides maximum likelihood estimation of the model class. The paper describes the implementation as well as how to use the functionality in the package for the analysis of ordinal data, including topics on model identifiability and customized modelling.
The package implements methods for profile likelihood confidence intervals, analysis of deviance tables with type I, II and III tests, predictions of various kinds, as well as methods for checking the convergence of the fitted models.
}

%% - \Keywords{} with LaTeX markup, at least one required
%% - \Plainkeywords{} without LaTeX markup (if necessary)
%% - Should be comma-separated and in sentence case.
\Keywords{ordinal, cumulative link models, proportional odds, scale effects, \proglang{R}}
\Plainkeywords{ordinal, cumulative link models, proportional odds, scale effects, R}

%% - \Address{} of at least one author
%% - May contain multiple affiliations for each author
%%   (in extra lines, separated by \emph{and}\\).
%% - May contain multiple authors for the same affiliation
%%   (in the same first line, separated by comma).
\Address{
  Rune Haubo Bojesen Christensen\\
  Section for Statistics and Data Analysis\\
  Department of Applied Mathematics and Computer Science\\
  DTU Compute\\
  Technical University of Denmark\\
  Richard Petersens Plads \\
  Building 324 \\
  DK-2800 Kgs. Lyngby, Denmark\\
  \emph{and}\\
  Christensen Statistics\\
  Bringetoften 7\\
  DK-3500 V\ae rl\o se, Denmark \\
  E-mail: \email{Rune.Haubo@gmail.com}; \email{Rune@ChristensenStatistics.dk}%\\
  % URL: \url{http://christensenstatistics.dk/}
}

\begin{document}

This is a copy of an article that is no longer submitted for publication in Journal of
Statistical Software (\url{https://www.jstatsoft.org/}).


%% -- Introduction -------------------------------------------------------------

%% - In principle "as usual".
%% - But should typically have some discussion of both _software_ and _methods_.
%% - Use \proglang{}, \pkg{}, and \code{} markup throughout the manuscript.
%% - If such markup is in (sub)section titles, a plain text version has to be
%%   added as well.
%% - All software mentioned should be properly \cite-d.
%% - All abbreviations should be introduced.
%% - Unless the expansions of abbreviations are proper names (like "Journal
%%   of Statistical Software" above) they should be in sentence case (like
%%   "generalized linear models" below).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction} \label{sec:intro}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Ordered categorical data, or simply \emph{ordinal} data, are
common in a multitude of empirical sciences and in particular in 
scientific disciplines where humans are used as measurement
instruments. Examples include school grades, ratings of preference
in consumer studies, degree of tumor involvement in MR images and
animal fitness in ecology. Cumulative link models (CLMs)
are a powerful model class for such data since observations are
treated correctly as categorical, the ordered nature is exploited, and
the flexible regression framework allows for in-depth analyses.

This paper introduces the \pkg{ordinal} package \citep{ordinal-pkg} for \proglang{R} \citep{R} for the analysis of ordinal data with cumulative link models. The paper describes how \pkg{ordinal} supports the fitting of CLMs with various model structures, model assessment and inferential options including tests of partial proportional odds, scale effects, threshold structures and flexible link functions. The implementation, its flexibility in allowing for customizable models, and an effective fitting algorithm are also described. The \pkg{ordinal} package also supports cumulative link \emph{mixed} models (CLMMs), i.e., CLMs with normally distributed random effects. This model class will not be given further treatment here but remains a topic for a future paper.

The name \emph{cumulative link models} is adopted from \citet{agresti02}, but
the model class has been referred to by several other names in the literature, such as \emph{ordered logit models} and \emph{ordered probit models} \citep{greene10} for the
logit and probit link functions. The cumulative link model
with a logit link is widely known as the \emph{proportional odds
  model} due to \citet{mccullagh80} and with a complementary log-log
link, the model is sometimes referred to as the \emph{proportional hazards model} for grouped survival times.

CLMs form one of several classes of models specifically developed for ordinal data. Alternatives to CLMs include continuation ratio models, adjacent category models, and stereotype models \citep{ananth97}, but only models in the CLM framework will be considered in this paper.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Software review}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Cumulative link models can be fitted by all the major software packages, and while some packages support scale effects, partial proportional odds (also referred to as unequal slopes, partial effects, or nominal effects), different link functions and structured thresholds, no single package or implementation offers all of these model structures. The following brief software review is based on the publicly available documentation at software package websites retrieved in May 2020.

\proglang{IBM SPSS} \citep{SPSS} implements McCullagh's \pkg{PLUM} \citep{mccullagh80} procedure, allows for the five standard link functions (cf. Table~\ref{tab:linkFunctions}) and scale effects. Estimation is via Fisher scoring, and a test for equal slopes is available for the location-only model, but it is not possible to estimate a partial proportional odds model.

\proglang{Stata} \citep{Stata} includes the \code{ologit} and \code{oprobit} procedures for CLMs with logistic and probit links but without support for scale effects, partial effects or structured thresholds. The add-on package \pkg{oglm} \citep{oglm} allows for all five standard link functions and scale effects. The \pkg{GLLAMM} package \citep{gllamm} also has some support for CLMs in addition to some support for random effects.

\proglang{SAS} \citep{SAS} implements CLMs with logit links in \code{proc logistic} and CLMs with the five standard links in \code{proc genmod}.

\proglang{Matlab} \citep{Matlab} fits CLMs with the \code{mnrfit} function allowing for logit, probit, complementary log-log and log-log links.

\proglang{Python} has a package \pkg{mord} \citep{mord} for ordinal classification and prediction focused at machine learning applications.

In \proglang{R}, several packages on the Comprehensive \proglang{R} Archive Network (CRAN) implement CLMs. \code{polr} from \pkg{MASS} \citep{MASS} implements standard CLMs allowing for the five standard link functions but no further extensions; the \pkg{VGAM} package \citep{VGAM} includes CLMs via the \code{vglm} function using the \code{cumulative} link. \code{vglm} allows for several link functions as well as partial effects. The \code{lrm} and \code{orm} functions from the \pkg{rms} package \citep{rms} also implement CLMs with the five standard link functions but without scale effects, partial proportional odds or structured thresholds. A Bayesian alternative is implemented in the \pkg{brms} package \citep{brms, brms2}, which includes structured thresholds in addition to random effects.

In addition, several other \proglang{R} packages include methods for analyses of ordinal data including \pkg{oglmx} \citep{oglmx}, \pkg{MCMCpack} \citep{MCMCpack}, \pkg{mvord} \citep{mvord}, \pkg{CUB} \citep{CUB}, and \pkg{ordinalgmifs} \citep{ordinalgmifs}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection[ordinal package overview]{\pkg{ordinal} package overview}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The \pkg{ordinal} package implements CLMs and CLMMs along with functions and methods to support these model classes, as summarized in Table~\ref{tab:functions_in_ordinal}. The two key functions in \pkg{ordinal} are \code{clm} and \code{clmm}, which fit CLMs and CLMMs respectively; \code{clm2} and \code{clmm2}\footnote{A brief tutorial on \code{clmm2} is currently available at the package website on CRAN: \url{https://CRAN.R-project.org/package=ordinal}} provide legacy implementations primarily retained for backwards compatibility. This paper introduces \code{clm} and its associated functionality covering CLMs with location, scale and nominal effects, structured thresholds and flexible link functions. \code{clm.fit} is the main work horse behind \code{clm} and an analogue to \code{lm.fit} for linear models. The package includes methods for assessment of convergence with \code{convergence} and \code{slice}, and an auxiliary function, \code{drop.coef}, for removing linearly dependent columns from a design matrix. Distributional functions in \pkg{ordinal} provide support for the Gumbel and log-gamma distributions as well as gradients\footnote{Gradients with respect to $x$, the quantile; not the parameters of the distributions.} of the normal, logistic and Cauchy probability density functions, which are used in the iterative methods implemented in \code{clm} and \code{clmm}.
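
As a brief illustration of these distributional functions (a sketch, not part of the analyses in this paper), the Gumbel functions follow the usual \proglang{R} naming conventions and the gradient functions take the quantile as their argument:
<<eval = FALSE>>=
## Gumbel(max) distribution: density, CDF, quantiles and random draws:
dgumbel(0)    # density at zero
pgumbel(1)    # cumulative probability at one
qgumbel(0.5)  # median
rgumbel(3)    # three random draws
## Gradients of densities with respect to the quantile, x:
ggumbel(0)
glogis(0)
gnorm(0)
@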


\begin{table}[t!]
  \centering
  \renewcommand*{\arraystretch}{1.2}
  \begin{tabular}{llll}
    \hline
    \rotatebox{0}{Fitting} &
    \rotatebox{0}{Miscellaneous} &
    \rotatebox{0}{Former impl.} &
    \rotatebox{0}{Distributions} \\
    \hline
    \code{clm} & \code{convergence} & \code{clm2} &
    \code{[pdqrg]gumbel}$^{\textsf{c}}$ \\
    \code{clmm}$^{\textsf{c}}$ & \code{slice} & \code{clmm2}$^{\textsf{c}}$
    & \code{[pdg]lgamma}$^{\textsf{c}}$ \\
    \code{clm.fit} & \code{drop.coef} & \code{clm2.control}
    & \code{gnorm}$^{\textsf{c}}$ \\
    \code{clm.control} & & \code{clmm2.control} &
    \code{glogis}$^{\textsf{c}}$ \\
    \code{clmm.control} & &  & \code{gcauchy}$^{\textsf{c}}$ \\
    \hline
  \end{tabular} \\
  \caption{Key functions in \pkg{ordinal}. Superscript "c" indicates (partial or full) implementation in \proglang{C}.\label{tab:functions_in_ordinal}}
\end{table}
  
As summarized in Table~\ref{tab:clm_methods}, \pkg{ordinal} provides the familiar suite of extractor and print methods for \code{clm} objects known from \code{lm} and \code{glm}. These methods all behave in ways similar to those for \code{glm} objects, with the exception of \code{model.matrix}, which returns a list of model matrices, and \code{terms}, which can return the \code{terms} object for each of the three formulae. The inference methods facilitate profile likelihood confidence intervals via \code{profile} and \code{confint}, likelihood ratio tests for model comparison via \code{anova}, model assessment by tests of removal of model terms via \code{drop1}, addition of new terms via \code{add1}, and AIC-based model selection via \code{step}. Calling \code{anova} on a single \code{clm} object provides an analysis of deviance table with type I, II or III Wald-based $\chi^2$ tests following the \proglang{SAS} definitions of such tests \citep{SAStype}. In addition to standard use of \code{clm}, the implementation facilitates extraction of a model environment containing a complete representation of the model, allowing the user to fit customized models with, for instance, special structures on the threshold parameters, restrictions on regression parameters or other case-specific model requirements.
As CLMMs are not covered by this paper, methods for \code{clmm} objects will not be discussed.
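
As a small sketch of these inference methods (using the model \code{fm1} fitted in section~\ref{sec:fitting-basic-clm}):
<<eval = FALSE>>=
fm1 <- clm(rating ~ temp + contact, data = wine)
anova(fm1, type = "III")  # analysis of deviance with type III Wald tests
drop1(fm1, test = "Chi")  # likelihood ratio tests of model terms
confint(fm1)              # profile likelihood confidence intervals
@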

Other packages, including \pkg{emmeans} \citep{emmeans}, \pkg{margins} \citep{margins}, \pkg{ggeffects} \citep{ggeffects}, \pkg{generalhoslem} \citep{generalhoslem} and \pkg{effects} \citep{effects1, effects2}, extend the \pkg{ordinal} package by providing methods for marginal means, tests of functions of the coefficients, goodness-of-fit tests and illustrations of fitted models.
  
\begin{table}[t!]
  \centering
  \renewcommand*{\arraystretch}{1.2}
  \begin{tabular}{llll}
    \hline
    \multicolumn{2}{l}{Extractor and Print} & Inference & Checking \\[3pt]
    \hline
    \code{coef} & \code{print} & \code{anova} & \code{slice} \\
    \code{fitted} & \code{summary} & \code{drop1} & \code{convergence}\\
    \code{logLik} & \code{model.frame} & \code{add1} & \\
    \code{nobs} & \code{model.matrix} & \code{confint} & \\
    \code{vcov} & \code{update} & \code{profile} & \\
    \code{AIC}, \code{BIC} & & \code{predict} & \\
    \code{extractAIC} & & \code{step}, \code{stepAIC} & \\
    \hline
  \end{tabular}
  \caption{Key methods for \code{clm} objects.\label{tab:clm_methods}}
\end{table}
  
The \pkg{ordinal} package is therefore unique in providing a comprehensive framework for cumulative link models that exceeds those of other software packages, with its functionality extended by a series of additional \proglang{R} packages.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Organization of the paper}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The remainder of the paper is organized as follows. The next section establishes notation by defining CLMs and the associated log-likelihood function, then describes the extended class of CLMs that is implemented in \pkg{ordinal}, including details on scale effects, structured thresholds, partial proportional odds and flexible link functions. The third section describes how maximum likelihood (ML) estimation of CLMs is implemented in \pkg{ordinal}. The fourth section describes how CLMs are fitted and ordinal data are analysed with \pkg{ordinal}, including sections on nominal effects, scale effects, structured thresholds, flexible link functions, profile likelihoods, assessment of model convergence, fitted values and predictions. The final parts of section four are at a more advanced level and cover issues of model identifiability and customized fitting of models not otherwise covered by the \pkg{ordinal} API. We end in section~\ref{sec:conclusions} with conclusions.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Cumulative link models}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

A cumulative link model is a model for ordinal-scale observations, i.e., observations that fall in an ordered finite set of categories. Ordinal observations can be represented by a random variable $Y_i$ that takes the value $j$ if the $i$th ordinal observation falls in the $j$th category, where $j = 1, \ldots, J$ and
$J \geq 2$.\footnote{Binomial models ($J = 2$) are also included.}%
%
A basic cumulative link model is
\begin{equation}
  \label{eq:BasicCLM}
  \gamma_{ij} = F(\eta_{ij})~, \quad
  \eta_{ij} = \theta_j - \bm x_i^\top \bm\beta~, \quad
  i = 1,\ldots,n~, \quad  j = 1, \ldots, J-1 ~,
\end{equation}
where 
\begin{equation*}
  %% \label{eq:cum}
  \gamma_{ij} = \Prob (Y_i \leq j) = \pi_{i1} + \ldots + \pi_{ij} 
  \quad \mathrm{with} \quad \sum_{j=1}^J \pi_{ij} = 1
\end{equation*}
are cumulative probabilities\footnote{We have suppressed the
  conditioning on the covariate vector, $\bm x_i$, i.e.,
  $\gamma_{ij} = \gamma_j(\bm x_i)$ and $\Prob(Y_i \leq j) = \Prob(Y \leq j |
  \bm x_i)$.}, $\pi_{ij}$ is the probability that the $i$th observation falls in the $j$th category, $\eta_{ij}$ is the linear predictor,
$\bm x_i$ is a $p$-vector of regression variables for the parameters
$\bm\beta$ (without a leading 1 for an intercept), and $F$ is the inverse link
function.
%
The thresholds (also known as cut-points or intercepts) are ordered:
\begin{equation*}
  -\infty \equiv \theta_0 \leq \theta_1 \leq \ldots \leq \theta_{J-1} \leq
  \theta_J \equiv \infty.
\end{equation*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The multinomial distribution and the log-likelihood function}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The ordinal observation $Y_i$ which assumes the value $j$ can be represented 
by a multinomially distributed 
variable $\bm Y_i^* \sim \mathrm{multinom}(\bm\pi_i, 1)$, where $\bm Y_i^*$ is
a $J$-vector with a $1$ at
the $j$'th entry and 0 otherwise, and with probability mass function
%
\begin{equation}
  \label{eq:multinom_pmf}
  \Prob(\bm Y_i^* = \bm y_i^*) = \prod_j \pi_{ij}^{y_{ij}^*} ~.
\end{equation}
%
The log-likelihood function can therefore be written as 
%
\begin{equation*}
  \ell(\bm\theta, \bm\beta; \bm y^*) = \sum_i \sum_j y_{ij}^* \log \pi_{ij}
\end{equation*}
%
or equivalently
%
\begin{align*}
  \ell(\bm\theta, \bm\beta; \bm y) =~& \sum_i \sum_j \mathrm I (y_i = j) \log \pi_{ij} \\
  =~& \sum_i \log \tilde\pi_i
\end{align*}
%
where $\tilde\pi_i = \pi_{i y_i}$ is the entry of the $J$-vector $\bm \pi_i$ corresponding to the observed category $y_i$, and $\mathrm I(\cdot)$ is
the indicator function.

Allowing for observation-level weights (case weights) $w_i$ finally leads to
%
\begin{equation}
  \label{eq:clm-log-likelihood}
  \ell(\bm\theta, \bm\beta; \bm y) = \sum_i w_i \log \tilde\pi_i ~.
\end{equation}
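
The representation~(\ref{eq:clm-log-likelihood}) is easy to verify numerically since the \code{fitted} method for \code{clm} fits returns the probabilities of the observed categories, $\tilde\pi_i$. A small sketch using the wine data introduced in section~\ref{sec:fitting-basic-clm}:
<<eval = FALSE>>=
fm <- clm(rating ~ temp + contact, data = wine)
## With unit weights, the log-likelihood is the sum of the logs of the
## fitted probabilities of the observed categories:
all.equal(sum(log(fitted(fm))), as.numeric(logLik(fm)))
@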

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Likelihood based inference}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Confidence intervals for model parameters are obtained by appealing to the asymptotic normal distribution of a statistic $s(\cdot)$ for a scalar parameter of interest $\beta_a$ and are defined as
\begin{equation*}
  CI:~\left\{ \beta_a; |s(\beta_a)| < z_{1 - \alpha/2} \right\}~,
\end{equation*}
where $z_{1 - \alpha/2}$ is the $(1 - \alpha/2)$ quantile of the standard normal distribution.
Taking $s(\cdot)$ to be the Wald statistic $s(\beta_a):~ w(\beta_a) = (\hat\beta_a - \beta_a)/\hat{\mathrm{se}}(\hat\beta_a)$ leads to the classical symmetric intervals. Better confidence intervals can be obtained by choosing instead the likelihood root statistic \citep[see e.g.,][]{pawitan01, brazzale07}:
\begin{equation*}
  s(\beta_a):~ r(\beta_a) = \mathrm{sign}(\hat\beta_a - \beta_a) \sqrt{-2 [
    \ell(\hat{\bm\theta}, \hat{\bm\beta}; \bm y) - \ell_p(\beta_a; \bm y)]}
\end{equation*}
where 
\begin{equation*}
  \ell_p(\beta_a; \bm y) = \max_{\bm\theta, \bm\beta_{-a}}
  \ell(\bm\theta, \bm\beta; \bm y)~,
\end{equation*}
is the profile likelihood for the scalar parameter $\beta_a$ and $\bm\beta_{-a}$ is the vector of regression parameters without the $a$'th one. 
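
Both types of intervals are available from the \code{confint} method; a brief sketch (with \code{fm1} from section~\ref{sec:fitting-basic-clm}):
<<eval = FALSE>>=
confint(fm1, type = "profile")  # based on the likelihood root statistic (default)
confint(fm1, type = "Wald")     # classical symmetric intervals
@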

While the profile likelihood has to be optimized over all parameters except $\beta_a$, we define a \emph{log-likelihood slice} as
\begin{equation}
  \label{eq:slice}
  \ell_{\mathrm{slice}}(\beta_a; \bm y) = 
     \ell(\beta_a; \hat{\bm\theta}, \hat{\bm\beta}_{-a}, \bm y)~,
\end{equation}
which is the log-likelihood function evaluated at $\beta_a$ while keeping the remaining parameters fixed at their ML estimates.

A quadratic approximation to the log-likelihood slice is $\ell(\hat{\bm\theta}, \hat{\bm\beta}; \bm y) - (\hat\beta_a - \beta_a)^2 / 2\tau_a^2$, where the \emph{curvature unit} $\tau_a$ is the inverse square root of the $a$'th diagonal element of the Hessian of $-\ell(\hat{\bm\theta}, \hat{\bm\beta}; \bm y)$.
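
The \code{slice} method computes such log-likelihood slices and, by default, also their quadratic approximations; a sketch:
<<eval = FALSE>>=
sl1 <- slice(fm1, lambda = 5)  # slices in a region around the MLE
plot(sl1)                      # each slice with its quadratic approximation
@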

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Link functions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

A commonly used link function is the logit link, which leads to
%
\begin{equation}
  \label{eq:cum_logit_model}
  \mathrm{logit}(\gamma_{ij}) = \log \frac{\Prob (Y_i \leq j)}{1 - \Prob(Y_i \leq j)} 
\end{equation}
%
The odds ratio (OR) of the event $Y_i \leq j$ at $\bm x_1$ relative to the same event at $\bm x_2$ is then
%
\begin{equation}
  \label{eq:odds_ratio}
  \mathrm{OR} = \frac{\gamma_j(\bm x_1) / [1 - \gamma_j(\bm x_1)]}
  {\gamma_j(\bm x_2) / [1 - \gamma_j(\bm x_2)]} =
  \frac{\exp(\theta_j - \bm x_1^\top \bm\beta)}
  {\exp(\theta_j - \bm x_2^\top \bm\beta)}
  %% =&~ \exp(\theta_j - \theta_j - \bm x_1 \bm\beta + \bm x_2 \bm\beta)
  = \exp[(\bm x_2^\top - \bm x_1^\top)\bm\beta]
\end{equation}
which is independent of $j$. The log cumulative odds ratio,
$(\bm x_2^\top - \bm x_1^\top)\bm\beta$, is thus proportional to the
distance between $\bm x_1$ and $\bm x_2$, which motivated
\citet{mccullagh80} to denote the cumulative logit model a
\emph{proportional odds model}. If $x$ represents a treatment variable
with two levels (e.g., placebo and treatment), then $x_2 - x_1 = 1$
and the odds ratio is $\exp(-\beta_\textup{treatment})$. Similarly the
odds ratio of the event $Y \geq j$ is
$\exp(\beta_\textup{treatment})$.
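
In practice, odds ratios and their confidence limits are obtained by exponentiating the coefficients and their profile likelihood confidence limits; a sketch with the wine model \code{fm1} from section~\ref{sec:fitting-basic-clm}:
<<eval = FALSE>>=
exp(coef(fm1)[c("tempwarm", "contactyes")])  # odds ratios of the event Y >= j
exp(confint(fm1))                            # confidence intervals on the OR scale
@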

The probit link has its own interpretation through a normal linear model for a latent variable which is considered in section~\ref{sec:latent-variable-motivation}. 

The complementary log-log (clog-log) link is also sometimes used because of its interpretation as a proportional hazards model for grouped survival times:
\begin{equation*}
  -\log\{1 - \gamma_{j}(\bm x_i) \} = \exp( \theta_j - \bm x_i^\top
  \bm\beta )~.
\end{equation*}
Here $1 - \gamma_{j}(\bm x_i)$ is the probability of survival beyond
category $j$ given $\bm x_i$. The proportional hazards model has the
property that
\begin{equation*}
  \log \{ 1 - \gamma_{j}(\bm x_1) \} = \exp[ (\bm x_2^\top - \bm x_1^\top)
  \bm\beta ] \log \{ 1 - \gamma_{j}(\bm x_2) \}~,
\end{equation*}
thus the cumulative hazards, and hence the hazards, at $\bm x_1$ are proportional to those at $\bm x_2$.
If the log-log link is used on the response categories in the reverse
order, this is equivalent to using the clog-log link on the response
in the original order. This reverses the sign of $\bm\beta$ as well as
the sign and order of $\{\theta_j\}$ while the likelihood and standard
errors remain unchanged.
%
% Thus, similar to the proportional odds
% model, the ratio of hazard functions beyond category $j$ at $\bm x_1$
% relative to $\bm x_2$ (the hazard ratio, $HR$) is:
% \begin{equation*}
%   HR = \frac{-\log\{1 - \gamma_{j}(\bm x_2) \}}
%   {-\log\{1 - \gamma_{j}(\bm x_1) \}} =
%   \frac{\exp( \theta_j - \bm x_1^T \bm\beta )}
%   {\exp( \theta_j - \bm x_2^T \bm\beta )} =
%   \exp[(\bm x_2 - \bm x_1)\bm\beta]
% \end{equation*}
%

Details of the most common link functions are described in Table~\ref{tab:linkFunctions}. 

\begin{table}[t!]
  \begin{center}
  %\footnotesize
  \begin{tabular}{llll}
    \hline
    Name & logit & probit & log-log \\
    \hline
    Distribution & logistic & normal & Gumbel (max)$^b$ \\
    Shape & symmetric & symmetric & right skew\\
    Link function ($F^{-1}$)  & $\log[\gamma / (1 - \gamma)]$ & $\Phi^{-1}(\gamma)$ &
    $-\log[-\log(\gamma)]$ \\
    Inverse link ($F$) & $1 / [1 + \exp(-\eta)]$ & $\Phi(\eta)$ &
    $\exp(-\exp(-\eta))$ \\
    Density ($f = F'$) & $\exp(-\eta) / [1 + \exp(-\eta)]^2$ & $\phi(\eta)$ &
    $\exp[-\exp(-\eta) - \eta]$ \\
    \hline
    \hline
    Name & clog-log$^a$ & cauchit \\
    \hline
    Distribution  & Gumbel (min)$^b$ & Cauchy$^c$ \\
    Shape & left skew &  kurtotic \\
    Link function ($F^{-1}$) & $\log[ -\log(1 - \gamma)]$ & $\tan[\pi (\gamma - 0.5)]$ \\
    Inverse link ($F$) & $1 - \exp[-\exp(\eta)]$ & $\arctan(\eta)/\pi + 0.5$ \\
    Density ($f = F'$) & $\exp[-\exp(\eta) + \eta]$ & $1 / [\pi(1 + \eta^2)]$ \\
    \hline
  \end{tabular}
  \end{center}
  % \footnotesize
  % 
  % $^a$: the \emph{complementary log-log} link \\
  % $^b$: the Gumbel distribution is also known as the extreme value
  % (type I) distribution for extreme minima or maxima. It is also
  % sometimes referred to as the Weibull (or log-Weibull) distribution
  % (\url{http://en.wikipedia.org/wiki/Gumbel_distribution}). \\
  % $^c$: the Cauchy distribution is a $t$-distribution with one df
  \caption{Summary of the five standard link functions. 
  $^a$: the \emph{complementary log-log} link; 
  $^b$: the Gumbel distribution is also known as the extreme value
  (type I) distribution for extreme minima or maxima. It is also
  sometimes referred to as the Weibull (or log-Weibull) distribution;
  $^c$: the Cauchy distribution is a $t$-distribution with one degree of freedom.
  \label{tab:linkFunctions}}
\end{table}


The \pkg{ordinal} package allows for the estimation of an extended class of 
cumulative link models in which the basic model~(\ref{eq:BasicCLM}) is extended
in a number of ways including structured thresholds, partial proportional odds,
scale effects and flexible link functions. The following sections will describe
these extensions of the basic CLM.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Extensions of cumulative link models} \label{sec:extensions-of-clms}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

A general formulation of the class of models (excluding random effects) that 
is implemented in \pkg{ordinal} can be written
%
\begin{equation}
  \gamma_{ij} = F_{\lambda}(\eta_{ij}), \quad 
  \eta_{ij} = \frac{g_{\bm\alpha} (\theta_j) - \bm x_i^\top \bm\beta - \bm w_i^\top \tilde{\bm\beta}_j}{\exp(\bm z_i^\top \bm\zeta)}
\end{equation}
%
where 
\begin{description}
  \item[$F_{\lambda}$] is the inverse link function. It may be parameterized by the 
   scalar parameter $\lambda$ in which case we refer to $F_{\lambda}^{-1}$ as a 
   \emph{flexible link function},
   %
   \item[$g_{\bm\alpha}(\theta_j)$] parameterizes the thresholds $\{\theta_j\}$ by the
   vector $\bm\alpha$ such that $g$ restricts $\{\theta_j\}$ to be, for example,
   symmetric or equidistant. We refer to these as \emph{structured thresholds}.
   %
   \item[$\bm x_i^\top\bm\beta$] are the ordinary regression effects,
   %
   \item[$\bm w_i^\top \tilde{\bm\beta}_j$] are regression effects which are allowed to
   depend on the response category $j$ and they are denoted \emph{partial} or 
   \emph{non-proportional odds} \citep{peterson90} when the logit link is
   applied. To include other link functions in the terminology we denote these effects 
   \emph{nominal effects} (in text and code) because these effects are not integral to the
   ordinal nature of the data.
   %
   \item[$\exp(\bm z_i^\top \bm\zeta)$] are \emph{scale effects} since in a latent
   variable view these effects model the scale of the underlying location-scale 
   distribution.
\end{description}

With the exception of the structured thresholds, these extensions of the basic CLM have been considered individually in a number
of sources but to the author's best knowledge not previously in a unified 
framework.
%
For example, partial proportional odds have been considered by \citet{peterson90} and scale effects have been considered by \citet{mccullagh80} and \citet{cox95}.
% \citet{agresti02} is a good introduction to cumulative link models in the context of categorical data analysis and includes discussions of scale effects. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Latent variable motivation of CLMs} \label{sec:latent-variable-motivation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

It is natural to motivate the CLM from a linear model for a categorized version
of a latent variable. Assume the following linear model for an unobserved latent
variable:
%
\begin{equation}
  \label{eq:latent}
  S_i = \alpha^* + \bm x_i^\top \bm\beta^* + \varepsilon_i, \quad
  \varepsilon_i \sim N(0, \sigma^{*2})
\end{equation}
%
If $S_i$ falls between two thresholds, $\theta_{j-1}^* < S_i \leq \theta_j^*$ where
%
\begin{equation}
  \label{eq:thresholds}
  -\infty \equiv \theta_0^* < \theta_1^* < \ldots < \theta^*_{J-1} <
  \theta_{J}^* \equiv \infty
\end{equation}
%
then $Y_i = j$ is observed and the cumulative probabilities are:
%
\begin{equation*}
  \gamma_{ij} = \Prob (Y_i \leq j) = \Prob(S_i \leq \theta_j^*) = 
  \Prob \left( Z \leq \frac{\theta_j^* - \alpha^* - \bm x_i^\top \bm\beta^*}{%
    \sigma^*} \right) = 
  \Phi ( \theta_j - \bm x_i^\top \bm\beta )
\end{equation*}
%
where $Z$ follows a standard normal distribution, 
$\Phi$ denotes the standard normal cumulative distribution function,
parameters with an ``$^*$'' exist on the latent scale, 
$\theta_j = (\theta_j^* - \alpha^*) / \sigma^*$ and
$\bm\beta = \bm\beta^* / \sigma^*$. Note that $\alpha^*$, $\bm\beta^*$ and 
$\sigma^*$ would 
have been identifiable if the latent variable $S$ was directly observed, but they
are not identifiable with ordinal observations.

If we allow a log-linear model for the scale such that
%
\begin{equation*}
  \varepsilon_i \sim N(0, \sigma^{*2}_i), \quad 
  \sigma_i^* = \exp(\mu + \bm z_i^\top \bm\zeta) = \sigma^* \exp(\bm z_i^\top \bm\zeta)
\end{equation*}
%
where $\bm z_i$ is the $i$'th row of a design matrix $\bm Z$ without a leading column for an intercept and $\sigma^* = \exp(\mu)$, then
\begin{equation*}
  \gamma_{ij} = 
  \Prob \left( Z \leq \frac{\theta_j^* - \alpha^* - \bm x_i^\top \bm\beta^*}{%
    \sigma^*_i} \right) = 
  \Phi \left( \frac{\theta_j - \bm x_i^T \bm\beta}{\sigma_i} \right)
\end{equation*}
where $\sigma_i = \sigma_i^* / \sigma^* = \exp(\bm z_i^\top \bm\zeta)$ is the 
\emph{relative} scale.

The common link functions probit, logit, log-log, clog-log and cauchit correspond to inverse cumulative distribution functions of the normal, logistic, Gumbel(max), Gumbel(min) and Cauchy distributions, respectively. These distributions are all members of the location-scale family with common form $F(\mu, \sigma)$, with location $\mu$ and non-negative scale $\sigma$; for example, the logistic distribution has mean $\mu$ and standard deviation $\sigma \pi / \sqrt{3}$. Choosing a link function therefore corresponds to assuming a particular distribution for the latent variable $S$, in which $\bm x_i^\top \bm\beta$ and $\exp(\bm z_i^\top \bm\zeta)$ model location \emph{differences} and scale \emph{ratios}, respectively, of that distribution.
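
The latent-variable view is easily illustrated by simulation; the following minimal sketch (all names and values are hypothetical) shows how a probit CLM recovers $\bm\beta = \bm\beta^*/\sigma^*$ and $\theta_j = (\theta_j^* - \alpha^*)/\sigma^*$:
<<eval = FALSE>>=
set.seed(1)
n <- 1000
x <- rnorm(n)
S <- 2 + 1.5 * x + rnorm(n)  # latent scale: alpha* = 2, beta* = 1.5, sigma* = 1
y <- cut(S, breaks = c(-Inf, 1, 2, 3, Inf), ordered_result = TRUE)
fm.lat <- clm(y ~ x, link = "probit")
coef(fm.lat)  # thresholds close to (-1, 0, 1); slope close to 1.5
@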

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Structured thresholds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Structured thresholds, $\{ g(\bm\alpha)_j \}$, make it possible to impose restrictions on the thresholds $\bm\theta = g(\bm\alpha)$. For instance, restricting the thresholds to be equidistant means that only the location of, say, the first threshold and the spacing between adjacent thresholds have to be estimated; thus only two parameters are used to parameterize the thresholds irrespective of the number of response categories.

\pkg{ordinal} takes $g(\bm\alpha)$ to be a linear function and operates with 
\begin{equation*}
  g(\bm\alpha) = \mathcal{J}^\top \bm\alpha = \bm \theta
\end{equation*}
where the Jacobian $\mathcal{J}$ defines the mapping from the parameters $\bm\alpha$ to the thresholds $\bm\theta$. The traditional ordered but otherwise unrestricted thresholds are denoted \emph{flexible thresholds} and obtained by taking $\mathcal{J}$ to be an identity matrix.

Assuming $J=6$ ordered categories, the Jacobians for equidistant and symmetric thresholds (denoted \code{equidistant} and \code{symmetric} in the \code{clm}-argument \code{threshold}) are 
\begin{equation*}
  \mathcal{J}_{\mathrm{equidistant}} = 
    \begin{bmatrix} 
    1 & 1 & 1 & 1 & 1 \\
    0 & 1 & 2 & 3 & 4 \\
    \end{bmatrix}, \quad
  \mathcal{J}_{\mathrm{symmetric}} = 
    \begin{bmatrix} 
    1 &  1  & 1 &  1  & 1 \\
    0 & -1 &  0 &  1 &  0 \\
   -1 &  0  & 0 &  0  & 1 \\
    \end{bmatrix}.
\end{equation*}
Another version of symmetric thresholds (denoted \code{symmetric2}) is sometimes relevant with an odd number of response categories, here illustrated with $J=5$ together with the \code{symmetric} thresholds:
\begin{equation*}
  \mathcal{J}_{\mathrm{symmetric2}} = 
    \begin{bmatrix} 
    0 &  -1 &   1 &   0 \\
   -1 &   0 &   0 &   1 \\
    \end{bmatrix}, \quad
  \mathcal{J}_{\mathrm{symmetric}} = 
    \begin{bmatrix} 
    1 &   1 &   0 &   0 \\
    0 &   0 &   1 &   1 \\
   -1 &   0 &   0 &   1 \\
    \end{bmatrix}    
\end{equation*}
The nature of $\mathcal{J}$ for a particular model can always be inspected by printing the \code{tJac} component of the \code{clm} fit.
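
A sketch of fitting structured thresholds and inspecting the Jacobian (using the wine data from section~\ref{sec:fitting-basic-clm}):
<<eval = FALSE>>=
fm.equi <- clm(rating ~ temp + contact, data = wine,
               threshold = "equidistant")
fm.equi$tJac   # maps the threshold parameters alpha to the thresholds theta
coef(fm.equi)  # the first threshold and the spacing replace four thresholds
@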

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Partial proportional odds and nominal effects}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The nominal effects $\bm w_i^\top\tilde{\bm\beta}_j$ can be considered an extension of the regression part of the model $\bm x_i^\top \bm\beta$ in which the regression effects are allowed to vary with $j$. 
%
The nominal effects can also be considered an extension of the thresholds $\theta_j$ which allows them to depend on the variables $\bm w_i$: $\tilde{\theta}_{ij}(\bm w_i) = \theta_j - \bm w_i^\top \tilde{\bm\beta}_j$ is the $j$th threshold for the $i$th observation. The following treatment assumes the latter view.

In general let $\bm W$ denote the design matrix for the nominal effects without a leading column for an intercept; the nominal-effects parameter vector $\tilde{\bm\beta}_j$ is then $\mathrm{ncol}(\bm W)$ long and $\tilde{\bm\beta}$ is $\mathrm{ncol}(\bm W) \cdot (J-1)$ long.

If $\bm W$ is the design matrix for the nominal effects containing a single column for a continuous variable, then $\tilde{\beta}_j$ is the slope parameter corresponding to the $j$th threshold and $\theta_j$ is the $j$th intercept, i.e., the threshold when the covariate is zero. Viewing $\tilde{\theta}_{ij}(\bm w_i) = \theta_j - \bm w_i^\top \tilde{\bm\beta}_j$ as a linear model for the thresholds facilitates the interpretation.

If, on the other hand, $\bm W$ is the design matrix for a categorical variable (a \code{factor} in \proglang{R}), then the interpretation of $\tilde{\bm\beta}_j$ depends on the contrast coding of $\bm W$. If we assume that the categorical variable has three levels, then $\tilde{\bm\beta}_j$ is a 2-vector. In the default treatment contrast coding (\code{"contr.treatment"}), $\theta_j$ is the $j$th threshold for the first (base) level of the factor, $\tilde{\beta}_{1j}$ is the difference between the thresholds for the first and second levels, and $\tilde{\beta}_{2j}$ is the difference between the thresholds for the first and third levels.

In general we define $\bm\Theta$ as a matrix with $J-1$ columns and with 1 row for each combination of the levels of factors in $\bm W$. This matrix is available in the \code{Theta} component of the model fit.

Note that variables in $\bm X$ cannot also be part of $\bm W$ if the model is to remain identifiable. \pkg{ordinal} detects this and automatically removes the offending variables from $\bm X$.
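
A small sketch of a partial proportional odds fit and the resulting threshold matrix (wine data from section~\ref{sec:fitting-basic-clm}):
<<eval = FALSE>>=
fm.nom <- clm(rating ~ temp, nominal = ~ contact, data = wine)
summary(fm.nom)  # a separate contact effect for each threshold
fm.nom$Theta     # the thresholds for each level of contact
@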


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Flexible link functions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The \pkg{ordinal} package allows for two kinds of flexible link functions due to \citet{aranda-ordaz83} and \citet{genter85}. 

The link function proposed by \citet{aranda-ordaz83} reads
%
\begin{equation*}
  F^{-1}_\lambda (\gamma_{ij}) = \log \left\{ \frac{(1 -
      \gamma_{ij})^{-\lambda} - 1} 
    {\lambda} \right\}~,
\end{equation*}
which depends on the auxiliary parameter $\lambda \in ]0,
\infty[$. When $\lambda = 1$, the logistic link function arises, and
when $\lambda \rightarrow 0$,
\begin{equation*}
  \{ (1 - \gamma_{ij})^{-\lambda} - 1 \} / \lambda \rightarrow
  \log (1 - \gamma_{ij})^{-1}~,
\end{equation*}
so the complementary log-log link arises.

The inverse link function and its derivative are given by
\begin{align*}
  F(\eta) =&~ 1 - (\lambda \exp(\eta) + 1)^{-\lambda^{-1}} \\
  f(\eta) =&~ \exp(\eta) (\lambda \exp(\eta) + 1)^{-\lambda^{-1} - 1}
\end{align*}

The density implied by the inverse link function is left-skewed if $0 < \lambda < 1$, symmetric if $\lambda = 1$ and right-skewed if $\lambda >
1$, so the link function can be used to assess the evidence about possible skewness of the latent distribution. 

The log-gamma link function proposed by \citet{genter85} is based on the log-gamma density by \citet{farewell77}. The cumulative distribution function and hence inverse link function reads
\begin{equation*}
  F_\lambda(\eta) = 
    \begin{cases} 
      1 - G(q; \lambda^{-2}) & \lambda < 0 \\
      \Phi(\eta) & \lambda = 0 \\
      G(q; \lambda^{-2}) & \lambda > 0
    \end{cases}
\end{equation*}
where $q = \lambda^{-2}\exp(\lambda \eta)$, $G(\cdot; \alpha)$ denotes the cumulative distribution function of the gamma distribution with shape parameter $\alpha$ and unit rate parameter, and $\Phi$ denotes the standard normal cumulative distribution function.

The corresponding density function reads
\begin{equation*}
  f_\lambda(\eta) = 
    \begin{cases} 
      |\lambda| k^k \Gamma(k)^{-1} \exp\{ k(\lambda\eta - \exp(\lambda\eta)) \} & \lambda \neq 0 \\
      \phi(\eta) & \lambda = 0
    \end{cases}
\end{equation*}
where $k=\lambda^{-2}$, $\Gamma(\cdot)$ is the gamma function and $\phi$ is the standard normal density function.

By attaining the Gumbel(max) distribution at $\lambda = -1$, the standard normal distribution at $\lambda = 0$ and the Gumbel(min) distribution at $\lambda = 1$, the log-gamma link bridges the log-log, probit and complementary log-log links, providing
right-skewed, symmetric and left-skewed latent distributions in a single family of link functions.

Note that choice and parameterization of the predictor, $\eta_{ij}$, e.g., the use of scale effects, can affect the evidence about the shape of the latent distribution. There are usually several link functions which provide essentially the same fit to the data and choosing among the good candidates is often better done by appealing to arguments such as ease of interpretation rather than arguments related to fit. 
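
Fitting the flexible links is a matter of choosing the appropriate \code{link} argument; a sketch using the wine data from section~\ref{sec:fitting-basic-clm}:
<<eval = FALSE>>=
fm.ao <- clm(rating ~ temp + contact, data = wine, link = "Aranda-Ordaz")
fm.lg <- clm(rating ~ temp + contact, data = wine, link = "log-gamma")
summary(fm.ao)  # lambda is estimated alongside the other parameters
@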


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section[Implementation of ML Estimation of CLMs in ordinal]{Implementation of ML Estimation of CLMs in \pkg{ordinal}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


In the \pkg{ordinal} package cumulative link models are (by default) estimated with a regularized Newton-Raphson (NR) algorithm with step-halving (line search) using analytical expressions for the gradient and Hessian of the negative log-likelihood function. 

This NR algorithm with analytical derivatives is used irrespective of whether the model contains structured thresholds, nominal effects or scale effects; the only exception being models with flexible link functions for which a general-purpose quasi-Newton optimizer is used.

Due to the computationally cheap and efficient evaluation of the analytical derivatives, the relatively well-behaved log-likelihood function (with exceptions described below) and the speedy convergence of the Newton-Raphson algorithm, the estimation of CLMs is virtually instant on a modern computer, even with complicated models on large datasets. This also facilitates simulation studies. More important than speed is perhaps that the algorithm is reliable and accurate.

Technical aspects of the regularized NR algorithm with step-halving (line search) are described in appendix~\ref{sec:algorithm} and analytical gradients are described in detail in \citet{mythesis}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Properties of the log-likelihood function for extended CLMs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\citet{pratt81} and \citet{burridge81} showed (seemingly independently
of each other) that the log-likelihood function of the basic cumulative link
model~(\ref{eq:BasicCLM})
is concave. This means that there is a unique global optimum of the log-likelihood function and therefore no risk of convergence to a local optimum.

It also means that the Hessian matrix of the negative log-likelihood is strictly positive definite and therefore also that the Newton step is always in a direction of higher likelihood. The genuine Newton step may, however, be too long to actually cause an increase in the likelihood from one iteration to the next (this is called ``overshoot''). This is easily overcome by successively halving the length of the Newton step until an increase in likelihood is achieved.
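
To fix ideas, the following is a minimal sketch of Newton minimization with step-halving for a generic smooth function; the function \code{newton\_sh} and its arguments are hypothetical, and the sketch omits the Hessian regularization and further safeguards of the actual implementation (appendix~\ref{sec:algorithm}):
<<eval = FALSE>>=
newton_sh <- function(par, fun, grad, hess, maxIter = 50, gradTol = 1e-8) {
  f <- fun(par)                       # fun is the negative log-likelihood
  for (i in seq_len(maxIter)) {
    g <- grad(par)
    if (max(abs(g)) < gradTol) break  # absolute convergence criterion
    step <- solve(hess(par), g)       # Newton step (Hessian positive definite)
    alpha <- 1
    repeat {                          # halve the step until fun decreases
      par.new <- par - alpha * step
      f.new <- fun(par.new)
      if (f.new < f) break
      alpha <- alpha / 2
      if (alpha < 1e-10) stop("step factor reduced below minimum")
    }
    par <- par.new
    f <- f.new
  }
  list(par = par, value = f, iterations = i)
}
@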

Exceptions to the strict concavity of the log-likelihood function include models using the cauchit link or flexible link functions as well as models with scale effects. Notably, structured thresholds and nominal effects do not affect the linearity of the predictor $\eta_{ij}$, so such models are also guaranteed to have concave log-likelihoods.

The restriction that the threshold parameters $\{\theta_j\}$ be non-decreasing is dealt with by defining $\ell(\bm\theta, \bm\beta; \bm y) = -\infty$ when $\{\theta_j\}$ are not in a non-decreasing sequence. If the algorithm attempts evaluation at such illegal values, step-halving effectively brings the algorithm back on track.

Other implementations of CLMs re-parameterize $\{\theta_j\}$ such that the non-decreasing nature of $\{\theta_j\}$ is enforced by the parameterization; for example, \code{MASS::polr} (package version 7.3-49) optimizes the likelihood using the unconstrained parameters
\begin{equation*}
  \tilde\theta_1 = \theta_1, ~\tilde{\theta}_2 = \log(\theta_2 - \theta_1),~\ldots, ~
  \tilde{\theta}_{J-1} = \log(\theta_{J-1} - \theta_{J-2})~.
\end{equation*}
This is deliberately not used in \pkg{ordinal} because, in our experience, the log-likelihood function is generally closer to quadratic in the original parameterization, which facilitates faster convergence.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Starting values}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

For the basic CLMs~(\ref{eq:BasicCLM}) the threshold parameters are initialized to an increasing sequence such that a logistic distribution assigns equal probability mass between consecutive thresholds (and below the lowest and above the highest threshold). The regression parameters $\bm\beta$, the scale parameters $\bm\zeta$ and the nominal effects $\tilde{\bm\beta}$ are initialized to 0.

If the model specifies a cauchit link or includes scale parameters, estimation starts at the parameter estimates of a model using the probit link and/or without the scale part of the model.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Estimation problems}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

With many nominal effects it may be difficult to find a model in which the threshold parameters are strictly increasing for all combinations of the parameters. Upon
convergence of the NR algorithm the model evaluates the $\bm\Theta$-matrix and
checks that the threshold estimates in each row are increasing.

When a continuous variable is included among the nominal effects it is often helpful if the continuous variable is centered at an appropriate value (at least within the observed range of the data). This is because $\{\theta_j\}$ represent the thresholds when the continuous variable is zero and $\{\theta_j\}$ are enforced to be a non-decreasing sequence. Since the nominal effects represent different slopes for the continuous variable, the thresholds will necessarily be ordered differently at some other value of the continuous variable.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Convergence codes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Irrespective of the fitting algorithm, \pkg{ordinal} reports the following convergence codes for CLMs in which negative values indicate convergence failure:
%
\begin{description}
  \item[-3] Not all thresholds are increasing. This is only possible with nominal effects and the resulting fit is invalid.
  \item[-2] The Hessian has at least one negative eigenvalue. This means that the point at which the algorithm terminated does not represent an optimum.
  \item[-1] Absolute convergence criterion (maximum absolute gradient) was not satisfied. This means that the algorithm could not get close enough to a stationary point of the log-likelihood function.
  \item[0] Successful convergence.
  \item[1] The Hessian is singular (i.e., at least one eigenvalue is zero). This means that some parameters are not uniquely determined.
\end{description}
%
Note that with convergence code \textbf{1} the optimum of the log-likelihood function has been found, although it is not a single point but a line (or, in general, a hyperplane); while some parameters are not uniquely determined, the value of the likelihood is valid and can be compared to that of other models.

In addition to these convergence codes, the NR algorithm in \pkg{ordinal} reports the following messages:
\begin{description}
  \item[0] Absolute and relative convergence criteria were met
  \item[1] Absolute convergence criterion was met, but relative criterion was not met
  \item[2] Iteration limit reached
  \item[3] Step factor reduced below minimum
  \item[4] Maximum number of consecutive Newton modifications reached
\end{description}

Note that convergence is assessed irrespective of potential messages from the fitting algorithm and irrespective of whether the tailored NR algorithm or a general-purpose quasi-Newton optimizer is used.
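
The \code{convergence} method provides the detailed assessment; a sketch (with \code{fm1} from section~\ref{sec:fitting-basic-clm}):
<<eval = FALSE>>=
convergence(fm1)  # gradient-based assessment of convergence, including the
                  # accuracy of the parameter estimates
@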


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section[Fitting cumulative link models in ordinal with clm]{Fitting cumulative link models in \pkg{ordinal} with \code{clm}}
\label{sec:fitting-clms}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


The \code{clm} function takes the following arguments:
%
<<echo=FALSE>>=
clm_args <- gsub("function ", "clm", deparse(args(clm)))
cat(paste(clm_args[-length(clm_args)], "\n"))
@
%
Several arguments are standard and well-known from \code{lm} and
\code{glm} and will not be described in detail; \code{formula}, \code{data},
\code{weights}, \code{subset} and \code{na.action} are all parts of the standard model specification in \proglang{R}. 

\code{scale} and \code{nominal} are interpreted as \proglang{R} formulae with no left-hand sides and specify the scale and nominal effects of the model, respectively; see sections~\ref{sec:scale-effects} and \ref{sec:nominal-effects} for details; \code{start} is an optional vector of starting values; \code{doFit} can be set to \code{FALSE} to prompt \code{clm} to return a model \emph{environment}, for details see section~\ref{sec:customized-modelling}; \code{model} controls whether the \code{model.frame} should be included in the returned model fit; \code{link} specifies the link function and \code{threshold} specifies an optional threshold structure, for details see section~\ref{sec:threshold-effects}.

Note the absence of a separate \code{offset} argument. Since \code{clm} allows for different offsets in \code{formula} and \code{scale}, offsets have to be specified within each formula, e.g., \verb!scale = ~ x1 + offset(x2)!.

Methods for \code{clm} model fits are summarized in Table~\ref{tab:clm_methods} and introduced in the following sections.

Control parameters can either be specified as a named list, among the optional \code{...} arguments, or directly as a call to \code{clm.control} --- in the first two cases the arguments are passed on to \code{clm.control}. \code{clm.control} takes the following arguments:
%
<<echo=FALSE>>=
cc_args <- gsub("function ", "clm.control", deparse(args(clm.control)))
cat(paste(cc_args[-length(cc_args)], "\n"))
@
%
The \code{method} argument specifies the optimization and/or return method. The default estimation method (\code{Newton}) is the regularized Newton-Raphson estimation scheme described in appendix~\ref{sec:algorithm}; options \code{model.frame} and \code{design} prompt \code{clm} to return, respectively, the \code{model.frame} and a list of objects representing the internal model structure instead of fitting the model; options \code{ucminf}, \code{nlminb} and \code{optim} represent different general-purpose optimizers which may be used to fit the model (the former from package \pkg{ucminf} \citep{ucminf}, the latter two from package \pkg{stats}).
The \code{sign.location} and \code{sign.nominal} options allow the user to flip the signs on the location and nominal model terms.
The \code{convergence} argument instructs \code{clm} how to alert the user of potential convergence problems; \code{...} are optional arguments passed on to the general purpose optimizers; \code{trace} applies across all optimizers and positive values lead to printing of progress during iterations; the remaining arguments (\code{maxIter, gradTol, maxLineIter, relTol, tol}) control the behavior of the regularized NR algorithm described in appendix~\ref{sec:algorithm}. 
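As a small illustration (not evaluated here; the \code{wine} data are introduced in the next section), two equivalent ways of raising the iteration limit and tracing the iterations could look as follows:
<<eval=FALSE>>=
## Pass control settings via '...' or via an explicit clm.control() call:
clm(rating ~ temp + contact, data = wine, maxIter = 200, trace = 1)
clm(rating ~ temp + contact, data = wine,
    control = clm.control(maxIter = 200, trace = 1))
@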

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection[Fitting a basic cumulative link model with clm]{Fitting a basic cumulative link model with \code{clm}} \label{sec:fitting-basic-clm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In the following examples we will use the wine data from \citet{randall89} available in the
object \code{wine} in package \pkg{ordinal},
cf., Table~\ref{tab:wineData}.
The data represent a factorial experiment on factors determining the
bitterness of wine with 1 = ``least bitter'' and 5 = ``most bitter''.
Two treatment factors (temperature and contact)
each have two levels. Temperature and contact between juice and
skins can be controlled when crushing grapes during wine
production. Nine judges each assessed wine from two bottles from
each of the four treatment conditions, hence there are 72
observations in all. 
The main objective is to examine the effect of contact and temperature
on the perceived bitterness of wine.

\begin{table}[t!]
  \centering
  \begin{tabular}{llrrrrr}
  \hline
  & & \multicolumn{5}{c}{Least---Most bitter} \\
  \cline{3-7}
<<echo=FALSE, results=tex>>=
## data(wine)
tab <- with(wine, table(temp:contact, rating))
mat <- cbind(rep(c("cold", "warm"), each = 2),
             rep(c("no", "yes"), 2),
             tab)
colnames(mat) <- c("Temperature", "Contact",
                   paste("~~", 1:5, sep = ""))
xtab <- xtable(mat)
print(xtab, only.contents = TRUE, include.rownames = FALSE,
      sanitize.text.function = function(x) x)
@
  \end{tabular}
  \caption{The number of ratings from nine judges in bitterness categories 1--5. Wine data from \citet{randall89} aggregated over bottles and judges.%
  \label{tab:wineData}}
\end{table}%

Initially we consider the following cumulative link model for the wine
data:
\begin{equation}
  \label{eq:CLM}
  \begin{array}{c}
    \textup{logit}(P(Y_i \leq j)) = \theta_j - \beta_1 (\mathtt{temp}_i)
    - \beta_2(\mathtt{contact}_i) \\
    i = 1,\ldots, n, \quad j = 1, \ldots, J-1
  \end{array}
\end{equation}%
%
where $\beta_1(\mathtt{temp}_i)$  attains the values $\beta_1(\mathtt{cold})$ and $\beta_1(\mathtt{warm})$, and $\beta_2(\mathtt{contact}_i)$ attains the values
$\beta_2(\mathtt{no})$ and $\beta_2(\mathtt{yes})$. The effect of temperature in this model is illustrated in Figure~\ref{fig:standard_clm}.

This is a model for the cumulative probability of the $i$th rating
falling in the $j$th category or below, where $i$ indexes
all observations ($n=72$), $j = 1, \ldots, J$ indexes the response
categories ($J = 5$) and $\theta_j$ is the intercept or threshold
for the $j$th cumulative logit: $\textup{logit}(P(Y_i \leq j))$.

Fitting the model with \code{clm} we obtain:
<<>>=
library("ordinal")
fm1 <- clm(rating ~ temp + contact, data = wine)
summary(fm1)
@
The \code{summary} method prints basic information about the fitted model.
% most of which is self explanatory.
%
The primary result is the coefficient table with parameter estimates,
standard errors and Wald based $p$~values for tests of the
parameters being zero. If one of the flexible link functions (\code{link = "log-gamma"} or \code{link = "Aranda-Ordaz"}) is used, a coefficient table for the link parameter, $\lambda$, is also included.
The maximum likelihood estimates of the model coefficients are:%
%
\begin{equation}  \label{eq:parameters}
\begin{gathered}
  \hat\beta_1(\mathtt{warm} - \mathtt{cold})= 2.50, 
  ~~\hat\beta_2(\mathtt{yes} - \mathtt{no}) = 1.53, \\
  \{\hat\theta_j\} = \{-1.34,~ 1.25,~ 3.47,~ 5.01\}.
\end{gathered}
\end{equation}
%
The coefficients for \code{temp} and \code{contact} are positive
indicating that higher temperature and contact increase the
bitterness of wine, i.e., rating in higher categories is more likely.
%
Because treatment contrasts, the default in \proglang{R}, were used, $\{\hat\theta_j\}$ refers to the thresholds at the setting with 
$\mathtt{temp}_i = \mathtt{cold}$ and $\mathtt{contact}_i = \mathtt{no}$.
% 
Three natural and complementing interpretations of this model are
%
\begin{enumerate}
  \item The thresholds $\{ \hat\theta_j \}$ at $\mathtt{contact}_i = \mathtt{yes}$ conditions have been shifted a constant amount $1.53$ relative to the thresholds $\{ \hat\theta_j \}$ at $\mathtt{contact}_i = \mathtt{no}$ conditions.
  \item The location of the latent distribution has been shifted $+1.53 \sigma^*$ (scale units) at $\mathtt{contact}_i = \mathtt{yes}$ relative to $\mathtt{contact}_i = \mathtt{no}$.
  \item The odds ratio of bitterness being rated in category $j$ or above ($\mathrm{OR}(Y \geq j)$) is $\exp(\hat\beta_2(\mathtt{yes} - \mathtt{no})) = 4.61$.
\end{enumerate}
%
Note that there are no $p$~values displayed for the threshold coefficients because it usually does not make sense to test the hypothesis that they equal zero. 
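The odds ratio in the third interpretation can be computed directly from the \code{beta} component of the fit, which holds the regression coefficients:
<<>>=
round(exp(fm1$beta), 2)
@
This reproduces the odds ratio for \code{contact} given above together with the corresponding odds ratio for \code{temp}.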

\begin{figure}
  \centering
  \includegraphics[width=6cm]{./static_figs/fig-fig2} 
  \caption{Illustration of the effect of temperature in the standard cumulative link model in Equation~\ref{eq:CLM} for the wine data in Table~\ref{tab:wineData} through a latent variable interpretation.\label{fig:standard_clm}}
\end{figure}

The number of Newton-Raphson iterations is given below \code{niter}
with the number of step-halvings in parentheses.
\code{max.grad} is the maximum absolute gradient of the
log-likelihood function with respect to the parameters. 
%
The condition number of the Hessian (\code{cond.H}) is well below $10^4$ and so does not indicate a problem with the model.
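Being the ratio of the largest to the smallest eigenvalue of the Hessian (cf. appendix~\ref{sec:algorithm}), the condition number can be reproduced from the fit; a small sketch assuming the Hessian of the negative log-likelihood is stored in the \code{Hessian} component of the fit (the default behavior):
<<>>=
## Condition number of the Hessian as the ratio of extreme eigenvalues:
ev <- eigen(fm1$Hessian, only.values = TRUE)$values
max(ev) / min(ev)
@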

The \code{anova} method produces an analysis of deviance (ANODE) table also based on Wald $\chi^2$-tests and provides tables with type I, II and III hypothesis tests using the \proglang{SAS} definitions. A type I table, the \proglang{R} default for linear models fitted with \code{lm}, sequentially tests terms from first to last; type II tests attempt to respect the principle of marginality and test each term after all others while ignoring higher order interactions; type III tables are based on orthogonalized contrasts, and tests of main effects or lower order terms can often be interpreted as averaged over higher order terms. Note that in this implementation any type of contrasts (e.g., \code{contr.treatment} or \code{contr.SAS} as well as \code{contr.sum}) can be used to produce type III tests. For further details on the interpretation and definition of type I, II and III tests, see \citet{kuznetsova17} and \citet{SAStype}.

Here we illustrate with a type III ANODE table, which in this case is equivalent to type I and II tables since the variables are balanced:
<<>>=
anova(fm1, type = "III")
@

Likelihood ratio tests, though asymptotically equivalent to the Wald tests, usually better reflect the evidence in the data. These tests can be obtained by comparing nested models with the \code{anova} method, for example, the likelihood ratio test of \code{contact} is
<<>>=
fm2 <- clm(rating ~ temp, data = wine)
anova(fm2, fm1)
@
which in this case produces a slightly lower $p$~value.
Equivalently we can use \code{drop1} to obtain likelihood ratio
tests of the explanatory variables while \emph{controlling} for the
remaining variables:
<<>>=
drop1(fm1, test = "Chi")
@
Likelihood ratio tests of the explanatory variables while
\emph{ignoring} the remaining variables are provided by the
\code{add1} method:
<<>>=
fm0 <- clm(rating ~ 1, data = wine)
add1(fm0, scope = ~ temp + contact, test = "Chi")
@
%
Confidence intervals of the parameter estimates are provided by the \code{confint} method, which by default computes the so-called profile likelihood confidence intervals:
<<>>=
confint(fm1)
@

The cumulative link model in Equation~\ref{eq:CLM} assumes that the
thresholds, $\{\theta_j\}$, are constant for all values of the
remaining explanatory variables, here \code{temp} and
\code{contact}. This is generally referred to as the
\emph{proportional odds assumption} or \emph{equal slopes
  assumption}.
We can relax this assumption in two general ways: with nominal effects and with scale effects, examples of which will now be presented in turn.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Partial and non-proportional odds: nominal effects}
\label{sec:nominal-effects}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The CLM in Equation~\ref{eq:CLM} specifies a structure in which the
regression parameters, $\bm\beta$, are not allowed to vary with $j$, or equivalently that the threshold parameters $\{\theta_j\}$ are not 
allowed to depend on regression variables.
In the following model this assumption is relaxed and the threshold parameters are allowed to depend on \code{contact}. This leads to so-called partial proportional odds for \code{contact}:
%
\begin{equation}
  \label{eq:CLM_nominal}
  \begin{array}{c}
    \textup{logit}(P(Y_i \leq j)) =
    \theta_j + \tilde{\beta}_{j} (\mathtt{contact}_i) - \beta (\mathtt{temp}_i)
    \\
    i = 1,\ldots, n, \quad j = 1, \ldots, J-1
  \end{array}
\end{equation}
%
One way to view this model is to think of two sets of thresholds being applied at conditions with and without contact as illustrated in Figure~\ref{fig:clm_nominal}. 
The model is specified as follows with \code{clm}:
<<>>=
fm.nom <- clm(rating ~ temp, nominal = ~ contact, data = wine)
summary(fm.nom)
@
As can be seen from the output of \code{summary}, no regression
coefficient is estimated for \code{contact}; instead, additional
threshold coefficients are estimated.
%
The naming and meaning of the threshold coefficients depend on the contrast coding applied to \code{contact}. Here the \proglang{R} default treatment contrasts (\code{"contr.treatment"}) are used.

Here the coefficients translate to the following parameter functions:
\begin{equation}  \label{eq:nom_parameters}
\begin{gathered}
  \hat\beta(\mathtt{warm} - \mathtt{cold})= 2.52, \\
  \{\hat\theta_j\} = \{-1.32,~ 1.25,~ 3.55,~ 4.66\}, \\
  \{ \hat{\tilde{\beta}}_j(\mathtt{yes} - \mathtt{no}) \} = 
  \{-1.62,~ -1.51,~ -1.67,~ -1.05\}.
\end{gathered}
\end{equation}
%
Again $\{ \hat\theta_j \}$ refer to the thresholds at $\mathtt{temp}_i = \mathtt{cold}$ and $\mathtt{contact}_i = \mathtt{no}$ settings while the thresholds at $\mathtt{temp}_i = \mathtt{cold}$ and $\mathtt{contact}_i = \mathtt{yes}$ are $\{ \hat\theta_j + \hat{\tilde{\beta}}_j(\mathtt{yes} - \mathtt{no}) \}$. 
%
The odds ratio of bitterness being rated in category $j$ or above ($\mathrm{OR}(Y \geq j)$) now depend on $j$: $\{\exp(-\hat{\tilde{\beta}}_j(\mathtt{yes} - \mathtt{no}))\} = \{ 5.03,~ 4.53,~ 5.34,~ 2.86\}$.
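These odds ratios can be computed from the fit; the following sketch assumes, consistent with the summary output above, that the four nominal coefficients for \code{contact} follow the four threshold intercepts in the coefficient vector:
<<>>=
round(exp(-coef(fm.nom)[5:8]), 2)
@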
% 

\begin{figure}
  \centering
  \includegraphics[width=6cm]{./static_figs/fig-figNom2} 
  \caption{Illustration of nominal effects leading to different sets of thresholds being applied for each level of \code{contact} in a latent variable interpretation, cf., Equation~\ref{eq:CLM_nominal}.\label{fig:clm_nominal}}
\end{figure}

The resulting thresholds for each level of \code{contact}, i.e., the estimated $\bm\Theta$-matrix can be extracted with:
<<>>=
fm.nom$Theta
@
As part of the convergence checks, \code{clm} checks the validity of $\bm\Theta$, i.e., that each row of the threshold matrix is non-decreasing.


We can perform a likelihood ratio test of the 
proportional odds assumption for \code{contact} by comparing the
likelihoods of models (\ref{eq:CLM}) and (\ref{eq:CLM_nominal}) as
follows:
<<>>=
anova(fm1, fm.nom)
@
There is only a small difference in the log-likelihoods of the two
models and the test is insignificant. Thus there is no evidence
that the proportional odds assumption is violated for
\code{contact}.

It is not possible to estimate both $\beta_2(\mathtt{contact}_i)$ and $\tilde{\beta}_{j}(\mathtt{contact}_i)$ in the
same model. Consequently, variables that appear in \code{nominal}
cannot enter in \code{formula} as well. For instance, not all parameters are
identifiable in the following model:
<<>>=
fm.nom2 <- clm(rating ~ temp + contact, nominal = ~ contact, data = wine)
@
We are made aware of this when summarizing or printing the model, where the coefficient for \code{contactyes} is \code{NA}:
<<>>=
fm.nom2
@

To test the proportional odds assumption for all variables, we can use
<<>>=
nominal_test(fm1)
@
This function \emph{moves} all terms in \code{formula} to \code{nominal} and \emph{copies} all terms in \code{scale} to \code{nominal} one by one and produces an \code{add1}-like table with likelihood ratio tests of each term. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Modelling scale effects} \label{sec:scale-effects}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% 
To allow the scale of the latent variable distribution to depend on explanatory variables we could for instance consider the following model where the scale is allowed to differ between cold and warm conditions. The location of the latent distribution is allowed to depend on both temperature and contact:
\begin{equation}  \label{eq:CLM_scale_wine}
  \begin{gathered}
  \textup{logit}(P(Y_i \leq j)) = \frac{\theta_j - \beta_1 (\mathtt{temp}_i)
    - \beta_{2} (\mathtt{contact}_i)} {\exp( \zeta (\mathtt{temp}_i))} \\
  i = 1,\ldots, n, \quad j = 1, \ldots, J-1
  \end{gathered}
\end{equation}
This model structure is illustrated in Figure~\ref{fig:clm_scale} and can be estimated with:
<<>>=
fm.sca <- clm(rating ~ temp + contact, scale = ~ temp, data = wine)
summary(fm.sca)
@
In a latent variable interpretation the location of the latent distribution is shifted $2.63\sigma^*$ (scale units) from cold to warm conditions and $1.59\sigma^*$ from absence to presence of contact. The scale of the latent distribution is $\sigma^*$ at cold conditions but $\sigma^* \exp(\zeta(\mathtt{warm} - \mathtt{cold})) = \sigma^*\exp(0.095) = 1.10 \sigma^*$, i.e., 10\% higher, at warm conditions. However, observe that the $p$~value for the scale effect in the summary output shows that the ratio of scales is not significantly different from 1 (or equivalently that the difference on the log-scale is not different from 0).
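In analogy with the \code{beta} component holding the regression coefficients, the scale coefficients are stored in the \code{zeta} component of the fit (an assumption about component naming), so the estimated ratio of scales can be extracted with:
<<>>=
round(exp(fm.sca$zeta), 2)
@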

Scale effects offer an alternative to nominal effects (partial proportional odds) when non-proportional odds structures are encountered in the data. Using scale effects is often a better approach because the model is well-defined for all values of the explanatory variables, irrespective of translocation and scaling of covariates. Scale effects also use fewer parameters, which often leads to more sensitive tests than nominal effects. Potential scale effects of variables already included in \code{formula} can be discovered using \code{scale_test}. This function adds each model term in \code{formula} to \code{scale} in turn and reports the likelihood ratio statistic in an \code{add1}-like fashion:
<<>>=
scale_test(fm1)
@

\code{confint} and \code{anova} methods apply with no change to models with scale and nominal parts, but
\code{drop1}, \code{add1} and \code{step} methods will only drop
or add terms to the (location) \code{formula}.

\begin{figure}
  \centering
  \includegraphics[width=6cm]{./static_figs/fig-figSca} 
  \caption{Illustration of scale effects leading to different scales of the latent variable, cf., Equation~\ref{eq:CLM_scale_wine}.\label{fig:clm_scale}}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Structured thresholds} \label{sec:threshold-effects}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


In section~\ref{sec:nominal-effects} nominal effects were described where the assumption that regression parameters have the same effect across all thresholds was relaxed. In
this section additional restrictions on the
thresholds will be imposed instead. 
The following model requires that the thresholds,
$\{ \theta_j \}$, are equidistant or equally spaced. This allows us to assess an assumption that judges are using the response scale in such a way that there is the same distance between adjacent response categories, i.e., that $\theta_j - \theta_{j-1} = \textup{constant}$ for $j = 2, \ldots, J-1$. The effect of equidistant thresholds is illustrated in Figure~\ref{fig:clm_structured_thresholds} and the model can be fitted with:
<<>>=
fm.equi <- clm(rating ~ temp + contact, data = wine,
               threshold = "equidistant")
summary(fm.equi)
@
The parameters determining the thresholds are now the first threshold (\code{threshold.1})
and the spacing among consecutive thresholds (\code{spacing}). The mapping to this
parameterization is stored in the transpose of the Jacobian matrix
(\code{tJac}) component of the model fit. This makes it possible to
extract the thresholds imposed by the equidistance structure with
<<>>=
drop(fm.equi$tJac %*% coef(fm.equi)[c("threshold.1", "spacing")])
@
These thresholds are in fact already stored in the \code{Theta} component of the 
model fit.
%
The following shows that the average distance between consecutive thresholds
in \code{fm1}, which did not restrict the thresholds, is very close to the \code{spacing} parameter from \code{fm.equi}:
<<>>=
mean(diff(coef(fm1)[1:4]))
@

One advantage of imposing additional restrictions on the thresholds is the
use of fewer parameters. Whether the restrictions are warranted by the
data can be assessed in a likelihood ratio test:
<<>>=
anova(fm1, fm.equi)
@
In this case the test is non-significant, so there is no considerable
loss of fit from saving two parameters; hence we may retain
the model with equally spaced thresholds.

Note that the shape of the latent distribution (determined by the choice of link function) also affects the distances between the thresholds. If thresholds are equidistant under a normal distribution (i.e., with the probit link) they will in general\footnote{The exception is perfect fits such as CLMs with flexible thresholds and no predictors where models have the same likelihood irrespective of link function.} not be equidistant under a differently shaped latent distribution such as a skew latent distribution (e.g., with the log-log or clog-log link). 

\begin{figure}
  \centering
  \includegraphics[width=6cm]{./static_figs/fig-figFlex} 
  \includegraphics[width=6cm]{./static_figs/fig-figEqui} 
  \caption{Illustration of flexible (left) and equidistant (right) thresholds being applied in a cumulative link model in a latent variable interpretation.\label{fig:clm_structured_thresholds}}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Scale effects, nominal effects and link functions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

This section presents an example that connects aspects of scale effects, nominal effects and link functions. The example is based on the \code{soup} data available in the \pkg{ordinal} package. This dataset represents a sensory discrimination study of packet soup in which 185 respondents assessed a reference product and one of 5 test products on an ordinal sureness-scale with 6 levels from ``reference, sure'' to ``test, sure''.

The two key explanatory variables in this example are \code{PRODID} and \code{PROD}. \code{PRODID} identifies all 6 products while \code{PROD} distinguishes test and reference products:
<<>>=
with(soup, table(PROD, PRODID))
@

The so-called bi-normal model plays a special role in the field of signal detection theory \citep{decarlo98, macmillan05} and in sensometrics \citep{christensen11} and assumes the existence of normal latent distributions potentially with different variances. The bi-normal model can be fitted to ordinal data by identifying it as a CLM with a probit link. The following bi-normal model assumes that the location of the normal latent distribution depends on \code{PRODID} while the scale only varies with \code{PROD}:
<<>>=
fm_binorm <- clm(SURENESS ~ PRODID, scale = ~ PROD, 
                 data = soup, link = "probit")
summary(fm_binorm)
@
Here we observe significant differences in scale for reference and test products; this is an example of what would have been denoted non-proportional odds had the link function been the logit function. In this context differences in scale are interpreted to mean that a location shift of the latent normal distribution is not enough to represent the data. Another test of such non-location effects is provided by the nominal effects:
<<>>=
fm_nom <- clm(SURENESS ~ PRODID, nominal = ~ PROD, 
              data = soup, link = "probit")
@

A comparison of these models shows that the scale effects increase the likelihood substantially using only one extra parameter. The addition of nominal effects provides a smaller increase in likelihood using three extra parameters:
<<>>=
fm_location <- update(fm_binorm, scale = ~ 1)
anova(fm_location, fm_binorm, fm_nom)
@
Note that both the location-only and bi-normal models are nested under the model with nominal effects, making these models comparable in likelihood ratio tests. This example illustrates an often-seen phenomenon: models allowing for scale differences frequently capture the majority of the deviations from location-only effects that could otherwise be captured by nominal effects, while using fewer parameters.

The role of link functions in relation to the evidence of non-location effects is also illustrated by this example. If we consider the complementary log-log link it is apparent that there is no evidence of scale differences. Furthermore, the likelihood of a complementary log-log model with constant scale is almost the same as that of the bi-normal model:
<<>>=
fm_cll_scale <- clm(SURENESS ~ PRODID, scale = ~ PROD, 
                    data = soup, link = "cloglog")
fm_cll <- clm(SURENESS ~ PRODID, 
              data = soup, link = "cloglog")
anova(fm_cll, fm_cll_scale, fm_binorm)
@

Using the log-gamma link we can also confirm that a left-skewed latent distribution ($\lambda > 0$) is best supported by the data and that the estimate of $\lambda$ is close to 1, at which the complementary log-log link is obtained:
<<>>=
fm_loggamma <- clm(SURENESS ~ PRODID, data = soup, link = "log-gamma")
summary(fm_loggamma)
@
The analysis of link functions shown here can be thought of as providing a framework analogous to that of Box-Cox transformations for linear models. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Profile likelihood}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In addition to facilitating the generally quite accurate profile likelihood confidence intervals which were illustrated in section~\ref{sec:fitting-basic-clm}, the profile likelihood function can also be used to illustrate the relative importance of parameter values. 

As an example, the profile likelihood of model coefficients for \code{temp} and \code{contact} in \code{fm1} can be obtained with
% 
<<profileLikelihood, echo=TRUE>>=
pr1 <- profile(fm1, alpha = 1e-4)
plot(pr1)
@
The resulting plots are provided in Figure~\ref{fig:ProfileLikelihood}.
The \code{alpha} argument controls how
far from the maximum likelihood estimate the likelihood function
should be profiled: the profile strays no further from the MLE than the values that delimit a (\code{1 - alpha})-level profile likelihood confidence interval.

From the relative
profile likelihood in Figure~\ref{fig:ProfileLikelihood} for \code{tempwarm} we see that parameter values
between 1 and 4 are reasonably well supported by the data, and values
outside this range have little likelihood. Values between 2 and 3 are
very well supported by the data and have high likelihood.

\setkeys{Gin}{width=.45\textwidth}
\begin{figure}
  \centering
<<prof1, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=5, height=5>>=
plot(pr1, which.par = 1)
@
<<prof2, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=5, height=5>>=
plot(pr1, which.par = 2)
@
  \caption{Relative profile likelihoods for the regression parameters
    in \code{fm1} for the wine data. Horizontal lines indicate 95\% and 99\%
    confidence bounds.}
  \label{fig:ProfileLikelihood}
\end{figure}

Profiling is implemented for regression ($\beta$) and scale ($\zeta$) parameters but not available for threshold, nominal and flexible link parameters.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Assessment of model convergence}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Likelihood slices}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The maximum likelihood estimates of the parameters in cumulative link
models do not have closed form expressions, so iterative methods have
to be applied to fit the models. Further, CLMs are non-linear models and in general the likelihood function is not guaranteed to be well-behaved or even unimodal. In addition, the special role of the threshold parameters and the restriction that they be ordered can affect the appearance of the likelihood function. 

To confirm that an unequivocal optimum has been reached and that the likelihood function is reasonably well-behaved around the reported optimum we can inspect the likelihood function in a neighborhood around the reported optimum. For these purposes we can display slices of the likelihood function.

The following code produces the slices shown in Figure~\ref{fig:slice1}, which displays the shape of the log-likelihood function in a fairly wide neighborhood around the reported MLE (here we use $\lambda = 5$ curvature units) together with its quadratic approximation. 
<<>>=
slice.fm1 <- slice(fm1, lambda = 5)
par(mfrow = c(2, 3))
plot(slice.fm1)
@
Figure~\ref{fig:slice1} shows that the log-likelihood function is fairly well behaved and closely quadratic for most parameters.

\setkeys{Gin}{width=.32\textwidth}
\begin{figure}
  \centering
<<slice11, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=3, height=3>>=
plot(slice.fm1, parm = 1)
@
<<slice12, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=3, height=3>>=
plot(slice.fm1, parm = 2)
@
<<slice13, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=3, height=3>>=
plot(slice.fm1, parm = 3)
@
<<slice14, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=3, height=3>>=
plot(slice.fm1, parm = 4)
@
<<slice15, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=3, height=3>>=
plot(slice.fm1, parm = 5)
@
<<slice16, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=3, height=3>>=
plot(slice.fm1, parm = 6)
@
\caption{Slices of the (negative) log-likelihood function (solid) for
  parameters in \code{fm1} for the wine data. Dashed lines indicate
  quadratic approximations to the log-likelihood function and
  vertical bars indicate maximum likelihood estimates.}
\label{fig:slice1}
\end{figure}

Looking at the log-likelihood function much closer to the reported optimum (using $\lambda = 10^{-5}$) we can probe how accurately the parameter estimates are determined. The likelihood slices in Figure~\ref{fig:slice2}, which are produced with the following code, show that the parameters are determined accurately with at least 5 correct decimals. Slices are shown for two parameters; the slices for the remaining 4 parameters are very similar.
<<slice2, echo=TRUE, results=hide>>=
slice2.fm1 <- slice(fm1, parm = 4:5, lambda = 1e-5)
par(mfrow = c(1, 2))
plot(slice2.fm1)
@

\setkeys{Gin}{width=.45\textwidth}
\begin{figure}
  \centering
<<slice24, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=5, height=5>>=
plot(slice2.fm1, parm = 1)
@
<<slice25, echo=FALSE, results=hide, fig=TRUE, include=TRUE, width=5, height=5>>=
plot(slice2.fm1, parm = 2)
@
  \caption{Slices of the (negative) log-likelihood function (solid) for
    parameters in \code{fm1} for the wine data very close
    to the MLEs. Dashed lines (indistinguishable from the solid lines) indicate
    quadratic approximations to the log-likelihood function and
    vertical bars indicate the maximum likelihood estimates.}
\label{fig:slice2}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Parameter accuracy}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

As discussed in section~\ref{sec:algorithm} the method independent error estimate provides an assessment of the accuracy with which the ML estimates of the parameters have been determined by the fitting algorithm. This error estimate is implemented in the \code{convergence} method which we now illustrate on a model fit:
<<>>=
convergence(fm1)
@
The most important information is the number of correct decimals
(\code{Cor.Dec}) and the number of significant digits
(\code{Sig.Dig}) with which the parameters are determined. In this
case all parameters are very accurately determined, so there is no
reason to lower the convergence tolerance. The \code{logLik.error}
shows that the error in the reported value of the log-likelihood is
below $10^{-10}$, which is by far small enough that likelihood
ratio tests based on this model are accurate.

Note that the assessment of the number of correctly determined decimals and significant digits is only reliable sufficiently close to the optimum, so in practice we caution against relying on it if the algorithm did not converge successfully.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Fitted values and predictions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Several types of fitted values and predictions can be extracted from a CLM depending on how it is viewed. 

By \emph{fitted values} we denote the values ($i=1, \ldots, n$)
\begin{equation*}
  \hat{\tilde\pi}_i = \tilde\pi_i(\hat{\bm\psi})
\end{equation*}
that is, the value of $\tilde\pi_i$, cf., Equation~\ref{eq:clm-log-likelihood} evaluated at the ML estimates $\hat{\bm\psi}$. These are the values returned by the \code{fitted} and \code{fitted.values} extractor methods and stored in the \code{fitted.values} component of the model fit.
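For example, the fitted probabilities of the observed ratings for the first few observations in \code{fm1} are:
<<>>=
head(fitted(fm1))
@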

The values of $\pi_{ij}$ (cf., Equation~\ref{eq:multinom_pmf}) evaluated at the ML estimates of the parameters (i.e., $\hat\pi_{ij}$) can also be thought of as fitted values for the multinomially distributed variable $\bm Y_i^*$. These values can be obtained from the model fit by use of the \code{predict} method:
<<>>=
head(pred <- predict(fm1, newdata = subset(wine, select = -rating))$fit)
@
Note that the original data set should be supplied in the \code{newdata} argument \emph{without} the response variable (here \code{rating}). If the response variable 
is \emph{present} in \code{newdata}, predictions are produced for only those rating categories which were observed, and we get back the fitted values:
<<>>=
stopifnot(isTRUE(all.equal(fitted(fm1), t(pred)[
  t(col(pred) == wine$rating)])),
  isTRUE(all.equal(fitted(fm1), predict(fm1, newdata = wine)$fit)))
@
Class predictions are also available and defined here as the response class with the highest probability, that is, for the $i$'th observation the class prediction is the mode of $\bm\pi_{i}$. To obtain class predictions use \code{type = "class"} as illustrated in the following small table:
<<>>=
newData <- expand.grid(temp    = levels(wine$temp),
                       contact = levels(wine$contact))
cbind(newData, round(predict(fm1, newdata = newData)$fit, 3),
      "class" = predict(fm1, newdata = newData, type = "class")$fit)
@
Other definitions of class predictions can be applied, e.g., nearest mean predictions:
<<>>=
head(apply(pred, 1, function(x) round(weighted.mean(1:5, x))))
@
which in this case happens to be identical to the default class predictions.
<<echo=FALSE, results=hide>>=
p1 <- apply(predict(fm1, newdata = subset(wine, select=-rating))$fit, 1,
            function(x) round(weighted.mean(1:5, x)))
p2 <- as.numeric(as.character(predict(fm1, type = "class")$fit))
stopifnot(isTRUE(all.equal(p1, p2, check.attributes = FALSE)))
@

Standard errors and confidence intervals of predictions are also
available, for example:
<<>>=
predictions <- predict(fm1, se.fit = TRUE, interval = TRUE)
head(do.call("cbind", predictions))
@
where the default 95\% confidence level can be changed with the \code{level} argument.

Here the standard errors of fitted values or predictions, $\hat{\tilde{\pi}} = \tilde{\pi}(\hat{\bm\psi})$, are obtained by application of the delta method:
\begin{equation*}
  \mathsf{Var}(\hat{\tilde{\bm\pi}}) = \bm C \mathsf{Var}(\hat{\bm\psi}) \bm C^\top, 
    \quad 
    \bm C = \frac{\partial \tilde{\bm\pi}(\bm\psi)}{\partial \bm\psi} 
       \Big|_{\bm\psi = \hat{\bm\psi}}
\end{equation*}
where $\mathsf{Var}(\hat{\bm\psi})$ is the estimated variance-covariance matrix of the parameters $\bm\psi$ evaluated at the ML estimates $\hat{\bm\psi}$ as given by the observed Fisher Information matrix and finally the standard errors are extracted as the square root of the diagonal elements of $\mathsf{Var}(\hat{\tilde{\bm\pi}})$.

Since symmetric confidence intervals for probabilities are not appropriate (unless perhaps they are close to one half), a more generally applicable approach is to form symmetric Wald intervals on the logit scale and then transform the confidence bounds back to the probability scale. \code{predict.clm} takes this approach and computes the standard error of $\hat\kappa_i = \mathrm{logit}(\hat{\tilde{\pi}}_i)$ by yet another application of the delta method:
\begin{equation*}
  \mathrm{se}(\hat\kappa_i) = 
     \frac{\partial g(\hat{\tilde{\pi}}_i)}{\partial \hat{\tilde{\pi}}_i} 
       \mathrm{se}(\hat{\tilde{\pi}}_i) = \frac{\mathrm{se}(\hat{\tilde{\pi}}_i)}{%
         \hat{\tilde{\pi}}_i(1 - \hat{\tilde{\pi}}_i)}, \quad
    g(\hat{\tilde{\pi}}_i) = \log \frac{\hat{\tilde{\pi}}_i}{1 - \hat{\tilde{\pi}}_i}.
\end{equation*}
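As an illustration of this construction, the confidence bounds shown above can be reconstructed (at the default 95\% level) by transforming to the logit scale and back; this sketch only uses the \code{fit} and \code{se.fit} components from the \code{predict} call above:
<<>>=
## Wald interval on the logit scale, transformed back to probabilities:
kap <- qlogis(predictions$fit)
se.kap <- predictions$se.fit / (predictions$fit * (1 - predictions$fit))
head(cbind(lwr = plogis(kap - qnorm(0.975) * se.kap),
           upr = plogis(kap + qnorm(0.975) * se.kap)))
@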


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Model identifiability}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Unidentifiable models or unidentifiable parameters may occur in CLMs for several reasons, some of which are special to the model class. In this section we describe issues around model identifiability and how these are handled by \code{ordinal::clm}. 

Material in the remainder of this section is generally on a more advanced level than up to now.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Complete separation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In binary logistic regression the issue of \emph{complete separation} is well known. This may happen, for example, if only ``success'' or only ``failure'' is observed for a level of a treatment factor. In CLMs the issue may appear even when outcomes are observed in more than one response category. This can be illustrated using the \code{wine} data set if we combine the three central categories:
<<>>=
wine <- within(wine, {
  rating_comb3 <- factor(rating, labels = c("1", "2-4", "2-4", "2-4", "5"))
}) 
ftable(rating_comb3 ~ temp, data = wine)
fm.comb3 <- clm(rating_comb3 ~ temp, data = wine)
summary(fm.comb3)
@
Here the true ML estimates of the coefficients for \code{temp} and the second threshold are at infinity but the algorithm in \code{clm} terminates when the likelihood function is sufficiently flat. This means that the reported values of the coefficients for \code{temp} and the second threshold are arbitrary and will change if the convergence criteria are changed or a different optimization method is used. The standard errors of the coefficients are not available because the Hessian is effectively singular and so cannot be inverted to produce the variance-covariance matrix of the parameters. The ill-determined nature of the Hessian is seen from the very large condition number of the Hessian, \code{cond.H}. 

Note, however, that while the model parameters cannot be uniquely determined, the likelihood of the model is well defined and as such it can be compared to the likelihoods of other models. For example, we could compare it to a model that excludes \code{temp}:
<<>>=
fm.comb3_b <- clm(rating_comb3 ~ 1, data = wine)
anova(fm.comb3, fm.comb3_b)
@
The difference in log-likelihood is substantial; however, the criteria for the validity of the likelihood ratio test are not fulfilled, so the $p$~value should not be taken at face value.

The complete-separation issue may also appear in less obvious situations. If, for example, the following model is considered allowing for nominal effects of \code{temp} the issue shows up:
<<>>=
fm.nom2 <- clm(rating ~ contact, nominal = ~ temp, data = wine)
summary(fm.nom2)
@

Analytical detection of which coefficients suffer from unidentifiability due to \emph{complete separation} is a topic for future research and therefore unavailable in current versions of \pkg{ordinal}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Aliased coefficients} 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Aliased coefficients can occur in all kinds of models that build on a design matrix including linear models as well as generalized linear models. \code{lm} and \code{glm} determine the rank deficiency of the design matrix using the rank-revealing implementation of the QR-decomposition in \code{LINPACK} and display the aliased coefficients as \code{NA}s\footnote{If \code{singular.ok = TRUE}, which is the default.}. Though the QR decomposition is not used during iterations in \code{clm}, it is used initially to determine aliased coefficients. An example is provided using the \code{soup} data available in the \pkg{ordinal} package:
<<>>=
fm.soup <- clm(SURENESS ~ PRODID * DAY, data = soup)
summary(fm.soup)
@
The source of the singularity is revealed in the following table:
<<>>=
with(soup, table(DAY, PRODID))
@
which shows that the third \code{PRODID} was not presented on the second day.

The issue of aliased coefficients extends in CLMs to nominal effects since the joint design matrix for location and nominal effects will be singular if the same variables are included in both location and nominal formulae. \code{clm} handles this by not estimating the offending coefficients in the location formula as illustrated with the \code{fm.nom2} model fit in section~\ref{sec:nominal-effects}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Over-parameterization} \label{sec:over-parameterization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The scope of model structures allowed in \code{clm} makes it possible to specify models which are over-parameterized in ways that do not lead to rank-deficient design matrices and as such are not easily detected before fitting the model. An example is given here which includes both additive (location) and multiplicative (scale) effects of \code{contact} for a binomial response variable, but the issue can also occur with more than two response categories:
<<>>=
wine <- within(wine, {
  rating_comb2 <- factor(rating, labels = c("1-2", "1-2", "3-5", "3-5", "3-5"))
}) 
ftable(rating_comb2 ~ contact, data = wine)
fm.comb2 <- clm(rating_comb2 ~ contact, scale = ~ contact, data = wine)
summary(fm.comb2)
@

<<echo=false, results=hide>>=
## Example with unidentified parameters with 3 response categories
## not shown in paper:
wine <- within(wine, {
  rating_comb3b <- rating
  levels(rating_comb3b) <- c("1-2", "1-2", "3", "4-5", "4-5")
}) 
wine$rating_comb3b[1] <- "4-5" # Remove the zero here to avoid inf MLE
ftable(rating_comb3b ~ temp + contact, data = wine)

fm.comb3_c <- clm(rating_comb3b ~ contact * temp, 
                  scale   = ~contact * temp, 
                  nominal = ~contact, data = wine)
summary(fm.comb3_c)
convergence(fm.comb3_c)
@


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Customized modelling} \label{sec:customized-modelling}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Using the \code{doFit} argument, \code{clm} can be instructed to return a \emph{model environment} that we denote \code{rho}:
<<>>=
rho <- update(fm1, doFit = FALSE)
names(rho)
@
This environment holds a complete specification of the cumulative link model including design matrices \code{B1}, \code{B2}, \code{S} and other components. The environment also contains the cumulative distribution function that defines the inverse link function (\code{pfun}) as well as its first and second derivatives, i.e., the corresponding density function (\code{dfun}) and gradient (\code{gfun}). Of direct interest here is the parameter vector \code{par} and functions that readily evaluate the negative log-likelihood (\code{clm.nll}), its gradient with respect to the parameters (\code{clm.grad}) and the Hessian (\code{clm.hess}). The negative log-likelihood and the gradient at the starting values are therefore
<<>>=
rho$clm.nll(rho)
c(rho$clm.grad(rho))
@
Similarly at the MLE they are:
<<>>=
rho$clm.nll(rho, par = coef(fm1))
print(c(rho$clm.grad(rho)), digits = 3)
@
Note that the gradient function \code{clm.grad} assumes that \code{clm.nll} has been evaluated at the current parameter values; similarly, \code{clm.hess} assumes that \code{clm.grad} has been evaluated at the current parameter values. The NR algorithm in \pkg{ordinal} takes advantage of this so as to minimize the computational load. 

If interest is in fitting a \emph{custom} CLM with, say, restrictions on the parameter space, this can be achieved by combining a general-purpose optimizer with the functions \code{clm.nll} and, optionally, \code{clm.grad}. Assume, for instance, that we know the regression parameters can be no larger than 2; then the model can be fitted with the following code:
<<>>=
nll <- function(par, envir) {
  envir$par <- par
  envir$clm.nll(envir)
}
grad <- function(par, envir) {
  envir$par <- par
  envir$clm.nll(envir)
  envir$clm.grad(envir)
}
nlminb(rho$par, nll, grad, upper = c(rep(Inf, 4), 2, 2), envir = rho)$par
@


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Constrained partial proportional odds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

A type of model not implemented in full generality in \pkg{ordinal} is the so-called \emph{constrained} partial proportional odds model proposed by \citet{peterson90}. These models impose restrictions on the nominal effects considered in section~\ref{sec:nominal-effects} and are well suited to illustrate the customizable modelling options available in the \pkg{ordinal} package. We consider an example from \citet{peterson90} in which disease status is tabulated by smoking status:
<<>>=
artery <- data.frame(disease = factor(rep(0:4, 2), ordered = TRUE),
                     smoker  = factor(rep(c("no", "yes"), each = 5)),
                     freq    = c(334, 99, 117, 159, 30, 350, 307, 
                                 345, 481, 67))
addmargins(xtabs(freq ~ smoker + disease, data = artery), margin = 2)
@

The overall odds-ratio of smoking is
<<>>=
fm <- clm(disease ~ smoker, weights = freq, data = artery)
exp(fm$beta)
@
showing that, overall, the odds of a worse disease rating are twice as high for smokers as for non-smokers. 

Allowing for nominal effects we see that the log odds-ratio for smoking clearly changes with disease status, and that it does so in an almost linearly decreasing manner:
<<>>=
fm.nom <- clm(disease ~ 1, nominal = ~ smoker, weights = freq, 
              data = artery, sign.nominal = "negative")
coef(fm.nom)[5:8]
@
\citet{peterson90} suggested a model which restricts the log odds-ratios to be linearly decreasing with disease status modelling only the intercept (first threshold) and slope of the log odds-ratios:
<<>>=
coef(fm.lm <- lm(I(coef(fm.nom)[5:8]) ~ I(0:3)))
@
We can implement the log-likelihood of this model as follows. As starting values we combine parameter estimates from \code{fm.nom} and the linear model \code{fm.lm}, and finally optimize the log-likelihood utilizing the \code{fm.nom} model environment:
<<>>=
nll2 <- function(par, envir) {
  envir$par <- c(par[1:4], par[5] + par[6] * (0:3))
  envir$clm.nll(envir)
}
start <- unname(c(coef(fm.nom)[1:4], coef(fm.lm)))
fit <- nlminb(start, nll2, envir = update(fm.nom, doFit = FALSE))
round(fit$par[5:6], 2)
@
Thus the log odds-ratio decreases linearly from 1.02 at the first threshold (between the first two disease categories) by 0.3 per disease category.




%% -- Summary/conclusions/discussion -------------------------------------------

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Conclusions} \label{sec:conclusions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

This paper has described the class of cumulative link models for the analysis of ordinal data and the implementation of such models in the \proglang{R} package \pkg{ordinal}. It is shown how the package supports model building and assessment of CLMs with scale effects, partial proportional odds, structured thresholds and flexible link functions, and how models can be customized to specific needs. A number of examples have been given illustrating analyses of ordinal data using \code{clm} in practice. 

The significant flexibility of model structures available in \pkg{ordinal} is in one respect a clear advantage but it can also be a challenge when particular model variants turn out to be unidentifiable. Analytical detection of unidentifiable models could prove very useful in the analysis of ordinal data, but it is, unfortunately, a difficult question that remains a topic of future research.

In a wider data analysis perspective, cumulative link models have been described as a very rich model class---a class that sits, in a sense, between perhaps the two most important model classes in statistics: linear models and logistic regression models. The greater flexibility of CLMs relative to binary logistic regression models makes it possible to check assumptions such as the proportional odds assumption. A latent variable interpretation connects cumulative link models to linear models in a natural way and also motivates non-linear structures such as scale effects. In addition to nominal effects and the non-linear scale effects, the ordered nature of the thresholds gives rise to computational challenges that we have described here and addressed in the \pkg{ordinal} package. 

In addition to computational challenges, practical data analysis with CLMs can also be challenging. In our experience a top-down approach in which a ``full'' model is fitted and gradually simplified is often problematic, not only because this easily leads to unidentifiable models but also because there are many different ways in which models can be reduced or expanded. A more pragmatic approach is often preferred; understanding the data through plots, tables, and even linear models can aid in finding a suitable intermediate ordinal starting model. 

Attempts to identify a ``correct'' model will also often lead to frustrations; the greater the model framework, the greater the risk that there are multiple models which fit the data (almost) equally well. It is well-known statistical wisdom that with enough data many goodness of fit tests become sensitive to even minor deviations of little practical relevance. This is particularly true for tests of partial proportional odds; in the author's experience almost all CLMs on real data show some evidence of non-proportional odds for one or more variables, but it is not always the case that models with partial or non-proportional odds are the most useful. Such effects complicate the interpretation and often generalize poorly outside the observed data, and models assuming proportional odds or including scale effects are often more appropriate. 


%% -- Optional special unnumbered sections -------------------------------------

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Computational details}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The results in this paper were obtained using
\proglang{R}~\Sexpr{paste(R.Version()[6:7], collapse = ".")} with
\pkg{ordinal}, version~\Sexpr{packageVersion("ordinal")}. \proglang{R} itself
and all packages used are available from CRAN at
\url{https://CRAN.R-project.org/}.




%% -- Bibliography -------------------------------------------------------------

\bibliography{clm_article_refs}


%% -- Appendix (if any) --------------------------------------------------------

\newpage

\begin{appendix}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{A regularized Newton-Raphson algorithm with step halving}
\label{sec:algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The regularized NR algorithm is an iterative method that produces a sequence of estimates $\bm\psi^{(0)}, \ldots, \bm\psi^{(i)}, \ldots$, where parenthesized superscripts denote iterations. From the $i$th estimate, the $(i+1)$'th estimate is given by
%
\begin{equation*}
  \bm\psi^{(i+1)} = \bm\psi^{(i)} - c_1 \bm h^{(i)}, \quad 
  \bm h^{(i)} = \tilde{\bm H}(\bm\psi^{(i)}; \bm y)^{-1}
  \bm g(\bm\psi^{(i)}; \bm y)
\end{equation*}
where
\begin{equation*}
  \tilde{\bm H}(\bm\psi^{(i)}; \bm y) = \bm H(\bm\psi^{(i)}; \bm y) + 
     c_2 (c_3 + \min(\bm e^{(i)})) \bm I,
\end{equation*}
%
$\bm H(\bm\psi^{(i)} ; \bm y)$ and 
$\bm g(\bm\psi^{(i)}; \bm y)$ are the Hessian and gradient of the negative log-likelihood
function with respect to the parameters evaluated at the current
estimates; 
$\bm e^{(i)}$ is a vector of eigenvalues of $\bm H(\bm\psi^{(i)}; \bm y)$,
$\bm h^{(i)}$ is the $i$'th step, $c_1$ is a scalar parameter which controls the step 
halving, and $c_2$, $c_3$ are scalar parameters which control the regularization 
of the Hessian.

Regularization is only enforced when the Hessian is not positive definite, so $c_2 = 1$ when $\min(\bm e^{(i)}) < \tau$ and zero otherwise, where $\tau$ is an appropriate tolerance. The choice of $c_3$ is to some extent arbitrary (though required to be positive) and the algorithm in \pkg{ordinal} sets $c_3 = 1$. 

Step-halving is enforced when the full step $\bm h^{(i)}$ causes a decrease in the likelihood function in which case $c_1$ is consecutively halved, $c_1 = \frac{1}{2}, \frac{1}{4}, \frac{1}{8}, \ldots$ until the step $c_1 \bm h^{(i)}$ is small enough to cause an increase in the likelihood or until the maximum allowed number of consecutive step-halvings has been reached.
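To fix ideas, the following is a minimal sketch of such a regularized NR iteration with step halving for a generic negative log-likelihood \code{nll} with gradient \code{grad} and Hessian \code{hess}. It is an illustration of the scheme above, not \pkg{ordinal}'s actual implementation, and it uses a simplified regularization that shifts the smallest eigenvalue of the Hessian to a small positive value:
<<eval=FALSE>>=
nr_sketch <- function(par, nll, grad, hess, maxIter = 100L,
                      gradTol = 1e-6, maxLineIter = 15L) {
  for (i in seq_len(maxIter)) {
    g <- grad(par)
    if (max(abs(g)) < gradTol) break   # absolute convergence criterion
    H <- hess(par)
    e <- eigen(H, symmetric = TRUE, only.values = TRUE)$values
    if (min(e) < 1e-5)                 # regularize if not positive definite
      H <- H + (1e-3 - min(e)) * diag(length(par))
    h <- drop(solve(H, g))             # (regularized) NR step
    c1 <- 1
    while (nll(par - c1 * h) > nll(par) && c1 > 2^-maxLineIter)
      c1 <- c1 / 2                     # step halving
    par <- par - c1 * h
  }
  par
}
@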

The algorithm in \pkg{ordinal} also deals with a couple of numerical issues that may occur. For example, the likelihood function may be sufficiently flat that the change in log-likelihood is smaller than what can be represented in double precision, and so, while the new parameters may be closer to the true ML estimates and be associated with a smaller gradient, it is not possible to measure progress by the change in log-likelihood.

The NR algorithm in \pkg{ordinal} has two convergence criteria: (1) an absolute criterion requesting that $\max | \bm g(\bm\psi^{(i)}; \bm y) | < \tau_1$ and (2) a relative criterion requesting that $\max | \bm h^{(i)} | < \tau_2$ where the default thresholds are $\tau_1 = \tau_2 = 10^{-6}$.

Here the first criterion attempts to establish closeness of $\bm\psi^{(i)}$ to the true ML estimates in absolute terms; the second criterion is an estimate of the relative closeness to the true ML estimates.
%
Both convergence criteria are needed if both small (e.g., $\approx 0.0001$) and large (e.g., $\approx 1000$) parameter estimates are to be determined accurately with an appropriate number of correct decimals as well as significant digits.

The NR algorithm in \pkg{ordinal} attempts to satisfy the absolute criterion first and will then only attempt to satisfy the relative criterion if it can take the full un-regularized NR step and then only for a maximum of 5 steps.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Convergence properties and parameter accuracy}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Convergence to a well-defined optimum is achieved when the gradient of the negative log-likelihood function with respect to the parameters is small and the Hessian is positive definite, i.e., has only positive eigenvalues bounded away from zero. 
%
Identifiability problems occur when the likelihood function is flat in directions of one or more parameters (or linear functions of the parameters) while being well-defined, i.e., pointy, in other directions. 
It may happen that a parameter is exactly unidentifiable, and \code{clm} is in some cases (including rank-deficient design matrices) able to detect this and exclude the parameter from the optimization procedure. In other cases the likelihood is almost flat in one or more directions. These cases are not uncommon in practice, and it is then not possible to reduce the parameter space before optimizing the model. 
To measure the degree of empirical identifiability \code{clm} reports the condition number of the Hessian which is the ratio of the largest to the smallest eigenvalue. 
A large condition number of the Hessian does not necessarily mean there is a
problem with the model, but it can indicate one. A small condition number of the Hessian, say smaller than about $10^4$ or $10^6$, on the
other hand, is a good assurance that a well-defined optimum has been reached.

A key problem for optimization methods is when to stop iterating: when have the parameters that determine the optimum of the function been found with sufficient accuracy? The \emph{method independent error estimate} \citep{elden04} provides a way to approximate the error in the parameter estimates. Sufficiently close to the optimum the Newton-Raphson step provides this estimate:
\begin{equation*}
  |\hat{\bm\alpha}^{(i)} - \bm\alpha^*| \lesssim \bm h^{(i)}, \quad 
  \bm h^{(i)} = \bm H(\bm\psi^{(i)}; \bm y)^{-1} \bm g(\bm\psi^{(i)}; \bm y)
\end{equation*}
where $\bm\alpha^*$ is the exact (but unknown) value of the ML estimate, $\hat{\bm\alpha}^{(i)}$ is the ML estimator of $\bm\alpha$ at the $i$'th iteration and $\bm h^{(i)}$ is the full unregularized NR step at the $i$'th iteration. 
%
Since the gradient and Hessian of the negative log-likelihood function with respect to the parameters is already evaluated and part of the model fit at convergence, it is essentially computationally cost-free to approximate the error in the parameter estimates. Based on the error estimate the number of correctly determined decimals and significant digits is determined for each parameter. 
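For illustration, this error estimate for \code{fm1} could be computed via the model environment introduced in section~\ref{sec:customized-modelling}; a sketch (not evaluated here) under the assumption that \code{clm.hess} returns the full Hessian at the supplied parameter values:
<<eval=FALSE>>=
rho <- update(fm1, doFit = FALSE)
rho$clm.nll(rho, par = coef(fm1))   # evaluate the nll at the MLE first
g <- c(rho$clm.grad(rho))           # gradient at the MLE
H <- rho$clm.hess(rho)              # Hessian at the MLE
max(abs(solve(H, g)))               # approximate bound on parameter error
@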

The assessment of the number of correctly determined decimals and significant digits is only reliable sufficiently close to the optimum and when the NR algorithm converges without regularization and step-halving. In practice we caution against this assessment if the algorithm did not converge successfully.

\end{appendix}

%% -----------------------------------------------------------------------------


\end{document}