File: ordinal_regression.py

#!/usr/bin/env python

# DO NOT EDIT
# Autogenerated from the notebook ordinal_regression.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Ordinal Regression

import numpy as np
import pandas as pd
import scipy.stats as stats

from statsmodels.miscmodels.ordinal_model import OrderedModel

# Loading a Stata data file from the UCLA website. This notebook is
# inspired by https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/,
# which is an R notebook from UCLA.

url = "https://stats.idre.ucla.edu/stat/data/ologit.dta"
data_student = pd.read_stata(url)

data_student.head(5)

data_student.dtypes

data_student['apply'].dtype

# This dataset is about the probability for undergraduate students to
# apply to graduate school given three exogenous variables:
# - their grade point average (`gpa`), a float between 0 and 4.
# - `pared`, a binary that indicates if at least one parent went to
# graduate school.
# - and `public`, a binary that indicates if the current undergraduate
# institution of the student is public or private.
#
# `apply`, the target variable, is categorical with ordered categories:
# `unlikely` < `somewhat likely` < `very likely`. It is a `pd.Series` of
# categorical type, which is preferred over NumPy arrays.
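
# As a quick complementary check of the ordering claimed above
# (`unlikely` < `somewhat likely` < `very likely`):

data_student['apply'].cat.ordered, list(data_student['apply'].cat.categories)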

# The model is based on a numerical latent variable $y_{latent}$ that we
# cannot observe directly but that is determined by the exogenous variables.
# This $y_{latent}$ in turn defines the observable $y$ via the thresholds.
#
# For more details see the documentation of OrderedModel,  [the UCLA
# webpage](https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/)
# or this
# [book](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470594001).
#
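
# As a minimal illustrative sketch of this latent-variable formulation (with
# made-up numbers, not estimates from the data): given a linear predictor
# $xb = x'\beta$ and thresholds $t_1 < t_2$, the probability of each ordered
# category is a difference of the distribution's cdf evaluated at the
# thresholds shifted by $xb$.

xb_demo = 0.3                  # hypothetical linear predictor x'beta
t1_demo, t2_demo = -0.5, 1.0   # hypothetical thresholds, t1 < t2
cdf_demo = stats.norm.cdf      # probit link; 'logit' would use the logistic cdf
p_demo = np.array([
    cdf_demo(t1_demo - xb_demo),                                # P(unlikely)
    cdf_demo(t2_demo - xb_demo) - cdf_demo(t1_demo - xb_demo),  # P(somewhat likely)
    1 - cdf_demo(t2_demo - xb_demo),                            # P(very likely)
])
p_demo, p_demo.sum()  # the three probabilities sum to one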

# ### Probit ordinal regression:

mod_prob = OrderedModel(data_student['apply'],
                        data_student[['pared', 'public', 'gpa']],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

# In our model, we have 3 exogenous variables (the $\beta$s if we keep the
# documentation's notation), so we have 3 coefficients that need to be
# estimated.
#
# Those 3 estimates and their standard errors can be retrieved in the
# summary table.
#
# Since there are 3 categories in the target variable (`unlikely`,
# `somewhat likely`, `very likely`), we have two thresholds to estimate.
# As explained in the documentation of the method
# `OrderedModel.transform_threshold_params`, the first estimated threshold
# is the actual value and all the other thresholds are in terms of
# cumulative exponentiated increments. Actual threshold values can be
# computed as follows:

num_of_thresholds = 2
mod_prob.transform_threshold_params(res_prob.params[-num_of_thresholds:])
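
# As a sketch of what that transformation does, the two interior thresholds
# can also be reproduced by hand from the last two parameters (the array
# returned by `transform_threshold_params` additionally includes -inf and
# +inf bounds):

th_params = np.asarray(res_prob.params[-num_of_thresholds:])
np.array([th_params[0], th_params[0] + np.exp(th_params[1])])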

# ### Logit ordinal regression:

mod_log = OrderedModel(data_student['apply'],
                       data_student[['pared', 'public', 'gpa']],
                       distr='logit')

res_log = mod_log.fit(method='bfgs', disp=False)
res_log.summary()

predicted = res_log.model.predict(res_log.params,
                                  exog=data_student[['pared', 'public',
                                                     'gpa']])
predicted

pred_choice = predicted.argmax(1)
print('Fraction of correct choice predictions')
print((np.asarray(data_student['apply'].values.codes) == pred_choice).mean())
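
# As an optional, more detailed check, a cross-tabulation of observed versus
# predicted categories complements the overall hit rate:

pd.crosstab(np.asarray(data_student['apply'].values.codes), pred_choice,
            rownames=['observed'], colnames=['predicted'])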

# ### Ordinal regression with a custom cumulative cLogLog distribution:

# In addition to `logit` and `probit` regression, any continuous
# distribution from the `scipy.stats` package can be used for the `distr`
# argument. Alternatively, one can define one's own distribution simply by
# creating a subclass of `rv_continuous` and implementing a few methods.

# using a SciPy distribution
res_exp = OrderedModel(data_student['apply'],
                       data_student[['pared', 'public', 'gpa']],
                       distr=stats.expon).fit(method='bfgs', disp=False)
res_exp.summary()


# minimal definition of a custom scipy distribution.
class CLogLog(stats.rv_continuous):
    def _ppf(self, q):
        return np.log(-np.log(1 - q))

    def _cdf(self, x):
        return 1 - np.exp(-np.exp(x))


cloglog = CLogLog()

# definition of the model and fitting
res_cloglog = OrderedModel(data_student['apply'],
                           data_student[['pared', 'public', 'gpa']],
                           distr=cloglog).fit(method='bfgs', disp=False)
res_cloglog.summary()

# ### Using formulas - treatment of endog
#
# Pandas' ordered categorical and numeric values are supported as the
# dependent variable in formulas. Other types will raise a ValueError.

modf_logit = OrderedModel.from_formula("apply ~ 0 + pared + public + gpa",
                                       data_student,
                                       distr='logit')
resf_logit = modf_logit.fit(method='bfgs')
resf_logit.summary()

# Using numerical codes for the dependent variable is supported but loses
# the names of the category levels. The levels and names correspond to the
# unique values of the dependent variable sorted in alphanumeric order as in
# the case without using formulas.

data_student["apply_codes"] = data_student['apply'].cat.codes * 2 + 5
data_student["apply_codes"].head()

OrderedModel.from_formula("apply_codes ~ 0 + pared + public + gpa",
                          data_student,
                          distr='logit').fit().summary()

resf_logit.predict(data_student.iloc[:5])

# Using string values directly as the dependent variable raises a
# ValueError. Converting the strings to an ordered categorical, as done
# below, makes the formula usable again.

data_student["apply_str"] = np.asarray(data_student["apply"])
data_student["apply_str"].head()

data_student.apply_str = pd.Categorical(data_student.apply_str, ordered=True)
data_student.public = data_student.public.astype(float)
data_student.pared = data_student.pared.astype(float)

OrderedModel.from_formula("apply_str ~ 0 + pared + public + gpa",
                          data_student,
                          distr='logit')

# ### Using formulas - no constant in model
#
# The parameterization of OrderedModel requires that there is **no**
# constant in the model, neither explicit nor implicit. The constant is
# equivalent to shifting all thresholds and is therefore not separately
# identified.
#
# Patsy's formula specification does not allow a design matrix without
# explicit or implicit constant if there are categorical variables (or maybe
# splines) among explanatory variables. As a workaround, statsmodels removes
# an explicit intercept.
#
# Consequently, there are two valid cases to get a design matrix without
# intercept.
#
# - specify a model without an explicit or implicit intercept, which is
# possible if there are only numerical variables in the model.
# - specify a model with an explicit intercept, which statsmodels will
# remove.
#
# Models with an implicit intercept will be overparameterized: the
# parameter estimates are not fully identified, `cov_params` is not
# invertible, and standard errors might contain nans.
#
# In the following we look at an example with an additional categorical
# variable.
#

nobs = len(data_student)
data_student["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)

# **explicit intercept**, which will be removed:
#
# Note: "1 +" is redundant here because it is patsy's default.

modfd_logit = OrderedModel.from_formula(
    "apply ~ 1 + pared + public + gpa + C(dummy)", data_student, distr='logit')
resfd_logit = modfd_logit.fit(method='bfgs')
print(resfd_logit.summary())

modfd_logit.k_vars

modfd_logit.k_constant

# **implicit intercept** creates an overparameterized model
#
# Specifying "0 +" in the formula drops the explicit intercept. However,
# the categorical encoding is now changed to include an implicit intercept.
# In this example, the created dummy variables `C(dummy)[0.0]` and
# `C(dummy)[1.0]` sum to one.

# ```python
# OrderedModel.from_formula("apply ~ 0 + pared + public + gpa + C(dummy)",
# data_student, distr='logit')
# ```
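
# As a quick sketch of that implicit intercept, we can build the design matrix
# directly with patsy (the formula engine used here) and check that the two
# dummy columns sum to one in every row:

from patsy import dmatrix

dm_implicit = dmatrix("0 + pared + public + gpa + C(dummy)",
                      data_student, return_type="dataframe")
dm_implicit.filter(like="C(dummy)").sum(axis=1).head()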

#
# To see what would happen in the overparameterized case, we can avoid the
# constant check in the model by explicitly specifying whether a constant is
# present or not. We use hasconst=False, even though the model has an
# implicit constant.
#
# The parameters of the two dummy variable columns and the first threshold
# are not separately identified. Estimates for those parameters and the
# availability of standard errors are arbitrary and depend on numerical
# details that differ across environments.
#
# Some summary measures, like the log-likelihood value, are not affected by
# this, within convergence tolerance and numerical precision. Prediction
# should also be possible. However, inference is not available, or is not
# valid.

modfd2_logit = OrderedModel.from_formula(
    "apply ~ 0 + pared + public + gpa + C(dummy)",
    data_student,
    distr='logit',
    hasconst=False)
resfd2_logit = modfd2_logit.fit(method='bfgs')
print(resfd2_logit.summary())

resfd2_logit.predict(data_student.iloc[:5])

resf_logit.predict()

# ### Binary Model compared to Logit
#
# If there are only two levels of the dependent ordered categorical
# variable, then the model can also be estimated by a Logit model.
#
# The models are (theoretically) identical in this case except for the
# parameterization of the constant. Logit, like most other models, in
# general requires an intercept. This corresponds to the threshold parameter
# in the OrderedModel, but with the opposite sign.
#
# The implementations differ, and not all of the same results statistics and
# post-estimation features are available. Estimated parameters and other
# results statistics differ mainly because of the convergence tolerance of
# the optimization.
#

from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools.tools import add_constant

# We drop the middle category from the data and keep the two extreme
# categories.

mask_drop = data_student['apply'] == "somewhat likely"
data2 = data_student.loc[~mask_drop, :].copy()
# we also need to remove the dropped level from the categorical dtype
# (remove_categories no longer supports inplace=True in recent pandas)
data2['apply'] = data2['apply'].cat.remove_categories("somewhat likely")
data2.head()

mod_log = OrderedModel(data2['apply'],
                       data2[['pared', 'public', 'gpa']],
                       distr='logit')

res_log = mod_log.fit(method='bfgs', disp=False)
res_log.summary()

# The Logit model does not have a constant by default, so we have to add it
# to our explanatory variables.
#
# The results are essentially identical between Logit and the ordered model
# up to numerical precision, mainly resulting from the convergence tolerance
# in the estimation.
#
# The only difference is in the sign of the constant: Logit and
# OrderedModel have opposite signs for the constant. This is a consequence
# of the parameterization in terms of cut points in OrderedModel instead of
# including a constant column in the design matrix.

ex = add_constant(data2[['pared', 'public', 'gpa']], prepend=False)
mod_logit = Logit(data2['apply'].cat.codes, ex)

res_logit = mod_logit.fit(method='bfgs', disp=False)

res_logit.summary()
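
# As a quick numerical sketch of that sign relationship (up to convergence
# tolerance): the slope estimates agree and the OrderedModel threshold is
# approximately minus the Logit constant.

print(np.asarray(res_log.params))    # pared, public, gpa, threshold
print(np.asarray(res_logit.params))  # pared, public, gpa, const
print(res_log.params.iloc[-1] + res_logit.params["const"])  # approximately 0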

# Robust standard errors are also available in OrderedModel in the same
# way as in discrete.Logit.
# As an example, we specify the HAC covariance type even though we have
# cross-sectional data and autocorrelation is not appropriate.

res_logit_hac = mod_logit.fit(method='bfgs',
                              disp=False,
                              cov_type="hac",
                              cov_kwds={"maxlags": 2})
res_log_hac = mod_log.fit(method='bfgs',
                          disp=False,
                          cov_type="hac",
                          cov_kwds={"maxlags": 2})

res_logit_hac.bse.values - res_log_hac.bse