File: glm.py

package info (click to toggle)
statsmodels 0.14.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 49,956 kB
  • sloc: python: 254,365; f90: 612; sh: 560; javascript: 337; asm: 156; makefile: 145; ansic: 32; xml: 9
file content (174 lines) | stat: -rw-r--r-- 4,411 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook glm.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Generalized Linear Models

import numpy as np
import statsmodels.api as sm
from scipy import stats
from matplotlib import pyplot as plt

plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)

# ## GLM: Binomial response data
#
# ### Load Star98 data
#
#  In this example, we use the Star98 dataset which was taken with
# permission
#  from Jeff Gill (2000) Generalized linear models: A unified approach.
# Codebook
#  information can be obtained by typing:

print(sm.datasets.star98.NOTE)

# Load the data and add a constant to the exogenous (independent)
# variables:

data = sm.datasets.star98.load()
data.exog = sm.add_constant(data.exog, prepend=False)

#  The dependent variable is N by 2 (Success: NABOVE, Failure: NBELOW):

print(data.endog.head())

#  The independent variables include all the other variables described
# above, as
#  well as the interaction terms:

print(data.exog.head())

# ### Fit and summary

glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())
res = glm_binom.fit()
print(res.summary())

# ### Quantities of interest

print('Total number of trials:', data.endog.iloc[:, 0].sum())
print('Parameters: ', res.params)
print('T-values: ', res.tvalues)

# First differences: We hold all explanatory variables constant at their
# means and manipulate the percentage of low income households to assess its
# impact on the response variables:

means = data.exog.mean(axis=0)
means25 = means.copy()
means25.iloc[0] = stats.scoreatpercentile(data.exog.iloc[:, 0], 25)
means75 = means.copy()
means75.iloc[0] = lowinc_75per = stats.scoreatpercentile(
    data.exog.iloc[:, 0], 75)
resp_25 = res.predict(means25)
resp_75 = res.predict(means75)
diff = resp_75 - resp_25

# The interquartile first difference for the percentage of low income
# households in a school district is:

print("%2.4f%%" % (diff.iloc[0] * 100))

# ### Plots
#
#  We extract information that will be used to draw some interesting
# plots:

nobs = res.nobs
y = data.endog.iloc[:, 0] / data.endog.sum(1)
yhat = res.mu

# Plot yhat vs y:

from statsmodels.graphics.api import abline_plot

fig, ax = plt.subplots()
ax.scatter(yhat, y)
line_fit = sm.OLS(y, sm.add_constant(yhat, prepend=True)).fit()
abline_plot(model_results=line_fit, ax=ax)

ax.set_title('Model Fit Plot')
ax.set_ylabel('Observed values')
ax.set_xlabel('Fitted values')

# Plot yhat vs. Pearson residuals:

fig, ax = plt.subplots()

ax.scatter(yhat, res.resid_pearson)
ax.hlines(0, 0, 1)
ax.set_xlim(0, 1)
ax.set_title('Residual Dependence Plot')
ax.set_ylabel('Pearson Residuals')
ax.set_xlabel('Fitted values')

# Histogram of standardized deviance residuals:

from scipy import stats

fig, ax = plt.subplots()

resid = res.resid_deviance.copy()
resid_std = stats.zscore(resid)
ax.hist(resid_std, bins=25)
ax.set_title('Histogram of standardized deviance residuals')

# QQ Plot of Deviance Residuals:

from statsmodels import graphics

graphics.gofplots.qqplot(resid, line='r')

# ## GLM: Gamma for proportional count response
#
# ### Load Scottish Parliament Voting data
#
#  In the example above, we printed the ``NOTE`` attribute to learn about
# the
#  Star98 dataset. statsmodels datasets ships with other useful
# information. For
#  example:

print(sm.datasets.scotland.DESCRLONG)

#  Load the data and add a constant to the exogenous variables:

data2 = sm.datasets.scotland.load()
data2.exog = sm.add_constant(data2.exog, prepend=False)
print(data2.exog.head())
print(data2.endog.head())

# ### Model Fit and summary

glm_gamma = sm.GLM(data2.endog,
                   data2.exog,
                   family=sm.families.Gamma(sm.families.links.Log()))
glm_results = glm_gamma.fit()
print(glm_results.summary())

# ## GLM: Gaussian distribution with a noncanonical link
#
# ### Artificial data

nobs2 = 100
x = np.arange(nobs2)
np.random.seed(54321)
X = np.column_stack((x, x**2))
X = sm.add_constant(X, prepend=False)
lny = np.exp(-(.03 * x + .0001 * x**2 - 1.0)) + .001 * np.random.rand(nobs2)

# ### Fit and summary (artificial data)

gauss_log = sm.GLM(lny,
                   X,
                   family=sm.families.Gaussian(sm.families.links.Log()))
gauss_log_results = gauss_log.fit()
print(gauss_log_results.summary())