1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
#!/usr/bin/env python
# coding: utf-8
# DO NOT EDIT
# Autogenerated from the notebook glm.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT
# # Generalized Linear Models
import numpy as np
import statsmodels.api as sm
from scipy import stats
from matplotlib import pyplot as plt
plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)
# ## GLM: Binomial response data
#
# ### Load Star98 data
#
# In this example, we use the Star98 dataset which was taken with
# permission
# from Jeff Gill (2000) Generalized linear models: A unified approach.
# Codebook
# information can be obtained by typing:
print(sm.datasets.star98.NOTE)
# Load the data and add a constant to the exogenous (independent)
# variables:
data = sm.datasets.star98.load()
data.exog = sm.add_constant(data.exog, prepend=False)
# The dependent variable is N by 2 (Success: NABOVE, Failure: NBELOW):
print(data.endog.head())
# The independent variables include all the other variables described
# above, as
# well as the interaction terms:
print(data.exog.head())
# ### Fit and summary
glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())
res = glm_binom.fit()
print(res.summary())
# ### Quantities of interest
print('Total number of trials:', data.endog.iloc[:, 0].sum())
print('Parameters: ', res.params)
print('T-values: ', res.tvalues)
# First differences: We hold all explanatory variables constant at their
# means and manipulate the percentage of low income households to assess its
# impact on the response variables:
means = data.exog.mean(axis=0)
means25 = means.copy()
means25.iloc[0] = stats.scoreatpercentile(data.exog.iloc[:, 0], 25)
means75 = means.copy()
means75.iloc[0] = lowinc_75per = stats.scoreatpercentile(
data.exog.iloc[:, 0], 75)
resp_25 = res.predict(means25)
resp_75 = res.predict(means75)
diff = resp_75 - resp_25
# The interquartile first difference for the percentage of low income
# households in a school district is:
print("%2.4f%%" % (diff.iloc[0] * 100))
# ### Plots
#
# We extract information that will be used to draw some interesting
# plots:
nobs = res.nobs
y = data.endog.iloc[:, 0] / data.endog.sum(1)
yhat = res.mu
# Plot yhat vs y:
from statsmodels.graphics.api import abline_plot
fig, ax = plt.subplots()
ax.scatter(yhat, y)
line_fit = sm.OLS(y, sm.add_constant(yhat, prepend=True)).fit()
abline_plot(model_results=line_fit, ax=ax)
ax.set_title('Model Fit Plot')
ax.set_ylabel('Observed values')
ax.set_xlabel('Fitted values')
# Plot yhat vs. Pearson residuals:
fig, ax = plt.subplots()
ax.scatter(yhat, res.resid_pearson)
ax.hlines(0, 0, 1)
ax.set_xlim(0, 1)
ax.set_title('Residual Dependence Plot')
ax.set_ylabel('Pearson Residuals')
ax.set_xlabel('Fitted values')
# Histogram of standardized deviance residuals:
from scipy import stats
fig, ax = plt.subplots()
resid = res.resid_deviance.copy()
resid_std = stats.zscore(resid)
ax.hist(resid_std, bins=25)
ax.set_title('Histogram of standardized deviance residuals')
# QQ Plot of Deviance Residuals:
from statsmodels import graphics
graphics.gofplots.qqplot(resid, line='r')
# ## GLM: Gamma for proportional count response
#
# ### Load Scottish Parliament Voting data
#
# In the example above, we printed the ``NOTE`` attribute to learn about
# the
# Star98 dataset. statsmodels datasets ships with other useful
# information. For
# example:
print(sm.datasets.scotland.DESCRLONG)
# Load the data and add a constant to the exogenous variables:
data2 = sm.datasets.scotland.load()
data2.exog = sm.add_constant(data2.exog, prepend=False)
print(data2.exog.head())
print(data2.endog.head())
# ### Model Fit and summary
glm_gamma = sm.GLM(data2.endog,
data2.exog,
family=sm.families.Gamma(sm.families.links.Log()))
glm_results = glm_gamma.fit()
print(glm_results.summary())
# ## GLM: Gaussian distribution with a noncanonical link
#
# ### Artificial data
nobs2 = 100
x = np.arange(nobs2)
np.random.seed(54321)
X = np.column_stack((x, x**2))
X = sm.add_constant(X, prepend=False)
lny = np.exp(-(.03 * x + .0001 * x**2 - 1.0)) + .001 * np.random.rand(nobs2)
# ### Fit and summary (artificial data)
gauss_log = sm.GLM(lny,
X,
family=sm.families.Gaussian(sm.families.links.Log()))
gauss_log_results = gauss_log.fit()
print(gauss_log_results.summary())
|