1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
#!/usr/bin/env python
# coding: utf-8
# DO NOT EDIT
# Autogenerated from the notebook glm_formula.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT
# # Generalized Linear Models (Formula)
# This notebook illustrates how you can use R-style formulas to fit
# Generalized Linear Models.
#
# To begin, we load the ``Star98`` dataset and we construct a formula and
# pre-process the data:
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load the Star98 data as a pandas DataFrame.
star98 = sm.datasets.star98.load_pandas().data

# R-style model formula: main effects plus two three-way interaction groups.
# Adjacent string literals are concatenated by the parser, so the formula can
# span two lines without a backslash inside the string.
formula = (
    "SUCCESS ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT + "
    "PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF"
)

# Work on an independent copy restricted to the columns used below.
dta = star98[
    [
        "NABOVE",
        "NBELOW",
        "LOWINC",
        "PERASIAN",
        "PERBLACK",
        "PERHISP",
        "PCTCHRT",
        "PCTYRRND",
        "PERMINTE",
        "AVYRSEXP",
        "AVSALK",
        "PERSPENK",
        "PTRATIO",
        "PCTAF",
    ]
].copy()

# The response is the proportion NABOVE / (NABOVE + NBELOW). Once it has been
# computed, the two count columns are dropped and the proportion is stored
# under the name the formula expects.
endog = dta["NABOVE"] / (dta["NABOVE"] + dta["NBELOW"])
dta.drop(columns=["NABOVE", "NBELOW"], inplace=True)
dta["SUCCESS"] = endog
# Next, fit a GLM with a Binomial family from the formula and print its
# summary table:
mod1 = smf.glm(
    formula=formula,
    data=dta,
    family=sm.families.Binomial(),
).fit()
print(mod1.summary())
# Finally, we define a function that applies a custom data transformation
# from within the formula framework:
def double_it(x):
    """Return ``x`` multiplied by two.

    The function is referenced by name inside the model formula (as
    ``double_it(LOWINC)``), so it must stay callable under this name.

    Parameters
    ----------
    x
        Any value that supports multiplication by an integer, e.g. a
        number or an array-like column.

    Returns
    -------
    The result of ``2 * x``.
    """
    return 2 * x
# Same specification as before, except that LOWINC is passed through
# ``double_it`` inside the formula. Adjacent string literals are joined by
# the parser into a single formula string.
formula = (
    "SUCCESS ~ double_it(LOWINC) + PERASIAN + PERBLACK + PERHISP + PCTCHRT + "
    "PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF"
)
mod2 = smf.glm(
    formula=formula,
    data=dta,
    family=sm.families.Binomial(),
).fit()
print(mod2.summary())
# As expected, the ``double_it(LOWINC)`` coefficient in the second model is
# half the ``LOWINC`` coefficient from the first model, so doubling it gives
# the same value. Both are the first term after the intercept and are looked
# up here by label:
print(mod1.params["LOWINC"])
print(mod2.params["double_it(LOWINC)"] * 2)
|