File: glm_formula.py

package info (click to toggle)

statsmodels 0.14.6+dfsg-1

links: PTS, VCS
area: main
in suites: sid
size: 49,956 kB
sloc: python: 254,365; f90: 612; sh: 560; javascript: 337; asm: 156; makefile: 145; ansic: 32; xml: 9

file content (69 lines) | stat: -rw-r--r-- 1,783 bytes

parent folder | download | duplicates (2)

#!/usr/bin/env python
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook glm_formula.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Generalized Linear Models (Formula)

# This notebook illustrates how you can use R-style formulas to fit
# Generalized Linear Models.
#
# To begin, we load the ``Star98`` dataset and we construct a formula and
# pre-process the data:

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the Star98 dataset through the pandas loader.
star98 = sm.datasets.star98.load_pandas().data

# Model specification passed to the formula interface. The backslash
# continues the string literal onto the next line.
formula = "SUCCESS ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT + \
           PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF"

# Work on an independent copy holding only the two count columns and the
# regressors named in the formula.
dta = star98.loc[
    :,
    [
        "NABOVE", "NBELOW",
        "LOWINC", "PERASIAN", "PERBLACK", "PERHISP",
        "PCTCHRT", "PCTYRRND",
        "PERMINTE", "AVYRSEXP", "AVSALK",
        "PERSPENK", "PTRATIO", "PCTAF",
    ],
].copy()

# Response: NABOVE as a share of NABOVE + NBELOW. Both count columns are
# removed from the frame and the ratio is stored as the SUCCESS column.
endog = dta.pop("NABOVE")
endog = endog / (endog + dta.pop("NBELOW"))
dta["SUCCESS"] = endog

# Then, we fit the GLM model with a Binomial family and print its summary:

mod1 = smf.glm(
    formula=formula,
    data=dta,
    family=sm.families.Binomial(),
).fit()
print(mod1.summary())

# Finally, we define a function that applies a custom data transformation
# from within the formula framework:


def double_it(x):
    """Return ``x`` multiplied by two.

    Referenced by name inside the model formula as a custom
    transformation of a regressor.
    """
    doubled = 2 * x
    return doubled


# Same specification as the first model, except that LOWINC enters through
# the user-defined ``double_it`` transformation.
formula = "SUCCESS ~ double_it(LOWINC) + PERASIAN + PERBLACK + PERHISP + PCTCHRT + \
           PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF"

mod2 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()
print(mod2.summary())

# As expected, the coefficient for ``double_it(LOWINC)`` in the second
# model is half the size of the ``LOWINC`` coefficient from the first model.
# The coefficients are selected by term label rather than by position, so
# the comparison does not rely on how the formula engine orders the terms
# and fails loudly (KeyError) if a term is ever renamed.

print(mod1.params["LOWINC"])
print(mod2.params["double_it(LOWINC)"] * 2)