File: quasibinomial.py

package info (click to toggle)
statsmodels 0.14.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 49,956 kB
  • sloc: python: 254,365; f90: 612; sh: 560; javascript: 337; asm: 156; makefile: 145; ansic: 32; xml: 9
file content (105 lines) | stat: -rw-r--r-- 3,310 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python

# DO NOT EDIT
# Autogenerated from the notebook quasibinomial.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Quasi-binomial regression
#
# This notebook demonstrates using custom variance functions and non-
# binary data
# with the quasi-binomial GLM family to perform a regression analysis
# using
# a dependent variable that is a proportion.
#
# The notebook uses the barley leaf blotch data that has been discussed in
# several textbooks. See below for one reference:
#
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/v
# iewer.htm#statug_glimmix_sect016.htm

import statsmodels.api as sm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

# The raw data, expressed as percentages.  We will divide by 100
# to obtain proportions.

raw = StringIO("""0.05,0.00,1.25,2.50,5.50,1.00,5.00,5.00,17.50
0.00,0.05,1.25,0.50,1.00,5.00,0.10,10.00,25.00
0.00,0.05,2.50,0.01,6.00,5.00,5.00,5.00,42.50
0.10,0.30,16.60,3.00,1.10,5.00,5.00,5.00,50.00
0.25,0.75,2.50,2.50,2.50,5.00,50.00,25.00,37.50
0.05,0.30,2.50,0.01,8.00,5.00,10.00,75.00,95.00
0.50,3.00,0.00,25.00,16.50,10.00,50.00,50.00,62.50
1.30,7.50,20.00,55.00,29.50,5.00,25.00,75.00,95.00
1.50,1.00,37.50,5.00,20.00,50.00,50.00,75.00,95.00
1.50,12.70,26.25,40.00,43.50,75.00,75.00,75.00,95.00""")

# The regression model is a two-way additive model with
# site and variety effects.  The data are a full unreplicated
# design with 10 rows (sites) and 9 columns (varieties).

df = pd.read_csv(raw, header=None)
df = df.melt()
df["site"] = 1 + np.floor(df.index / 10).astype(int)
df["variety"] = 1 + (df.index % 10)
df = df.rename(columns={"value": "blotch"})
df = df.drop("variable", axis=1)
df["blotch"] /= 100

# Fit the quasi-binomial regression with the standard variance
# function.

model1 = sm.GLM.from_formula("blotch ~ 0 + C(variety) + C(site)",
                             family=sm.families.Binomial(),
                             data=df)
result1 = model1.fit(scale="X2")
print(result1.summary())

# The plot below shows that the default variance function is
# not capturing the variance structure very well. Also note
# that the scale parameter estimate is quite small.

plt.clf()
plt.grid(True)
plt.plot(result1.predict(linear=True), result1.resid_pearson, "o")
plt.xlabel("Linear predictor")
plt.ylabel("Residual")

# An alternative variance function is mu^2 * (1 - mu)^2.


class vf(sm.families.varfuncs.VarianceFunction):
    def __call__(self, mu):
        return mu**2 * (1 - mu)**2

    def deriv(self, mu):
        return 2 * mu - 6 * mu**2 + 4 * mu**3


# Fit the quasi-binomial regression with the alternative variance
# function.

bin = sm.families.Binomial()
bin.variance = vf()
model2 = sm.GLM.from_formula("blotch ~ 0 + C(variety) + C(site)",
                             family=bin,
                             data=df)
result2 = model2.fit(scale="X2")
print(result2.summary())

# With the alternative variance function, the mean/variance relationship
# seems to capture the data well, and the estimated scale parameter is
# close to 1.

plt.clf()
plt.grid(True)
plt.plot(result2.predict(linear=True), result2.resid_pearson, "o")
plt.xlabel("Linear predictor")
plt.ylabel("Residual")