#!/usr/bin/env python
# coding: utf-8
# DO NOT EDIT
# Autogenerated from the notebook influence_glm_logit.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT
# # Influence Measures for GLM Logit
#
#
# Based on draft version for GLMInfluence, which will also apply to
# discrete Logit, Probit and Poisson, and eventually be extended to cover
# most models outside of time series analysis.
#
# The example for logistic regression was used by Pregibon (1981)
# "Logistic Regression diagnostics" and is based on data by Finney (1947).
#
# GLMInfluence includes the basic influence measures but still misses some
# measures described in Pregibon (1981), for example those related to
# deviance and effects on confidence intervals.
import os.path
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
# Use a large default figure size and font size for every plot created below.
plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)

# The example data set is read from the "results" directory that sits next
# to the statsmodels influence test module, so import that module only to
# discover where it lives on disk.
import statsmodels.stats.tests.test_influence

test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))
file_name = "binary_constrict.csv"
file_path = os.path.join(cur_dir, "results", file_name)

# The first CSV column is used as the row index of the frame.
df = pd.read_csv(file_path, index_col=0)
# Response and regressors for the logistic regression.
# NOTE(review): "log_volumne" is a runtime column key and presumably mirrors
# the header spelling in the CSV -- do not "correct" it without checking the
# data file.
endog = df["constrict"]
exog = df[["const", "log_rate", "log_volumne"]]

# GLM with a Binomial family; the fit is requested with attach_wls=True and
# an absolute tolerance of 1e-10 (see GLM.fit for the exact semantics).
model = GLM(endog, exog, family=families.Binomial())
res = model.fit(attach_wls=True, atol=1e-10)

print(res.summary())
# ## get the influence measures
#
# GLMResults has a `get_influence` method similar to OLSResults, that
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results
# for deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still show clearly the
# effect of influential observations.
#
# In this example observation 4 and 18 have a large standardized residual
# and large Cook's distance, but not a large leverage. Observation 13 has
# the largest leverage but only small Cook's distance and not a large
# studentized residual.
#
# Only the two observations 4 and 18 have a large impact on the parameter
# estimates.
# Build the influence object for the fitted model and tabulate its measures.
infl = res.get_influence(observed=False)
summ_df = infl.summary_frame()

# Ten rows with the largest "cooks_d" values. The value of this bare
# expression is discarded when the file runs as a script; it is only
# displayed when executed as a notebook cell.
summ_df.sort_values("cooks_d", ascending=False)[:10]

# Overall influence plot.
fig = infl.plot_influence()
fig.tight_layout(pad=1.0)

# Index plot of Cook's distance, with the threshold set to twice the mean of
# the first element of `cooks_distance` (presumably the distances themselves
# -- confirm against the GLMInfluence documentation).
cooks_threshold = 2 * infl.cooks_distance[0].mean()
fig = infl.plot_index(y_var="cooks", threshold=cooks_threshold)
fig.tight_layout(pad=1.0)

# Index plot of the residuals with a threshold of 1.
fig = infl.plot_index(y_var="resid", threshold=1)
fig.tight_layout(pad=1.0)

# One dfbeta index plot per parameter position, in the original order
# (1, 2, then 0), so `fig` ends up bound to the idx=0 figure.
for param_idx in (1, 2, 0):
    fig = infl.plot_index(y_var="dfbeta", idx=param_idx, threshold=0.5)
    fig.tight_layout(pad=1.0)