File: statespace_sarimax_internet.py

#!/usr/bin/env python
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook statespace_sarimax_internet.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # SARIMAX: Model selection, missing data

# This example mirrors Durbin and Koopman (2012), Chapter 8.4, in its
# application of the Box-Jenkins methodology to fit ARMA models. The novel
# feature is the ability of state space models to work on datasets with
# missing values.

import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt

import requests
from io import BytesIO
from zipfile import ZipFile

# Download the dataset
df = pd.read_table(
    "https://raw.githubusercontent.com/jrnold/ssmodels-in-stan/master/StanStateSpace/data-raw/DK-data/internet.dat",
    skiprows=1,
    header=None,
    sep=r'\s+',
    engine='python',
    names=['internet', 'dinternet'])

# ### Model Selection
#
# As in Durbin and Koopman, we force a number of the values to be missing.

# Get the basic series
dta_full = df.dinternet[1:].values
dta_miss = dta_full.copy()

# Remove datapoints
missing = np.r_[6, 16, 26, 36, 46, 56, 66, 72, 73, 74, 75, 76, 86, 96] - 1
dta_miss[missing] = np.nan
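
# As a quick check (not part of the original notebook), confirm how many
# observations were artificially removed:
print('Missing observations:', np.isnan(dta_miss).sum(), 'of', len(dta_miss))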

# Then we can consider model selection using the Akaike information
# criterion (AIC), by running the model for each variant and selecting the
# model with the lowest AIC value.
#
# There are a couple of things to note here:
#
# - When running such a large batch of models, particularly when the
# autoregressive and moving average orders become large, there is the
# possibility of poor maximum likelihood convergence. Below we ignore the
# warnings since this example is illustrative.
# - We use the option `enforce_invertibility=False`, which allows the
# moving average polynomial to be non-invertible, so that more of the models
# are estimable.
# - Several of the models do not produce good results, and their AIC value
# is set to NaN. This is not surprising, as Durbin and Koopman note
# numerical problems with the high order models.

import warnings

aic_full = pd.DataFrame(np.zeros((6, 6), dtype=float))
aic_miss = pd.DataFrame(np.zeros((6, 6), dtype=float))

warnings.simplefilter('ignore')

# Iterate over all ARMA(p, q) models with p, q in [0, 6)
for p in range(6):
    for q in range(6):
        if p == 0 and q == 0:
            continue

        # Estimate the model with no missing datapoints
        mod = sm.tsa.statespace.SARIMAX(dta_full,
                                        order=(p, 0, q),
                                        enforce_invertibility=False)
        try:
            res = mod.fit(disp=False)
            aic_full.iloc[p, q] = res.aic
        except Exception:
            aic_full.iloc[p, q] = np.nan

        # Estimate the model with missing datapoints
        mod = sm.tsa.statespace.SARIMAX(dta_miss,
                                        order=(p, 0, q),
                                        enforce_invertibility=False)
        try:
            res = mod.fit(disp=False)
            aic_miss.iloc[p, q] = res.aic
        except Exception:
            aic_miss.iloc[p, q] = np.nan
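
# The estimated AIC values can be printed directly and compared against the
# replicated tables below (a quick check, not part of the original
# notebook):
print(aic_full.round(2))
print(aic_miss.round(2))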

# For the models estimated over the full (non-missing) dataset, the AIC
# chooses ARMA(1,1) or ARMA(3,0). Durbin and Koopman suggest the ARMA(1,1)
# specification is better due to parsimony.
#
# $$
# \text{Replication of:}\\
# \textbf{Table 8.1} ~~ \text{AIC for different ARMA models.}\\
# \newcommand{\r}[1]{{\color{red}{#1}}}
# \begin{array}{lrrrrrr}
# \hline
# q &      0 &      1 &      2 &      3 &      4 &      5 \\
# \hline
# p &     {} &     {} &     {} &     {} &     {} &     {} \\
# 0 &   0.00 & 549.81 & 519.87 & 520.27 & 519.38 & 518.86 \\
# 1 & 529.24 & \r{514.30} & 516.25 & 514.58 & 515.10 & 516.28 \\
# 2 & 522.18 & 516.29 & 517.16 & 515.77 & 513.24 & 514.73 \\
# 3 & \r{511.99} & 513.94 & 515.92 & 512.06 & 513.72 & 514.50 \\
# 4 & 513.93 & 512.89 &    nan &    nan & 514.81 & 516.08 \\
# 5 & 515.86 & 517.64 &    nan &    nan &    nan &    nan \\
# \hline
# \end{array}
# $$
#
# For the models estimated over the dataset with missing values, the AIC
# chooses ARMA(1,1).
#
# $$
# \text{Replication of:}\\
# \textbf{Table 8.2} ~~ \text{AIC for different ARMA models with missing
# observations.}\\
# \begin{array}{lrrrrrr}
# \hline
# q &      0 &      1 &      2 &      3 &      4 &      5 \\
# \hline
# p &     {} &     {} &     {} &     {} &     {} &     {} \\
# 0 &   0.00 & 488.93 & 464.01 & 463.86 & 462.63 & 463.62 \\
# 1 & 468.01 & \r{457.54} & 459.35 & 458.66 & 459.15 & 461.01 \\
# 2 & 469.68 &    nan & 460.48 & 459.43 & 459.23 & 460.47 \\
# 3 & 467.10 & 458.44 & 459.64 & 456.66 & 459.54 & 460.05 \\
# 4 & 469.00 & 459.52 &    nan & 463.04 & 459.35 & 460.96 \\
# 5 & 471.32 & 461.26 &    nan &    nan & 461.00 & 462.97 \\
# \hline
# \end{array}
# $$
#
# **Note**: the AIC values are calculated differently than in Durbin and
# Koopman, but show overall similar trends.
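
# As a sanity check (not part of the original notebook), we can read off the
# (p, q) combination with the lowest AIC from each table, first replacing
# the zero placeholder for the skipped ARMA(0,0) case with NaN so it does
# not win the comparison:
print('Full data, minimum AIC at (p, q) =',
      aic_full.replace(0.0, np.nan).stack().idxmin())
print('Missing data, minimum AIC at (p, q) =',
      aic_miss.replace(0.0, np.nan).stack().idxmin())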

# ### Postestimation
#
# Using the ARMA(1,1) specification selected above, we perform in-sample
# prediction and out-of-sample forecasting.

# Statespace
mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(1, 0, 1))
res = mod.fit(disp=False)
print(res.summary())

# In-sample one-step-ahead predictions, and out-of-sample forecasts
nforecast = 20
predict = res.get_prediction(end=mod.nobs + nforecast)
idx = np.arange(len(predict.predicted_mean))
predict_ci = predict.conf_int(alpha=0.5)

# Graph
fig, ax = plt.subplots(figsize=(12, 6))
ax.xaxis.grid()
ax.plot(dta_miss, 'k.')

# Plot
ax.plot(idx[:-nforecast], predict.predicted_mean[:-nforecast], 'gray')
ax.plot(idx[-nforecast:],
        predict.predicted_mean[-nforecast:],
        'k--',
        linewidth=2)
ax.fill_between(idx, predict_ci[:, 0], predict_ci[:, 1], alpha=0.15)

ax.set(title='Figure 8.9 - Internet series')
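
# The out-of-sample forecasts and their confidence intervals can also be
# retrieved directly; a minimal sketch using `get_forecast` (the variable
# name below is illustrative, not from the original notebook):
forecast = res.get_forecast(steps=nforecast)
print(forecast.summary_frame(alpha=0.05).head())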