File: deterministics.py

package info (click to toggle)
statsmodels 0.14.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 49,956 kB
  • sloc: python: 254,365; f90: 612; sh: 560; javascript: 337; asm: 156; makefile: 145; ansic: 32; xml: 9
file content (288 lines) | stat: -rw-r--r-- 9,221 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook deterministics.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Deterministic Terms in Time Series Models

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rc("figure", figsize=(16, 9))
plt.rc("font", size=16)

# ## Basic Use
#
# Basic configurations can be directly constructed through
# `DeterministicProcess`. These can include a constant, a time trend of any
# order, and either a seasonal or a Fourier component.
#
# The process requires an index, which is the index of the full-sample (or
# in-sample).
#
# First, we initialize a deterministic process with a constant, a linear
# time trend, and a 5-period seasonal term. The `in_sample` method returns
# the full set of values that match the index.

from statsmodels.tsa.deterministic import DeterministicProcess

index = pd.RangeIndex(0, 100)
det_proc = DeterministicProcess(index,
                                constant=True,
                                order=1,
                                seasonal=True,
                                period=5)
det_proc.in_sample()

# The `out_of_sample` returns the next `steps` values after the end of the
# in-sample.

det_proc.out_of_sample(15)

# `range(start, stop)` can also be used to produce the deterministic terms
# over any range including in- and out-of-sample.
#
# ### Notes
#
# * When the index is a pandas `DatetimeIndex` or a `PeriodIndex`, then
# `start` and `stop` can be date-like (strings, e.g., "2020-06-01", or
# Timestamp) or integers.
# * `stop` is always included in the range. While this is not very
# Pythonic, it is needed since both statsmodels and Pandas include `stop`
# when working with date-like slices.

det_proc.range(190, 210)

# ## Using a Date-like Index
#
# Next, we show the same steps using a `PeriodIndex`.

index = pd.period_range("2020-03-01", freq="M", periods=60)
det_proc = DeterministicProcess(index, constant=True, fourier=2)
det_proc.in_sample().head(12)

det_proc.out_of_sample(12)

# `range` accepts date-like arguments, which are usually given as strings.

det_proc.range("2025-01", "2026-01")

# This is equivalent to using the integer values 58 and 70.

det_proc.range(58, 70)

# ## Advanced Construction
#
# Deterministic processes with features not supported directly through the
# constructor can be created using `additional_terms` which accepts a list
# of `DetermisticTerm`. Here we create a deterministic process with two
# seasonal components: day-of-week with a 5 day period and an annual
# captured through a Fourier component with a period of 365.25 days.

from statsmodels.tsa.deterministic import Fourier, Seasonality, TimeTrend

index = pd.period_range("2020-03-01", freq="D", periods=2 * 365)
tt = TimeTrend(constant=True)
four = Fourier(period=365.25, order=2)
seas = Seasonality(period=7)
det_proc = DeterministicProcess(index, additional_terms=[tt, seas, four])
det_proc.in_sample().head(28)

# ## Custom Deterministic Terms
#
# The `DetermisticTerm` Abstract Base Class is designed to be subclassed
# to help users write custom deterministic terms.  We next show two
# examples. The first is a broken time trend that allows a break after a
# fixed number of periods. The second is a "trick" deterministic term that
# allows exogenous data, which is not really a deterministic process, to be
# treated as if was deterministic.  This lets use simplify gathering the
# terms needed for forecasting.
#
# These are intended to demonstrate the construction of custom terms. They
# can definitely be improved in terms of input validation.

from statsmodels.tsa.deterministic import DeterministicTerm


class BrokenTimeTrend(DeterministicTerm):

    def __init__(self, break_period: int):
        self._break_period = break_period

    def __str__(self):
        return "Broken Time Trend"

    def _eq_attr(self):
        return (self._break_period, )

    def in_sample(self, index: pd.Index):
        nobs = index.shape[0]
        terms = np.zeros((nobs, 2))
        terms[self._break_period:, 0] = 1
        terms[self._break_period:, 1] = np.arange(self._break_period + 1,
                                                  nobs + 1)
        return pd.DataFrame(terms,
                            columns=["const_break", "trend_break"],
                            index=index)

    def out_of_sample(self,
                      steps: int,
                      index: pd.Index,
                      forecast_index: pd.Index = None):
        # Always call extend index first
        fcast_index = self._extend_index(index, steps, forecast_index)
        nobs = index.shape[0]
        terms = np.zeros((steps, 2))
        # Assume break period is in-sample
        terms[:, 0] = 1
        terms[:, 1] = np.arange(nobs + 1, nobs + steps + 1)
        return pd.DataFrame(terms,
                            columns=["const_break", "trend_break"],
                            index=fcast_index)


btt = BrokenTimeTrend(60)
tt = TimeTrend(constant=True, order=1)
index = pd.RangeIndex(100)
det_proc = DeterministicProcess(index, additional_terms=[tt, btt])
det_proc.range(55, 65)

# Next, we write a simple "wrapper" for some actual exogenous data that
# simplifies constructing out-of-sample exogenous arrays for forecasting.


class ExogenousProcess(DeterministicTerm):

    def __init__(self, data):
        self._data = data

    def __str__(self):
        return "Custom Exog Process"

    def _eq_attr(self):
        return (id(self._data), )

    def in_sample(self, index: pd.Index):
        return self._data.loc[index]

    def out_of_sample(self,
                      steps: int,
                      index: pd.Index,
                      forecast_index: pd.Index = None):
        forecast_index = self._extend_index(index, steps, forecast_index)
        return self._data.loc[forecast_index]


import numpy as np

gen = np.random.default_rng(98765432101234567890)
exog = pd.DataFrame(gen.integers(100, size=(300, 2)),
                    columns=["exog1", "exog2"])
exog.head()

ep = ExogenousProcess(exog)
tt = TimeTrend(constant=True, order=1)
# The in-sample index
idx = exog.index[:200]
det_proc = DeterministicProcess(idx, additional_terms=[tt, ep])

det_proc.in_sample().head()

det_proc.out_of_sample(10)

# ## Model Support
#
# The only model that directly supports `DeterministicProcess` is
# `AutoReg`. A custom term can be set using the `deterministic` keyword
# argument.
#
# **Note**: Using a custom term requires that `trend="n"` and
# `seasonal=False` so that all deterministic components must come from the
# custom deterministic term.

# ### Simulate Some Data
#
# Here we simulate some data that has an weekly seasonality captured by a
# Fourier series.

gen = np.random.default_rng(98765432101234567890)
idx = pd.RangeIndex(200)
det_proc = DeterministicProcess(idx, constant=True, period=52, fourier=2)
det_terms = det_proc.in_sample().to_numpy()
params = np.array([1.0, 3, -1, 4, -2])
exog = det_terms @ params
y = np.empty(200)
y[0] = det_terms[0] @ params + gen.standard_normal()
for i in range(1, 200):
    y[i] = 0.9 * y[i - 1] + det_terms[i] @ params + gen.standard_normal()
y = pd.Series(y, index=idx)
ax = y.plot()

# The model is then fit using the `deterministic` keyword argument.
# `seasonal` defaults to False but `trend` defaults to `"c"` so this needs
# to be changed.

from statsmodels.tsa.api import AutoReg

mod = AutoReg(y, 1, trend="n", deterministic=det_proc)
res = mod.fit()
print(res.summary())

# We can use the `plot_predict` to show the predicted values and their
# prediction interval. The out-of-sample deterministic values are
# automatically produced by the deterministic process passed to `AutoReg`.

fig = res.plot_predict(200, 200 + 2 * 52, True)

auto_reg_forecast = res.predict(200, 211)
auto_reg_forecast

# ## Using with other models
#
# Other models do not support `DeterministicProcess` directly.  We can
# instead manually pass any deterministic terms as `exog` to model that
# support exogenous values.
#
# Note that `SARIMAX` with exogenous variables is OLS with SARIMA errors
# so that the model is
#
# $$
# \begin{align*}
# \nu_t & = y_t - x_t \beta  \\
# (1-\phi(L))\nu_t & = (1+\theta(L))\epsilon_t.
# \end{align*}
# $$
#
# The parameters on deterministic terms are not directly comparable to
# `AutoReg` which evolves according to the equation
#
# $$
# (1-\phi(L)) y_t = x_t \beta + \epsilon_t.
# $$
#
# When $x_t$ contains only deterministic terms, these two representation
# are equivalent (assuming $\theta(L)=0$ so that there is no MA).
#

from statsmodels.tsa.api import SARIMAX

det_proc = DeterministicProcess(idx, period=52, fourier=2)
det_terms = det_proc.in_sample()

mod = SARIMAX(y, order=(1, 0, 0), trend="c", exog=det_terms)
res = mod.fit(disp=False)
print(res.summary())

# The forecasts are similar but differ since the parameters of the
# `SARIMAX` are estimated using MLE while `AutoReg` uses OLS.

sarimax_forecast = res.forecast(12, exog=det_proc.out_of_sample(12))
df = pd.concat([auto_reg_forecast, sarimax_forecast], axis=1)
df.columns = columns = ["AutoReg", "SARIMAX"]
df