1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
|
#!/usr/bin/env python
# coding: utf-8
# DO NOT EDIT
# Autogenerated from the notebook mstl_decomposition.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT
# # Multiple Seasonal-Trend decomposition using LOESS (MSTL)
#
# This notebook illustrates the use of `MSTL` [1] to decompose a time
# series into a: trend component, multiple seasonal components, and a
# residual component. MSTL uses STL (Seasonal-Trend decomposition using
# LOESS) to iteratively extract seasonal components from a time series. The
# key inputs into `MSTL` are:
#
# * `periods` - The period of each seasonal component (e.g., for hourly
# data with daily and weekly seasonality we would have: `periods=(24,
# 24*7)`.
# * `windows` - The lengths of each seasonal smoother with respect to each
# period. If these are large then the seasonal component will show less
# variability over time. Must be odd. If `None` a set of default values
# determined by experiments in the original paper [1] are used.
# * `lmbda` - The lambda parameter for a Box-Cox transformation prior to
# decomposition. If `None` then no transformation is done. If `"auto"` then
# an appropriate value for lambda is automatically selected from the data.
# * `iterate` - Number of iterations to use to refine the seasonal
# component.
# * `stl_kwargs` - All the other parameters which can be passed to STL
# (e.g., `robust`, `seasonal_deg`, etc.). See [STL docs](https://www.statsmo
# dels.org/dev/generated/statsmodels.tsa.seasonal.STL.html).
#
# [1] [K. Bandura, R.J. Hyndman, and C. Bergmeir (2021)
# MSTL: A Seasonal-Trend Decomposition Algorithm for Time Series with
# Multiple
# Seasonal Patterns. arXiv preprint
# arXiv:2107.13462.](https://arxiv.org/pdf/2107.13462.pdf)
#
# Note there are some key differences in this implementation to
# [1](https://arxiv.org/pdf/2107.13462.pdf). Missing data must be handled
# outside of the `MSTL` class. The algorithm proposed in the paper handles a
# case when there is no seasonality. This implementation assumes that there
# is at least one seasonal component.
#
# First we import the required packages, prepare the graphics environment,
# and prepare the data.
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.seasonal import MSTL
from statsmodels.tsa.seasonal import DecomposeResult
register_matplotlib_converters()
sns.set_style("darkgrid")
plt.rc("figure", figsize=(16, 12))
plt.rc("font", size=13)
# ## MSTL applied to a toy dataset
# ### Create a toy dataset with multiple seasonalities
# We create a time series with hourly frequency that has a daily and
# weekly seasonality which follow a sine wave. We demonstrate a more real
# world example later in the notebook.
t = np.arange(1, 1000)
daily_seasonality = 5 * np.sin(2 * np.pi * t / 24)
weekly_seasonality = 10 * np.sin(2 * np.pi * t / (24 * 7))
trend = 0.0001 * t**2
y = trend + daily_seasonality + weekly_seasonality + np.random.randn(len(t))
ts = pd.date_range(start="2020-01-01", freq="h", periods=len(t))
df = pd.DataFrame(data=y, index=ts, columns=["y"])
df.head()
# Let's plot the time series
df["y"].plot(figsize=[10, 5])
# ### Decompose the toy dataset with MSTL
# Let's use MSTL to decompose the time series into a trend component,
# daily and weekly seasonal component, and residual component.
mstl = MSTL(df["y"], periods=[24, 24 * 7])
res = mstl.fit()
# If the input is a pandas dataframe then the output for the seasonal
# component is a dataframe. The period for each component is reflect in the
# column names.
res.seasonal.head()
ax = res.plot()
# We see that the hourly and weekly seasonal components have been
# extracted.
# Any of the STL parameters other than `period` and `seasonal` (as they
# are set by `periods` and `windows` in `MSTL`) can also be set by passing
# arg:value pairs as a dictionary to `stl_kwargs` (we will show that in an
# example now).
#
# Here we show that we can still set the trend smoother of STL via `trend`
# and order of the polynomial for the seasonal fit via `seasonal_deg`. We
# will also explicitly set the `windows`, `seasonal_deg`, and `iterate`
# parameter explicitly. We will get a worse fit but this is just an example
# of how to pass these parameters to the `MSTL` class.
mstl = MSTL(
df,
periods=[
24, 24 * 7
], # The periods and windows must be the same length and will correspond to one another.
windows=[
101, 101
], # Setting this large along with `seasonal_deg=0` will force the seasonality to be periodic.
iterate=3,
stl_kwargs={
"trend":
1001, # Setting this large will force the trend to be smoother.
"seasonal_deg":
0, # Means the seasonal smoother is fit with a moving average.
})
res = mstl.fit()
ax = res.plot()
# ## MSTL applied to electricity demand dataset
# ### Prepare the data
# We will use the Victoria electricity demand dataset found here:
# https://github.com/tidyverts/tsibbledata/tree/master/data-raw/vic_elec.
# This dataset is used in the [original MSTL paper
# [1]](https://arxiv.org/pdf/2107.13462.pdf). It is the total electricity
# demand at a half hourly granularity for the state of Victora in Australia
# from 2002 to the start of 2015. A more detailed description of the dataset
# can be found [here](https://rdrr.io/cran/tsibbledata/man/vic_elec.html).
url = "https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/demand.csv"
df = pd.read_csv(url)
df.head()
# The date are integers representing the number of days from an origin
# date. The origin date for this dataset is determined from
# [here](https://github.com/tidyverts/tsibbledata/blob/master/data-
# raw/vic_elec/vic_elec.R) and
# [here](https://robjhyndman.com/hyndsight/electrictsibbles/) and is
# "1899-12-30". The `Period` integers refer to 30 minute intervals in a 24
# hour day, hence there are 48 for each day.
#
#
#
# Let's extract the date and date-time.
df["Date"] = df["Date"].apply(
lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="days"))
df["ds"] = df["Date"] + pd.to_timedelta((df["Period"] - 1) * 30, unit="m")
# We will be interested in `OperationalLessIndustrial` which is the
# electricity demand excluding the demand from certain high energy
# industrial users. We will resample the data to hourly and filter the data
# to the same time period as [original MSTL paper
# [1]](https://arxiv.org/pdf/2107.13462.pdf) which is the first 149 days of
# the year 2012.
timeseries = df[["ds", "OperationalLessIndustrial"]]
timeseries.columns = [
"ds", "y"
] # Rename to OperationalLessIndustrial to y for simplicity.
# Filter for first 149 days of 2012.
start_date = pd.to_datetime("2012-01-01")
end_date = start_date + pd.Timedelta("149D")
mask = (timeseries["ds"] >= start_date) & (timeseries["ds"] < end_date)
timeseries = timeseries[mask]
# Resample to hourly
timeseries = timeseries.set_index("ds").resample("h").sum()
timeseries.head()
# ### Decompose electricity demand using MSTL
# Let's apply MSTL to this dataset.
#
# Note: `stl_kwargs` are set to give results close to
# [[1]](https://arxiv.org/pdf/2107.13462.pdf) which used R and therefore has
# a slightly different default settings for the underlying `STL` parameters.
# It would be rare to manually set `inner_iter` and `outer_iter` explicitly
# in practice.
mstl = MSTL(timeseries["y"],
periods=[24, 24 * 7],
iterate=3,
stl_kwargs={
"seasonal_deg": 0,
"inner_iter": 2,
"outer_iter": 0
})
res = mstl.fit() # Use .fit() to perform and return the decomposition
ax = res.plot()
plt.tight_layout()
# The multiple seasonal components are stored as a pandas dataframe in the
# `seasonal` attribute:
res.seasonal.head()
# Let's inspect the seasonal components in a bit more detail and look at
# the first few days and weeks to examine the daily and weekly seasonality.
fig, ax = plt.subplots(nrows=2, figsize=[10, 10])
res.seasonal["seasonal_24"].iloc[:24 * 3].plot(ax=ax[0])
ax[0].set_ylabel("seasonal_24")
ax[0].set_title("Daily seasonality")
res.seasonal["seasonal_168"].iloc[:24 * 7 * 3].plot(ax=ax[1])
ax[1].set_ylabel("seasonal_168")
ax[1].set_title("Weekly seasonality")
plt.tight_layout()
# We can see that the daily seasonality of electricity demand is well
# captured. This is the first few days in January so during the summer
# months in Australia there is a peak in the afternoon most likely due to
# air conditioning use.
#
# For the weekly seasonality we can see that there is less usage during
# the weekends.
#
# One of the advantages of MSTL is that is allows us to capture
# seasonality which changes over time. So let's look at the seasonality
# during cooler months in May.
fig, ax = plt.subplots(nrows=2, figsize=[10, 10])
mask = res.seasonal.index.month == 5
res.seasonal[mask]["seasonal_24"].iloc[:24 * 3].plot(ax=ax[0])
ax[0].set_ylabel("seasonal_24")
ax[0].set_title("Daily seasonality")
res.seasonal[mask]["seasonal_168"].iloc[:24 * 7 * 3].plot(ax=ax[1])
ax[1].set_ylabel("seasonal_168")
ax[1].set_title("Weekly seasonality")
plt.tight_layout()
# Now we can see an additional peak in the evening! This could be related
# to heating and lighting now required in the evenings. So this makes sense.
# We see that main weekly pattern of lower demand over the weekends
# continue.
# The other components can also be extracted from the `trend` and `resid`
# attribute:
display(res.trend.head()) # trend component
display(res.resid.head()) # residual component
# And that's it! Using MSTL we can perform time series decompostion on a
# multi-seasonal time series!
|