This task is about forecasting how many bikes are rented from the TfL (Transport for London) Cycle Hire scheme.
Specifically, it attempts to answer the question “Can national electrical power generation help estimate how many bikes are hired?”
The idea is that these two series may both be correlated with drivers we have no data on (e.g. the weather).
Note: a clear methodology supported by reasonable justifications is more important than an extremely accurate model.
You should produce a Jupyter Notebook with your solution, and submit a .zip folder containing the file in both .ipynb and .html formats.
# One-time environment setup (uncomment and run as needed):
#!python --version
#!pip install pyflux-0.4.17-cp38-cp38-win_amd64.whl
#!pip install msticpy
#!pip install msrestazure
#!pip install pystan
#import pystan
#!pip install fbprophet
## Alternatively, install fbprophet from the conda prompt:
#!conda install -c conda-forge fbprophet
import os
import warnings
import datetime as dt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns            # Statistical plotting
import plotly.express as px     # Interactive plotting

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import pystan
from fbprophet import Prophet
import sklearn.metrics as metrics

sns.set_theme(style="whitegrid")
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.3f}'.format
# Provide the project path that contains the raw data
project_path = "C:\\Users\\gusahil\\Downloads\\Blockchain"
orig_dir = os.getcwd()
os.chdir(project_path)
os.getcwd()
'C:\\Users\\gusahil\\Downloads\\Blockchain'
file_loc = "tfl-daily-cycle-hires.xlsx"
tfl_hires = pd.read_excel(file_loc, index_col=None, usecols = "A:B", sheet_name = 'Data').sort_values(by=['Day','Number of Bicycle Hires'])
#tfl_hires['month'] = tfl_hires['Day'].dt.strftime('%Y-%m-01')
#tfl_hires['weekday'] = tfl_hires['Day'].dt.weekday
tfl_hires['Day'] = pd.to_datetime(tfl_hires['Day'])
tfl_hires.head()
Day | Number of Bicycle Hires | |
---|---|---|
0 | 2010-07-30 | 6897 |
1 | 2010-07-31 | 5564 |
2 | 2010-08-01 | 4303 |
3 | 2010-08-02 | 6642 |
4 | 2010-08-03 | 7966 |
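Before aggregating, it is worth sanity-checking that the daily series is complete. A minimal sketch, assuming tfl_hires as loaded above:
# Sanity check (sketch): confirm there are no missing days in the hire series
full_range = pd.date_range(tfl_hires['Day'].min(), tfl_hires['Day'].max(), freq='D')
missing = full_range.difference(pd.DatetimeIndex(tfl_hires['Day']))
print(f"Missing days: {len(missing)}")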
file_loc = "electrical_power_data.csv"
electrical_power = pd.read_csv(file_loc, sep=",")
electrical_power['SD'] = pd.to_datetime(electrical_power['SD'])
# Numeric columns arrive as strings with thousands separators; strip and cast
for column in electrical_power.columns[1:]:
    electrical_power[column] = electrical_power[column].str.replace(",", "").astype(int)
electrical_power.head()
SD | Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-01-01 | 418674 | 232291 | 191982 | 4641 | -3854 | 7020 | 451 | 0 | 0 | 45222 | 0 | -8521 | 0 | 0 | 887904 |
1 | 2010-01-02 | 456961 | 274253 | 191019 | 4826 | -2349 | 10138 | 0 | 0 | 0 | 6373 | 0 | -7737 | 0 | 0 | 933483 |
2 | 2010-01-03 | 464772 | 278136 | 190175 | 4123 | -3897 | 4171 | 0 | 0 | 0 | 18786 | 0 | -6051 | 0 | 0 | 950214 |
3 | 2010-01-04 | 482219 | 474204 | 191406 | 4703 | -3632 | 5973 | 240 | 0 | 0 | -31833 | 0 | -7576 | 0 | 0 | 1115702 |
4 | 2010-01-05 | 468306 | 505552 | 189244 | 4025 | -2957 | 12493 | 0 | 4187 | 0 | -36205 | 0 | -7370 | 0 | 0 | 1137275 |
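The comma-stripping loop above could equally be handled at read time; a sketch using pandas' built-in thousands-separator and date parsing:
# Equivalent load (sketch): parse thousands separators and dates in one call
electrical_power = pd.read_csv(file_loc, sep=",", thousands=",", parse_dates=['SD'])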
electrical_power.describe()
Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 |
mean | 296251.636 | 174270.655 | 166523.761 | 9613.852 | -2332.465 | 72784.642 | 115.480 | 34.112 | 27571.659 | 26374.237 | 14814.840 | -2605.797 | -725.331 | 2907.756 | 785597.747 |
std | 96474.591 | 152234.532 | 26948.248 | 4990.095 | 2391.372 | 63741.420 | 410.162 | 377.533 | 22159.892 | 21640.888 | 8956.654 | 3799.474 | 4592.563 | 6709.739 | 125511.063 |
min | 73712.000 | 0.000 | 63955.000 | 1376.000 | -13525.000 | 208.000 | 0.000 | 0.000 | 0.000 | -48656.000 | -14850.000 | -10926.000 | -13749.000 | -17305.000 | 434418.000 |
25% | 222355.000 | 23321.000 | 148315.000 | 5423.000 | -3842.000 | 23541.000 | 0.000 | 0.000 | 1095.000 | 17164.000 | 8726.000 | -5282.000 | -3655.000 | 0.000 | 693251.000 |
50% | 289289.000 | 147463.000 | 170340.000 | 8748.000 | -2330.000 | 53901.000 | 0.000 | 0.000 | 26006.000 | 33311.000 | 18664.000 | -2667.000 | 0.000 | 0.000 | 785513.000 |
75% | 368911.000 | 303809.000 | 187582.000 | 13173.000 | -855.000 | 103298.000 | 33.000 | 0.000 | 47485.000 | 43864.000 | 22221.000 | 0.000 | 944.000 | 0.000 | 866373.000 |
max | 581031.000 | 590132.000 | 223686.000 | 27054.000 | 9143.000 | 328223.000 | 6924.000 | 11761.000 | 79590.000 | 71646.000 | 24729.000 | 9103.000 | 12096.000 | 24100.000 | 1174131.000 |
file_loc = "ukbankholidays.csv"
uk_bank_holidays = pd.read_csv(file_loc, sep=",")
uk_bank_holidays['UK BANK HOLIDAYS'] = pd.to_datetime(uk_bank_holidays['UK BANK HOLIDAYS'])
uk_bank_holidays['holiday_flag'] = 1
uk_bank_holidays.head()
UK BANK HOLIDAYS | holiday_flag | |
---|---|---|
0 | 1998-01-01 | 1 |
1 | 1998-04-10 | 1 |
2 | 1998-04-13 | 1 |
3 | 1998-05-04 | 1 |
4 | 1998-05-25 | 1 |
# Function to map a date to the Monday starting its week
week_start_date = lambda date: date - timedelta(days=date.weekday())
# Apply the function to the holiday dates
uk_bank_holidays['week_start_date'] = uk_bank_holidays['UK BANK HOLIDAYS'].apply(week_start_date)
uk_bank_holidays.head()
UK BANK HOLIDAYS | holiday_flag | week_start_date | |
---|---|---|---|
0 | 1998-01-01 | 1 | 1997-12-29 |
1 | 1998-04-10 | 1 | 1998-04-06 |
2 | 1998-04-13 | 1 | 1998-04-13 |
3 | 1998-05-04 | 1 | 1998-05-04 |
4 | 1998-05-25 | 1 | 1998-05-25 |
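The same mapping can also be done without apply; a vectorized equivalent (a sketch):
# Vectorized equivalent of the week_start_date lambda
uk_bank_holidays['week_start_date'] = (uk_bank_holidays['UK BANK HOLIDAYS']
                                       - pd.to_timedelta(uk_bank_holidays['UK BANK HOLIDAYS'].dt.weekday, unit='D'))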
# Weekly aggregation
uk_bank_holidays_weekly_agg = uk_bank_holidays.groupby(['week_start_date'])['holiday_flag'].sum().reset_index()
uk_bank_holidays_weekly_agg = uk_bank_holidays_weekly_agg[(uk_bank_holidays_weekly_agg.week_start_date >= '2010-08-02') & (uk_bank_holidays_weekly_agg.week_start_date < '2021-04-05')]
uk_bank_holidays_weekly_agg.tail()
week_start_date | holiday_flag | |
---|---|---|
158 | 2020-05-25 | 1 |
159 | 2020-08-31 | 1 |
160 | 2020-12-21 | 1 |
161 | 2020-12-28 | 2 |
162 | 2021-03-29 | 1 |
all_data = pd.merge(tfl_hires, electrical_power, left_on='Day', right_on='SD', how='inner')
all_data = pd.merge(all_data, uk_bank_holidays, left_on='Day', right_on='UK BANK HOLIDAYS', how='left')
all_data['holiday_flag'].fillna(0, inplace=True)
#all_data = all_data[all_data['Day'] >= '2012-01-02']
all_data.head()
Day | Number of Bicycle Hires | SD | Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | UK BANK HOLIDAYS | holiday_flag | week_start_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-07-30 | 6897 | 2010-07-30 | 431920 | 184603 | 122467 | 4417 | -4582 | 4841 | 15 | 0 | 0 | 44898 | 0 | -7411 | 0 | 0 | 781168 | NaT | 0.000 | NaT |
1 | 2010-07-31 | 5564 | 2010-07-31 | 406077 | 111091 | 121983 | 4604 | 726 | 7013 | 0 | 0 | 0 | 46443 | 0 | -4932 | 0 | 0 | 693004 | NaT | 0.000 | NaT |
2 | 2010-08-01 | 4303 | 2010-08-01 | 393442 | 109041 | 126746 | 4839 | -6091 | 4264 | 0 | 0 | 0 | 47760 | 0 | -5775 | 0 | 0 | 674225 | NaT | 0.000 | NaT |
3 | 2010-08-02 | 6642 | 2010-08-02 | 429981 | 190693 | 122512 | 3638 | -2698 | 866 | 0 | 0 | 0 | 45391 | 0 | -7895 | 0 | 0 | 782488 | NaT | 0.000 | NaT |
4 | 2010-08-03 | 7966 | 2010-08-03 | 433955 | 182201 | 125603 | 3594 | -4137 | 5358 | 4 | 0 | 0 | 45788 | 0 | -7593 | 0 | 0 | 784771 | NaT | 0.000 | NaT |
# Map each day to its week start, reusing the function defined above
all_data['week_start_date'] = all_data['Day'].apply(week_start_date)
all_data.head()
Day | Number of Bicycle Hires | SD | Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | UK BANK HOLIDAYS | holiday_flag | week_start_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-07-30 | 6897 | 2010-07-30 | 431920 | 184603 | 122467 | 4417 | -4582 | 4841 | 15 | 0 | 0 | 44898 | 0 | -7411 | 0 | 0 | 781168 | NaT | 0.000 | 2010-07-26 |
1 | 2010-07-31 | 5564 | 2010-07-31 | 406077 | 111091 | 121983 | 4604 | 726 | 7013 | 0 | 0 | 0 | 46443 | 0 | -4932 | 0 | 0 | 693004 | NaT | 0.000 | 2010-07-26 |
2 | 2010-08-01 | 4303 | 2010-08-01 | 393442 | 109041 | 126746 | 4839 | -6091 | 4264 | 0 | 0 | 0 | 47760 | 0 | -5775 | 0 | 0 | 674225 | NaT | 0.000 | 2010-07-26 |
3 | 2010-08-02 | 6642 | 2010-08-02 | 429981 | 190693 | 122512 | 3638 | -2698 | 866 | 0 | 0 | 0 | 45391 | 0 | -7895 | 0 | 0 | 782488 | NaT | 0.000 | 2010-08-02 |
4 | 2010-08-03 | 7966 | 2010-08-03 | 433955 | 182201 | 125603 | 3594 | -4137 | 5358 | 4 | 0 | 0 | 45788 | 0 | -7593 | 0 | 0 | 784771 | NaT | 0.000 | 2010-08-02 |
# Weekly aggregation
all_data_weekly_agg = all_data.groupby(['week_start_date'])[['Number of Bicycle Hires','Net Supply','Wind','holiday_flag','Oil','Gas','Coal','Nuclear','Hydro','Biomass']].sum().reset_index()
all_data_weekly_agg = all_data_weekly_agg[(all_data_weekly_agg.week_start_date >= '2010-08-02') & (all_data_weekly_agg.week_start_date < '2021-04-05')]
all_data_weekly_agg.tail()
week_start_date | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | Oil | Gas | Coal | Nuclear | Hydro | Biomass | |
---|---|---|---|---|---|---|---|---|---|---|---|
548 | 2021-01-25 | 89862 | 5729779 | 1159736 | 0.000 | 0 | 2347211 | 231920 | 935919 | 53618 | 445087 |
549 | 2021-02-01 | 96027 | 5614122 | 1475041 | 0.000 | 0 | 2113964 | 161547 | 871946 | 39360 | 370132 |
550 | 2021-02-08 | 78706 | 6019601 | 1685621 | 0.000 | 0 | 2200012 | 227474 | 863094 | 31088 | 448276 |
551 | 2021-02-15 | 146239 | 5123511 | 1686256 | 0.000 | 0 | 1473505 | 144520 | 805809 | 96079 | 412448 |
552 | 2021-02-22 | 189834 | 5005908 | 921676 | 0.000 | 0 | 2089236 | 52005 | 720337 | 117181 | 443975 |
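An alternative to grouping on a derived column is resampling directly on the daily index; a sketch, assuming Monday-start weeks labelled by their first day:
# Sketch: weekly sums labelled by the Monday starting each week
cols = ['Number of Bicycle Hires', 'Net Supply', 'Wind', 'holiday_flag']
weekly = (all_data.set_index('Day')[cols]
          .resample('W-MON', closed='left', label='left').sum().reset_index())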
all_data_weekly_agg[1:].describe()
Number of Bicycle Hires | Net Supply | Wind | holiday_flag | Oil | Gas | Coal | Nuclear | Hydro | Biomass | |
---|---|---|---|---|---|---|---|---|---|---|
count | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 |
mean | 181133.209 | 5465831.740 | 530140.341 | 0.156 | 207.817 | 2012965.004 | 1195372.577 | 1171935.414 | 68999.978 | 201628.272 |
std | 55697.534 | 782731.845 | 370262.719 | 0.423 | 1369.211 | 513284.746 | 1053870.308 | 179564.534 | 33362.476 | 144890.191 |
min | 40461.000 | 3563468.000 | 38826.000 | 0.000 | 0.000 | 772756.000 | 0.000 | 542422.000 | 13011.000 | 0.000 |
25% | 142569.000 | 4901843.500 | 242832.500 | 0.000 | 0.000 | 1623279.000 | 166239.500 | 1055058.500 | 40051.500 | 81995.000 |
50% | 173771.000 | 5423785.000 | 442213.000 | 0.000 | 0.000 | 1960075.000 | 1036107.000 | 1199365.000 | 65351.000 | 179465.000 |
75% | 225287.000 | 5990978.500 | 742132.500 | 0.000 | 0.000 | 2388277.500 | 2132900.000 | 1311847.500 | 91900.500 | 333117.500 |
max | 362925.000 | 7662840.000 | 1944082.000 | 2.000 | 23115.000 | 3484628.000 | 3933653.000 | 1546201.000 | 164810.000 | 526591.000 |
fig = px.line(all_data, x='Day', y="Number of Bicycle Hires", title='Daily Number of Bicycle Hires')
fig.show()
fig = px.line(all_data_weekly_agg, x='week_start_date', y="Number of Bicycle Hires", title='Weekly Number of Bicycle Hires')
fig.show()
fig = px.line(all_data, x='SD', y="Net Supply", title='Daily Net Supply Electrical Power')
fig.show()
fig = px.line(all_data_weekly_agg, x='week_start_date', y="Net Supply", title='Weekly Net Supply Electrical Power')
fig.show()
all_data_weekly_agg.iloc[:, 1:].corr()
Number of Bicycle Hires | Net Supply | Wind | holiday_flag | Oil | Gas | Coal | Nuclear | Hydro | Biomass | |
---|---|---|---|---|---|---|---|---|---|---|
Number of Bicycle Hires | 1.000 | -0.732 | -0.073 | -0.204 | -0.194 | -0.150 | -0.483 | -0.147 | -0.413 | 0.248 |
Net Supply | -0.732 | 1.000 | -0.185 | -0.119 | 0.291 | 0.203 | 0.765 | 0.361 | 0.310 | -0.430 |
Wind | -0.073 | -0.185 | 1.000 | -0.025 | -0.137 | -0.113 | -0.525 | -0.410 | 0.447 | 0.660 |
holiday_flag | -0.204 | -0.119 | -0.025 | 1.000 | -0.056 | -0.143 | -0.041 | 0.100 | 0.036 | -0.005 |
Oil | -0.194 | 0.291 | -0.137 | -0.056 | 1.000 | 0.129 | 0.248 | 0.074 | -0.032 | -0.185 |
Gas | -0.150 | 0.203 | -0.113 | -0.143 | 0.129 | 1.000 | -0.256 | -0.050 | -0.132 | 0.111 |
Coal | -0.483 | 0.765 | -0.525 | -0.041 | 0.248 | -0.256 | 1.000 | 0.352 | 0.085 | -0.765 |
Nuclear | -0.147 | 0.361 | -0.410 | 0.100 | 0.074 | -0.050 | 0.352 | 1.000 | -0.057 | -0.365 |
Hydro | -0.413 | 0.310 | 0.447 | 0.036 | -0.032 | -0.132 | 0.085 | -0.057 | 1.000 | 0.159 |
Biomass | 0.248 | -0.430 | 0.660 | -0.005 | -0.185 | 0.111 | -0.765 | -0.365 | 0.159 | 1.000 |
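Net Supply has the strongest (negative) linear association with hires (-0.732), which motivates the reduced feature set kept below. Pearson correlation assumes a linear relationship, so a rank-based check is a quick robustness test (a sketch):
# Sketch: Spearman rank correlation as a robustness check on the Pearson values
all_data_weekly_agg.iloc[:, 1:].corr(method='spearman')[['Number of Bicycle Hires']]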
all_data_weekly_agg = all_data_weekly_agg[['week_start_date','Number of Bicycle Hires','Net Supply','Wind','holiday_flag']]
all_data_weekly_agg = all_data_weekly_agg[all_data_weekly_agg['week_start_date'] >= '2012-01-02']
train_end_date = '2018-06-03'
def setup_train_test(data_df = all_data_weekly_agg, train_end_date = '2018-06-03', feat_name = 'Number of Bicycle Hires'):
    """Split the input data frame data_df into train and test sets.

    Training uses all weeks before train_end_date; the remaining weeks form
    the out-of-sample test set.
    data_df must have week_start_date and feat_name as columns.
    Returns train_df and test_df with columns ds and y, where ds corresponds
    to week_start_date and y corresponds to feat_name.
    """
    sel = data_df.week_start_date < train_end_date
    train_df = data_df.loc[sel].rename(columns={'week_start_date':'ds', feat_name:'y'})
    test_df = data_df.loc[~sel].rename(columns={'week_start_date':'ds', feat_name:'y'})
    return train_df, test_df
def prophet_fittransform(train_df, test_df):
    """Initialise a Prophet model, fit it on train_df, and generate forecasts.

    Inputs train_df and test_df are data frames with columns 'ds' and 'y'.
    Returns fcst_train_df (in-sample fit) and future_df (out-of-sample
    forecast), each with the following columns:
    'ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
    'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
    'yearly', 'yearly_lower', 'yearly_upper', 'multiplicative_terms',
    'multiplicative_terms_lower', 'multiplicative_terms_upper', 'yhat'
    """
    m = Prophet()
    m.fit(train_df)
    fcst_train_df = m.predict(train_df)
    # Forecast four years of weekly points; freq="W" lands on Sundays, so shift one day to Mondays
    future = m.make_future_dataframe(periods = 52*4, freq = "W", include_history = False)
    future.ds = future.ds + pd.DateOffset(days=1)
    future_df = m.predict(future)
    return fcst_train_df, future_df
def prophet_exogenous_fittransform(train_df, test_df):
    """Initialise a Prophet model with exogenous regressors, fit it on
    train_df, and generate forecasts for test_df.

    Inputs train_df and test_df are data frames with columns 'ds', 'y' and
    the regressor columns 'Net Supply', 'Wind' and 'holiday_flag'.
    Returns fcst_train_df and future_df with the same columns as
    prophet_fittransform, plus one component per regressor.
    """
    m = Prophet()
    m.add_regressor('Net Supply')
    m.add_regressor('Wind')
    m.add_regressor('holiday_flag')
    m.fit(train_df)
    fcst_train_df = m.predict(train_df)
    test_df['holiday_flag'].fillna(0, inplace=True)  # future weeks have no recorded holidays yet
    future_df = m.predict(test_df)
    return fcst_train_df, future_df
train_df, test_df = setup_train_test(all_data_weekly_agg, feat_name = 'Number of Bicycle Hires');
fcst_train_df, future_df = prophet_fittransform(train_df, test_df);
fcst_train_df.tail(1)
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
334 | 2018-05-28 | 205674.066 | 204569.499 | 268317.522 | 205674.066 | 205674.066 | 32448.857 | 32448.857 | 32448.857 | 32448.857 | 32448.857 | 32448.857 | 0.000 | 0.000 | 0.000 | 238122.923 |
future_df.head(1)
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-06-04 | 205785.430 | 208586.191 | 272578.220 | 205785.430 | 205785.430 | 33550.908 | 33550.908 | 33550.908 | 33550.908 | 33550.908 | 33550.908 | 0.000 | 0.000 | 0.000 | 239336.338 |
all_df = pd.merge(fcst_train_df[['ds','yhat_lower','yhat','yhat_upper']].append(future_df[['ds','yhat_lower','yhat','yhat_upper']]), train_df.append(test_df), on='ds', how='outer', indicator=True).sort_values(by = 'ds')
all_df[all_df['_merge'] == 'both'].tail()
ds | yhat_lower | yhat | yhat_upper | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|
473 | 2021-01-25 | 142140.137 | 174821.595 | 204773.968 | 89862.000 | 5729779.000 | 1159736.000 | 0.000 | both |
474 | 2021-02-01 | 134033.487 | 167645.156 | 200903.099 | 96027.000 | 5614122.000 | 1475041.000 | 0.000 | both |
475 | 2021-02-08 | 132889.431 | 164903.434 | 196646.538 | 78706.000 | 6019601.000 | 1685621.000 | 0.000 | both |
476 | 2021-02-15 | 138297.982 | 169944.940 | 201417.275 | 146239.000 | 5123511.000 | 1686256.000 | 0.000 | both |
477 | 2021-02-22 | 145912.575 | 178920.109 | 211556.887 | 189834.000 | 5005908.000 | 921676.000 | 0.000 | both |
all_df.tail(1)
ds | yhat_lower | yhat | yhat_upper | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|
542 | 2022-05-23 | 230947.231 | 261068.561 | 293231.810 | NaN | NaN | NaN | NaN | left_only |
all_df.head(1)
ds | yhat_lower | yhat | yhat_upper | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|
0 | 2012-01-02 | 58395.670 | 91833.433 | 124634.229 | 88068.000 | 6449422.000 | 403325.000 | 1.000 | both |
prophet_predictions = all_df[['ds','yhat']].rename(columns={'yhat':'prophet_forecast'})
prophet_predictions.head()
ds | prophet_forecast | |
---|---|---|
0 | 2012-01-02 | 91833.433 |
1 | 2012-01-09 | 110588.773 |
2 | 2012-01-16 | 125294.257 |
3 | 2012-01-23 | 127627.864 |
4 | 2012-01-30 | 121147.356 |
# Hide in-sample predictions so the plot shows only the out-of-sample forecast
all_df.loc[all_df['ds'] < train_end_date, ['yhat', 'yhat_lower', 'yhat_upper']] = np.nan
all_df_melt = pd.melt(all_df[['ds','y','yhat']], id_vars=['ds'], value_vars=all_df[['ds','y','yhat']].columns[1:3])
all_df_melt[(all_df_melt['variable'] == 'y') & (all_df_melt['value'] > 0)].tail()
ds | variable | value | |
---|---|---|---|
473 | 2021-01-25 | y | 89862.000 |
474 | 2021-02-01 | y | 96027.000 |
475 | 2021-02-08 | y | 78706.000 |
476 | 2021-02-15 | y | 146239.000 |
477 | 2021-02-22 | y | 189834.000 |
fig = px.line(all_df_melt, x='ds', y="value", color='variable', title='Actuals vs Forecast')
fig.show()
# To score the exogenous model out of sample we first need future values of the
# regressors themselves, so Net Supply and Wind are each forecast with Prophet.
train_df, test_df = setup_train_test(all_data_weekly_agg, feat_name = 'Net Supply');
fcst_train_df, future_df = prophet_fittransform(train_df, test_df);
net_supply_df = fcst_train_df.append(future_df)[['ds','yhat']].rename(columns={'yhat':'Net Supply Forecast'})
net_supply_df.head()
ds | Net Supply Forecast | |
---|---|---|
0 | 2012-01-02 | 6426779.663 |
1 | 2012-01-09 | 6791505.989 |
2 | 2012-01-16 | 7053955.255 |
3 | 2012-01-23 | 7082898.467 |
4 | 2012-01-30 | 6961743.168 |
train_df, test_df = setup_train_test(all_data_weekly_agg, feat_name = 'Wind');
fcst_train_df, future_df = prophet_fittransform(train_df, test_df);
wind_df = fcst_train_df.append(future_df)[['ds','yhat']].rename(columns={'yhat':'Wind Forecast'})
wind_df.head()
ds | Wind Forecast | |
---|---|---|
0 | 2012-01-02 | 439172.815 |
1 | 2012-01-09 | 391057.719 |
2 | 2012-01-16 | 374984.594 |
3 | 2012-01-23 | 391115.221 |
4 | 2012-01-30 | 412263.634 |
wind_df.tail()
ds | Wind Forecast | |
---|---|---|
203 | 2022-04-25 | 1226838.891 |
204 | 2022-05-02 | 1220680.546 |
205 | 2022-05-09 | 1190385.426 |
206 | 2022-05-16 | 1144714.717 |
207 | 2022-05-23 | 1105112.537 |
all_data_weekly_agg.tail()
week_start_date | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | |
---|---|---|---|---|---|
548 | 2021-01-25 | 89862 | 5729779 | 1159736 | 0.000 |
549 | 2021-02-01 | 96027 | 5614122 | 1475041 | 0.000 |
550 | 2021-02-08 | 78706 | 6019601 | 1685621 | 0.000 |
551 | 2021-02-15 | 146239 | 5123511 | 1686256 | 0.000 |
552 | 2021-02-22 | 189834 | 5005908 | 921676 | 0.000 |
all_data_weekly_agg_exog = all_df[['ds']].merge(wind_df,on='ds',how='left')\
                                         .merge(net_supply_df,on='ds',how='left')\
                                         .merge(all_data_weekly_agg.rename(columns={'week_start_date':'ds'}),on='ds',how='left')
all_data_weekly_agg_exog.tail()
ds | Wind Forecast | Net Supply Forecast | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | |
---|---|---|---|---|---|---|---|
538 | 2022-04-25 | 1226838.891 | 4707844.244 | NaN | NaN | NaN | NaN |
539 | 2022-05-02 | 1220680.546 | 4651678.701 | NaN | NaN | NaN | NaN |
540 | 2022-05-09 | 1190385.426 | 4527937.751 | NaN | NaN | NaN | NaN |
541 | 2022-05-16 | 1144714.717 | 4392453.905 | NaN | NaN | NaN | NaN |
542 | 2022-05-23 | 1105112.537 | 4325390.245 | NaN | NaN | NaN | NaN |
# Beyond the training cut-off, substitute forecast values for the actual regressors
all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Net Supply'] = all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Net Supply Forecast']
all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Wind'] = all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Wind Forecast']
all_data_weekly_agg_exog.tail()
ds | Wind Forecast | Net Supply Forecast | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | |
---|---|---|---|---|---|---|---|
538 | 2022-04-25 | 1226838.891 | 4707844.244 | NaN | 4707844.244 | 1226838.891 | NaN |
539 | 2022-05-02 | 1220680.546 | 4651678.701 | NaN | 4651678.701 | 1220680.546 | NaN |
540 | 2022-05-09 | 1190385.426 | 4527937.751 | NaN | 4527937.751 | 1190385.426 | NaN |
541 | 2022-05-16 | 1144714.717 | 4392453.905 | NaN | 4392453.905 | 1144714.717 | NaN |
542 | 2022-05-23 | 1105112.537 | 4325390.245 | NaN | 4325390.245 | 1105112.537 | NaN |
train_df.tail()
ds | Number of Bicycle Hires | Net Supply | y | holiday_flag | |
---|---|---|---|---|---|
405 | 2018-04-30 | 227037 | 4766576 | 693956 | 0.000 |
406 | 2018-05-07 | 250491 | 4466088 | 421530 | 1.000 |
407 | 2018-05-14 | 266541 | 4483364 | 460511 | 0.000 |
408 | 2018-05-21 | 252114 | 4443748 | 575373 | 0.000 |
409 | 2018-05-28 | 249072 | 4545699 | 209128 | 1.000 |
train_df, test_df = setup_train_test(all_data_weekly_agg_exog.rename(columns={'ds':'week_start_date'}), feat_name = 'Number of Bicycle Hires');
fcst_train_df, future_df = prophet_exogenous_fittransform(train_df, test_df);
all_df = pd.merge(fcst_train_df[['ds','yhat_lower','yhat','yhat_upper']].append(future_df[['ds','yhat_lower','yhat','yhat_upper']]), train_df.append(test_df), on='ds', how='outer', indicator=True).sort_values(by = 'ds')
all_df[all_df['_merge'] == 'both'].tail()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|---|---|
538 | 2022-04-25 | 188495.295 | 216549.781 | 246400.555 | 1226838.891 | 4707844.244 | NaN | 4707844.244 | 1226838.891 | 0.000 | both |
539 | 2022-05-02 | 197412.133 | 225206.220 | 253193.347 | 1220680.546 | 4651678.701 | NaN | 4651678.701 | 1220680.546 | 0.000 | both |
540 | 2022-05-09 | 205919.757 | 234841.964 | 263557.229 | 1190385.426 | 4527937.751 | NaN | 4527937.751 | 1190385.426 | 0.000 | both |
541 | 2022-05-16 | 213556.027 | 242159.073 | 269296.515 | 1144714.717 | 4392453.905 | NaN | 4392453.905 | 1144714.717 | 0.000 | both |
542 | 2022-05-23 | 217361.077 | 245212.330 | 271235.146 | 1105112.537 | 4325390.245 | NaN | 4325390.245 | 1105112.537 | 0.000 | both |
# Hide in-sample predictions so the plot shows only the out-of-sample forecast
all_df.loc[all_df['ds'] < train_end_date, ['yhat', 'yhat_lower', 'yhat_upper']] = np.nan
all_df_melt = pd.melt(all_df[['ds','y','yhat']], id_vars=['ds'], value_vars=all_df[['ds','y','yhat']].columns[1:3])
all_df_melt[(all_df_melt['variable'] == 'y') & (all_df_melt['value'] > 0)].tail()
ds | variable | value | |
---|---|---|---|
473 | 2021-01-25 | y | 89862.000 |
474 | 2021-02-01 | y | 96027.000 |
475 | 2021-02-08 | y | 78706.000 |
476 | 2021-02-15 | y | 146239.000 |
477 | 2021-02-22 | y | 189834.000 |
fig = px.line(all_df_melt, x='ds', y="value", color='variable', title='Actuals vs Forecast')
fig.show()
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(12).mean()
    rolstd = timeseries.rolling(12).std()
    # Plot rolling statistics
    fig = plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    # Perform the augmented Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
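The ADF test's null hypothesis is a unit root; the KPSS test has the opposite null (stationarity), so agreement between the two strengthens the conclusion. A complementary sketch:
# Sketch: KPSS test as a complement to the ADF test above
from statsmodels.tsa.stattools import kpss
def test_kpss(timeseries):
    stat, pvalue, lags, crit = kpss(timeseries.dropna(), regression='c', nlags='auto')
    print('KPSS statistic: %.3f, p-value: %.3f' % (stat, pvalue))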
# 52-week seasonal difference to remove yearly seasonality before testing
all_data_weekly_agg['seasonal_difference'] = all_data_weekly_agg['Number of Bicycle Hires'] - all_data_weekly_agg['Number of Bicycle Hires'].shift(52)
all_data_weekly_agg.head()
week_start_date | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | seasonal_difference | |
---|---|---|---|---|---|---|
75 | 2012-01-02 | 88068 | 6449422 | 403325 | 1.000 | NaN |
76 | 2012-01-09 | 127540 | 6720281 | 228344 | 0.000 | NaN |
77 | 2012-01-16 | 117606 | 6829803 | 350157 | 0.000 | NaN |
78 | 2012-01-23 | 118809 | 6818013 | 185454 | 0.000 | NaN |
79 | 2012-01-30 | 100319 | 7327197 | 179558 | 0.000 | NaN |
test_stationarity(all_data_weekly_agg['seasonal_difference'].iloc[53:])
Results of Dickey-Fuller Test:
Test Statistic                  -5.656
p-value                          0.000
#Lags Used                       4.000
Number of Observations Used    420.000
Critical Value (1%)             -3.446
Critical Value (5%)             -2.868
Critical Value (10%)            -2.570
dtype: float64
fig = plt.figure(figsize=(18,16))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(all_data_weekly_agg.iloc[53:420]['seasonal_difference'], lags=170, ax=ax1)
fig = plt.figure(figsize=(18,16))
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(all_data_weekly_agg.iloc[53:420]['seasonal_difference'], lags=170, ax=ax2)
# Seasonal period must be an integer; 52.25 would be silently truncated to 52
mod = sm.tsa.statespace.SARIMAX(all_df.rename(columns={'y':'Number of Bicycle Hires'}).iloc[:335]['Number of Bicycle Hires'], trend='c', order=(2,0,0), seasonal_order=(3,1,0,52))
results = mod.fit()
print(results.summary())
                                     SARIMAX Results
==========================================================================================
Dep. Variable:        Number of Bicycle Hires   No. Observations:                      335
Model:          SARIMAX(2, 0, 0)x(3, 1, 0, 52)   Log Likelihood                 -3297.077
Date:                         Sun, 12 Sep 2021   AIC                             6608.153
Time:                                 20:01:22   BIC                             6633.671
Sample:                                      0   HQIC                            6618.385
                                         - 335
Covariance Type:                           opg
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept    786.2430   2986.142      0.263      0.792   -5066.488    6638.973
ar.L1          0.5861      0.079      7.412      0.000       0.431       0.741
ar.L2          0.2194      0.078      2.807      0.005       0.066       0.373
ar.S.L52      -0.5744      0.079     -7.230      0.000      -0.730      -0.419
ar.S.L104     -0.1669      0.102     -1.632      0.103      -0.367       0.034
ar.S.L156     -0.0914      0.076     -1.202      0.229      -0.240       0.058
sigma2      1.018e+09      0.028    3.7e+10      0.000    1.02e+09    1.02e+09
===================================================================================
Ljung-Box (L1) (Q):                   2.31   Jarque-Bera (JB):                 3.78
Prob(Q):                              0.13   Prob(JB):                         0.15
Heteroskedasticity (H):               0.82   Skew:                             0.17
Prob(H) (two-sided):                  0.35   Kurtosis:                         3.46
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.12e+26. Standard errors may be unstable.
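The (2,0,0)x(3,1,0,52) order above was chosen manually from the ACF/PACF plots; a small AIC grid search over the non-seasonal terms is one way to sanity-check that choice. A sketch, with the seasonal part held fixed:
# Sketch: AIC grid search over non-seasonal (p, q), seasonal part held fixed
import itertools
series = all_df.rename(columns={'y':'Number of Bicycle Hires'}).iloc[:335]['Number of Bicycle Hires']
best_aic, best_order = np.inf, None
for p, q in itertools.product(range(3), range(3)):
    try:
        res = sm.tsa.statespace.SARIMAX(series, trend='c', order=(p, 0, q),
                                        seasonal_order=(3, 1, 0, 52)).fit(disp=False)
        if res.aic < best_aic:
            best_aic, best_order = res.aic, (p, 0, q)
    except Exception:
        continue
print('Best non-seasonal order by AIC:', best_order)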
len(all_df)
543
all_df['sarima_forecast'] = results.predict(start = 336, end= len(all_df), dynamic= True)
all_df.head()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | sarima_forecast | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012-01-02 | NaN | NaN | NaN | 439172.815 | 6426779.663 | 88068.000 | 6449422.000 | 403325.000 | 1.000 | both | NaN |
1 | 2012-01-09 | NaN | NaN | NaN | 391057.719 | 6791505.989 | 127540.000 | 6720281.000 | 228344.000 | 0.000 | both | NaN |
2 | 2012-01-16 | NaN | NaN | NaN | 374984.594 | 7053955.255 | 117606.000 | 6829803.000 | 350157.000 | 0.000 | both | NaN |
3 | 2012-01-23 | NaN | NaN | NaN | 391115.221 | 7082898.467 | 118809.000 | 6818013.000 | 185454.000 | 0.000 | both | NaN |
4 | 2012-01-30 | NaN | NaN | NaN | 412263.634 | 6961743.168 | 100319.000 | 7327197.000 | 179558.000 | 0.000 | both | NaN |
subset_all_df = all_df[['ds','y','sarima_forecast']].rename(columns={'y':'Number of Bicycle Hires'})
all_data_weekly_agg_melt = pd.melt(subset_all_df, id_vars=['ds'], value_vars=subset_all_df.columns[1:3])
fig = px.line(all_data_weekly_agg_melt, x='ds', y="value", color='variable', title='Weekly Number of Bicycle Hires')
fig.show()
exogenous = sm.add_constant(all_df.iloc[:335][['Net Supply','Wind','holiday_flag']])
mod = sm.tsa.statespace.SARIMAX(all_df.rename(columns={'y':'Number of Bicycle Hires'}).iloc[:335]['Number of Bicycle Hires'], order=(2,0,0), seasonal_order=(3,1,0,52), exog = exogenous)
results = mod.fit()
print(results.summary())
                                     SARIMAX Results
==========================================================================================
Dep. Variable:        Number of Bicycle Hires   No. Observations:                      335
Model:          SARIMAX(2, 0, 0)x(3, 1, 0, 52)   Log Likelihood                 -3281.621
Date:                         Sun, 12 Sep 2021   AIC                             6583.242
Time:                                 20:06:12   BIC                             6619.696
Sample:                                      0   HQIC                            6597.859
                                         - 335
Covariance Type:                           opg
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const         1.285e-05   1.13e+04   1.14e-09      1.000   -2.21e+04    2.21e+04
Net Supply       0.0028      0.003      0.915      0.360      -0.003       0.009
Wind            -0.0152      0.009     -1.704      0.088      -0.033       0.002
holiday_flag -3.194e+04   3636.525     -8.782      0.000   -3.91e+04   -2.48e+04
ar.L1            0.6079      0.055     11.127      0.000       0.501       0.715
ar.L2            0.2535      0.058      4.379      0.000       0.140       0.367
ar.S.L52        -0.6432      0.062    -10.374      0.000      -0.765      -0.522
ar.S.L104       -0.3208      0.079     -4.049      0.000      -0.476      -0.166
ar.S.L156       -0.2386      0.072     -3.294      0.001      -0.381      -0.097
sigma2        6.929e+08      0.134   5.17e+09      0.000    6.93e+08    6.93e+08
===================================================================================
Ljung-Box (L1) (Q):                   2.17   Jarque-Bera (JB):                 5.75
Prob(Q):                              0.14   Prob(JB):                         0.06
Heteroskedasticity (H):               0.83   Skew:                             0.09
Prob(H) (two-sided):                  0.36   Kurtosis:                         3.67
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.57e+24. Standard errors may be unstable.
exogenous = sm.add_constant(all_df.iloc[336:][['Net Supply','Wind','holiday_flag']])
all_df['sarimax_forecast'] = results.predict(start = 336, end= len(all_df) - 2, dynamic= True, exog = exogenous)
all_df.head()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | sarima_forecast | sarimax_forecast | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012-01-02 | NaN | NaN | NaN | 439172.815 | 6426779.663 | 88068.000 | 6449422.000 | 403325.000 | 1.000 | both | NaN | NaN |
1 | 2012-01-09 | NaN | NaN | NaN | 391057.719 | 6791505.989 | 127540.000 | 6720281.000 | 228344.000 | 0.000 | both | NaN | NaN |
2 | 2012-01-16 | NaN | NaN | NaN | 374984.594 | 7053955.255 | 117606.000 | 6829803.000 | 350157.000 | 0.000 | both | NaN | NaN |
3 | 2012-01-23 | NaN | NaN | NaN | 391115.221 | 7082898.467 | 118809.000 | 6818013.000 | 185454.000 | 0.000 | both | NaN | NaN |
4 | 2012-01-30 | NaN | NaN | NaN | 412263.634 | 6961743.168 | 100319.000 | 7327197.000 | 179558.000 | 0.000 | both | NaN | NaN |
subset_all_df = all_df[['ds','y','sarimax_forecast']].rename(columns={'y':'Number of Bicycle Hires'})
all_data_weekly_agg_melt = pd.melt(subset_all_df, id_vars=['ds'], value_vars=subset_all_df.columns[1:3])
fig = px.line(all_data_weekly_agg_melt, x='ds', y="value", color='variable', title='Weekly Number of Bicycle Hires')
fig.show()
model_predictions = all_df[(all_df['ds'] > '2018-06-04') & (all_df['ds'] < '2021-02-22')]
model_predictions = pd.merge(model_predictions, prophet_predictions, how='left', on='ds')
model_predictions.head()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | sarima_forecast | sarimax_forecast | prophet_forecast | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-06-11 | 215448.972 | 243150.015 | 270379.612 | 596148.738 | 4573046.836 | 255914.000 | 4573046.836 | 596148.738 | 0.000 | both | 268538.025 | 270662.818 | 243945.870 |
1 | 2018-06-18 | 217054.729 | 246460.565 | 272819.841 | 610299.236 | 4545541.651 | 283816.000 | 4545541.651 | 610299.236 | 0.000 | both | 255493.970 | 253671.885 | 251855.594 |
2 | 2018-06-25 | 224162.670 | 251311.331 | 280714.036 | 599463.769 | 4536237.346 | 297449.000 | 4536237.346 | 599463.769 | 0.000 | both | 256897.206 | 260177.879 | 260143.583 |
3 | 2018-07-02 | 229373.500 | 256531.273 | 282949.321 | 575149.318 | 4567812.563 | 292421.000 | 4567812.563 | 575149.318 | 0.000 | both | 288777.506 | 282597.913 | 265920.657 |
4 | 2018-07-09 | 233957.934 | 261807.400 | 289277.200 | 562519.209 | 4595342.697 | 285168.000 | 4595342.697 | 562519.209 | 0.000 | both | 263186.600 | 265599.456 | 269181.038 |
y = model_predictions['y']
yhat = model_predictions['prophet_forecast']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of Prophet without exogenous features:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = pd.DataFrame(columns = ['Model Name', 'MAE', 'MAPE', 'RMSE', 'R2'])
# append rows to an empty DataFrame
all_model_results = all_model_results.append({'Model Name' : 'Prophet model without exogenous features', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of Prophet without exogenous features:

MAE: 26477.698869256554
MAPE: 0.17219051305630884
MSE: 1273118983.354026
RMSE: 35680.79291935685
R-Squared: 0.627790672062585
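The same metric block repeats for each model below, so a small reusable helper would remove the duplication. A sketch (the name evaluate_model is hypothetical; it mirrors the block above):
# Sketch: reusable evaluation helper mirroring the metric block above
def evaluate_model(name, y, yhat, results_df):
    mae = metrics.mean_absolute_error(y, yhat)
    mape = metrics.mean_absolute_percentage_error(y, yhat)
    rmse = np.sqrt(metrics.mean_squared_error(y, yhat))
    r2 = metrics.r2_score(y, yhat)
    print(name, '- MAE:', mae, 'MAPE:', mape, 'RMSE:', rmse, 'R2:', r2)
    return results_df.append({'Model Name': name, 'MAE': mae, 'MAPE': mape,
                              'RMSE': rmse, 'R2': r2}, ignore_index=True)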
y = model_predictions['y']
yhat = model_predictions['yhat']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of Prophet with exogenous features:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = all_model_results.append({'Model Name' : 'Prophet model with exogenous features', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of Prophet with exogenous features:

MAE: 23066.04699962848
MAPE: 0.14170796846144773
MSE: 1094680829.524918
RMSE: 33085.96121506701
R-Squared: 0.6799588874324884
y = model_predictions['y']
yhat = model_predictions['sarima_forecast']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of SARIMA:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = all_model_results.append({'Model Name' : 'SARIMA', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of SARIMA:

MAE: 23898.16807298402
MAPE: 0.15087089273006088
MSE: 1134326759.5986726
RMSE: 33679.76780796852
R-Squared: 0.6683679951583588
y = model_predictions['y']
yhat = model_predictions['sarimax_forecast']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of SARIMAX:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = all_model_results.append({'Model Name' : 'SARIMAX', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of SARIMAX:

MAE: 25249.51330256687
MAPE: 0.1517589216379606
MSE: 1123821570.4851322
RMSE: 33523.448069748614
R-Squared: 0.6714392944091996
all_model_results.head()
Model Name | MAE | MAPE | RMSE | R2 | |
---|---|---|---|---|---|
0 | Prophet model without exogenous features | 26477.699 | 0.172 | 35680.793 | 0.628 |
1 | Prophet model with exogenous features | 23066.047 | 0.142 | 33085.961 | 0.680 |
2 | SARIMA | 23898.168 | 0.151 | 33679.768 | 0.668 |
3 | SARIMAX | 25249.513 | 0.152 | 33523.448 | 0.671 |
cv_dates = ['2017-06-05','2018-06-03','2018-12-31','2019-06-03','2019-12-30']
cv_results = pd.DataFrame(columns = ['Date', 'MAE', 'MAPE', 'RMSE', 'R2'])
for date in cv_dates:
    train_df, test_df = setup_train_test(all_data_weekly_agg_exog.rename(columns={'ds':'week_start_date'}), train_end_date = date, feat_name = 'Number of Bicycle Hires')
    fcst_train_df, future_df = prophet_exogenous_fittransform(train_df, test_df)
    all_df = pd.merge(fcst_train_df[['ds','yhat_lower','yhat','yhat_upper']].append(future_df[['ds','yhat_lower','yhat','yhat_upper']]), train_df.append(test_df), on='ds', how='outer', indicator=True).sort_values(by = 'ds')
    # Evaluate each fold on the 52 weeks after its training cut-off
    model_predictions = all_df[(all_df['ds'] > date) & (all_df['ds'] < pd.to_datetime(date) + pd.DateOffset(days=365))]
    y = model_predictions['y']
    yhat = model_predictions['yhat']
    # Compute the metrics before appending, so each row reflects its own fold
    mae = metrics.mean_absolute_error(y, yhat)
    mape = metrics.mean_absolute_percentage_error(y, yhat)
    mse = metrics.mean_squared_error(y, yhat)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y, yhat)
    cv_results = cv_results.append({'Date' : date, 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2}, ignore_index = True)
cv_results
Date | MAE | MAPE | RMSE | R2 | |
---|---|---|---|---|---|
0 | 2017-06-05 | 25249.513 | 0.152 | 33523.448 | 0.671 |
1 | 2018-06-03 | 19145.478 | 0.097 | 24375.423 | 0.786 |
2 | 2018-12-31 | 18403.028 | 0.089 | 23517.073 | 0.771 |
3 | 2019-06-03 | 19643.959 | 0.102 | 24980.060 | 0.663 |
4 | 2019-12-30 | 29203.078 | 0.191 | 42366.462 | 0.415 |
print("Mean of 5-fold rolling window cross-validation results:")
cv_results['Model Name'] = 'Prophet model with exogenous features'
cv_results.drop(['Date'], axis=1).groupby(['Model Name']).mean()
Mean of 5-fold rolling window cross-validation results:
MAE | MAPE | RMSE | R2 | |
---|---|---|---|---|
Model Name | ||||
Prophet model with exogenous features | 22329.011 | 0.126 | 29752.493 | 0.661 |
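fbprophet also ships its own simulated-historical-forecast utilities, which automate the loop above. A sketch, assuming the fitted model object m were exposed by the fit functions (it is currently local to them); the initial/period/horizon values are illustrative:
# Sketch: fbprophet's built-in cross-validation as an alternative to the manual loop
from fbprophet.diagnostics import cross_validation, performance_metrics
cv_df = cross_validation(m, initial='1460 days', period='180 days', horizon='365 days')
print(performance_metrics(cv_df)[['horizon', 'mape', 'rmse']].tail())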