This task is about forecasting how many bikes are rented from the TfL (Transport for London) Cycle Hire scheme.
Specifically, it attempts to answer the question “Can national electrical power generation help estimate how many bikes are hired?”
The idea is that these two series may both be correlated with drivers we have no data on (e.g. the weather).
Note: a clear methodology supported by reasonable justifications is more important than an extremely accurate model.
You should produce a Jupyter Notebook with your solution, and submit a .zip folder containing the file in both .ipynb and .html formats.
# One-time environment setup (uncomment and run as needed):
#!python --version
#!pip install pyflux-0.4.17-cp38-cp38-win_amd64.whl
#!pip install msticpy
#!pip install msrestazure
#!pip install pystan
#import pystan
#!pip install fbprophet
## Alternatively, install fbprophet from the conda prompt:
#!conda install -c conda-forge fbprophet
import os
import warnings
import datetime as dt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns            # Statistical plotting
import plotly.express as px     # Interactive plotting

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import pystan
from fbprophet import Prophet
import sklearn.metrics as metrics

sns.set_theme(style="whitegrid")
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.3f}'.format
# Provide the project path that contains the raw data
project_path = "C:\\Users\\gusahil\\Downloads\\Blockchain"
orig_dir = os.getcwd()
os.chdir(project_path)
os.getcwd()
'C:\\Users\\gusahil\\Downloads\\Blockchain'
file_loc = "tfl-daily-cycle-hires.xlsx"
tfl_hires = pd.read_excel(file_loc, index_col=None, usecols = "A:B", sheet_name = 'Data').sort_values(by=['Day','Number of Bicycle Hires'])
#tfl_hires['month'] = tfl_hires['Day'].dt.strftime('%Y-%m-01')
#tfl_hires['weekday'] = tfl_hires['Day'].dt.weekday
tfl_hires['Day'] = pd.to_datetime(tfl_hires['Day'])
tfl_hires.head()
Day | Number of Bicycle Hires | |
---|---|---|
0 | 2010-07-30 | 6897 |
1 | 2010-07-31 | 5564 |
2 | 2010-08-01 | 4303 |
3 | 2010-08-02 | 6642 |
4 | 2010-08-03 | 7966 |
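Before aggregating, it is worth sanity-checking that the daily series is complete. A minimal sketch, assuming tfl_hires as loaded above:
# Sanity check (sketch): confirm there are no missing days in the hire series
full_range = pd.date_range(tfl_hires['Day'].min(), tfl_hires['Day'].max(), freq='D')
missing = full_range.difference(pd.DatetimeIndex(tfl_hires['Day']))
print(f"Missing days: {len(missing)}")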
file_loc = "electrical_power_data.csv"
electrical_power = pd.read_csv(file_loc, sep=",")
electrical_power['SD'] = pd.to_datetime(electrical_power['SD'])
# Numeric columns arrive as strings with thousands separators; strip and cast
for column in electrical_power.columns[1:]:
    electrical_power[column] = electrical_power[column].str.replace(",", "").astype(int)
electrical_power.head()
SD | Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-01-01 | 418674 | 232291 | 191982 | 4641 | -3854 | 7020 | 451 | 0 | 0 | 45222 | 0 | -8521 | 0 | 0 | 887904 |
1 | 2010-01-02 | 456961 | 274253 | 191019 | 4826 | -2349 | 10138 | 0 | 0 | 0 | 6373 | 0 | -7737 | 0 | 0 | 933483 |
2 | 2010-01-03 | 464772 | 278136 | 190175 | 4123 | -3897 | 4171 | 0 | 0 | 0 | 18786 | 0 | -6051 | 0 | 0 | 950214 |
3 | 2010-01-04 | 482219 | 474204 | 191406 | 4703 | -3632 | 5973 | 240 | 0 | 0 | -31833 | 0 | -7576 | 0 | 0 | 1115702 |
4 | 2010-01-05 | 468306 | 505552 | 189244 | 4025 | -2957 | 12493 | 0 | 4187 | 0 | -36205 | 0 | -7370 | 0 | 0 | 1137275 |
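The comma-stripping loop above could equally be handled at read time; a sketch using pandas' built-in thousands-separator and date parsing:
# Equivalent load (sketch): parse thousands separators and dates in one call
electrical_power = pd.read_csv(file_loc, sep=",", thousands=",", parse_dates=['SD'])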
electrical_power.describe()
Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 | 4113.000 |
mean | 296251.636 | 174270.655 | 166523.761 | 9613.852 | -2332.465 | 72784.642 | 115.480 | 34.112 | 27571.659 | 26374.237 | 14814.840 | -2605.797 | -725.331 | 2907.756 | 785597.747 |
std | 96474.591 | 152234.532 | 26948.248 | 4990.095 | 2391.372 | 63741.420 | 410.162 | 377.533 | 22159.892 | 21640.888 | 8956.654 | 3799.474 | 4592.563 | 6709.739 | 125511.063 |
min | 73712.000 | 0.000 | 63955.000 | 1376.000 | -13525.000 | 208.000 | 0.000 | 0.000 | 0.000 | -48656.000 | -14850.000 | -10926.000 | -13749.000 | -17305.000 | 434418.000 |
25% | 222355.000 | 23321.000 | 148315.000 | 5423.000 | -3842.000 | 23541.000 | 0.000 | 0.000 | 1095.000 | 17164.000 | 8726.000 | -5282.000 | -3655.000 | 0.000 | 693251.000 |
50% | 289289.000 | 147463.000 | 170340.000 | 8748.000 | -2330.000 | 53901.000 | 0.000 | 0.000 | 26006.000 | 33311.000 | 18664.000 | -2667.000 | 0.000 | 0.000 | 785513.000 |
75% | 368911.000 | 303809.000 | 187582.000 | 13173.000 | -855.000 | 103298.000 | 33.000 | 0.000 | 47485.000 | 43864.000 | 22221.000 | 0.000 | 944.000 | 0.000 | 866373.000 |
max | 581031.000 | 590132.000 | 223686.000 | 27054.000 | 9143.000 | 328223.000 | 6924.000 | 11761.000 | 79590.000 | 71646.000 | 24729.000 | 9103.000 | 12096.000 | 24100.000 | 1174131.000 |
file_loc = "ukbankholidays.csv"
uk_bank_holidays = pd.read_csv(file_loc, sep=",")
uk_bank_holidays['UK BANK HOLIDAYS'] = pd.to_datetime(uk_bank_holidays['UK BANK HOLIDAYS'])
uk_bank_holidays['holiday_flag'] = 1
uk_bank_holidays.head()
UK BANK HOLIDAYS | holiday_flag | |
---|---|---|
0 | 1998-01-01 | 1 |
1 | 1998-04-10 | 1 |
2 | 1998-04-13 | 1 |
3 | 1998-05-04 | 1 |
4 | 1998-05-25 | 1 |
# Function to map a date to the Monday starting its week
week_start_date = lambda date: date - timedelta(days=date.weekday())
# Apply the function to the holiday dates
uk_bank_holidays['week_start_date'] = uk_bank_holidays['UK BANK HOLIDAYS'].apply(week_start_date)
uk_bank_holidays.head()
UK BANK HOLIDAYS | holiday_flag | week_start_date | |
---|---|---|---|
0 | 1998-01-01 | 1 | 1997-12-29 |
1 | 1998-04-10 | 1 | 1998-04-06 |
2 | 1998-04-13 | 1 | 1998-04-13 |
3 | 1998-05-04 | 1 | 1998-05-04 |
4 | 1998-05-25 | 1 | 1998-05-25 |
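The same mapping can also be done without apply; a vectorized equivalent (a sketch):
# Vectorized equivalent of the week_start_date lambda
uk_bank_holidays['week_start_date'] = (uk_bank_holidays['UK BANK HOLIDAYS']
                                       - pd.to_timedelta(uk_bank_holidays['UK BANK HOLIDAYS'].dt.weekday, unit='D'))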
# Weekly aggregation
uk_bank_holidays_weekly_agg = uk_bank_holidays.groupby(['week_start_date'])['holiday_flag'].sum().reset_index()
uk_bank_holidays_weekly_agg = uk_bank_holidays_weekly_agg[(uk_bank_holidays_weekly_agg.week_start_date >= '2010-08-02') & (uk_bank_holidays_weekly_agg.week_start_date < '2021-04-05')]
uk_bank_holidays_weekly_agg.tail()
week_start_date | holiday_flag | |
---|---|---|
158 | 2020-05-25 | 1 |
159 | 2020-08-31 | 1 |
160 | 2020-12-21 | 1 |
161 | 2020-12-28 | 2 |
162 | 2021-03-29 | 1 |
all_data = pd.merge(tfl_hires, electrical_power, left_on='Day', right_on='SD', how='inner')
all_data = pd.merge(all_data, uk_bank_holidays, left_on='Day', right_on='UK BANK HOLIDAYS', how='left')
all_data['holiday_flag'].fillna(0, inplace=True)
#all_data = all_data[all_data['Day'] >= '2012-01-02']
all_data.head()
Day | Number of Bicycle Hires | SD | Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | UK BANK HOLIDAYS | holiday_flag | week_start_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-07-30 | 6897 | 2010-07-30 | 431920 | 184603 | 122467 | 4417 | -4582 | 4841 | 15 | 0 | 0 | 44898 | 0 | -7411 | 0 | 0 | 781168 | NaT | 0.000 | NaT |
1 | 2010-07-31 | 5564 | 2010-07-31 | 406077 | 111091 | 121983 | 4604 | 726 | 7013 | 0 | 0 | 0 | 46443 | 0 | -4932 | 0 | 0 | 693004 | NaT | 0.000 | NaT |
2 | 2010-08-01 | 4303 | 2010-08-01 | 393442 | 109041 | 126746 | 4839 | -6091 | 4264 | 0 | 0 | 0 | 47760 | 0 | -5775 | 0 | 0 | 674225 | NaT | 0.000 | NaT |
3 | 2010-08-02 | 6642 | 2010-08-02 | 429981 | 190693 | 122512 | 3638 | -2698 | 866 | 0 | 0 | 0 | 45391 | 0 | -7895 | 0 | 0 | 782488 | NaT | 0.000 | NaT |
4 | 2010-08-03 | 7966 | 2010-08-03 | 433955 | 182201 | 125603 | 3594 | -4137 | 5358 | 4 | 0 | 0 | 45788 | 0 | -7593 | 0 | 0 | 784771 | NaT | 0.000 | NaT |
# Map each day to its week start, reusing the function defined above
all_data['week_start_date'] = all_data['Day'].apply(week_start_date)
all_data.head()
Day | Number of Bicycle Hires | SD | Gas | Coal | Nuclear | Hydro | Net Pumped | Wind | OCGT | Oil | Biomass | French Int | Dutch Int | NI Int | Eire Int | Nemo Int | Net Supply | UK BANK HOLIDAYS | holiday_flag | week_start_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-07-30 | 6897 | 2010-07-30 | 431920 | 184603 | 122467 | 4417 | -4582 | 4841 | 15 | 0 | 0 | 44898 | 0 | -7411 | 0 | 0 | 781168 | NaT | 0.000 | 2010-07-26 |
1 | 2010-07-31 | 5564 | 2010-07-31 | 406077 | 111091 | 121983 | 4604 | 726 | 7013 | 0 | 0 | 0 | 46443 | 0 | -4932 | 0 | 0 | 693004 | NaT | 0.000 | 2010-07-26 |
2 | 2010-08-01 | 4303 | 2010-08-01 | 393442 | 109041 | 126746 | 4839 | -6091 | 4264 | 0 | 0 | 0 | 47760 | 0 | -5775 | 0 | 0 | 674225 | NaT | 0.000 | 2010-07-26 |
3 | 2010-08-02 | 6642 | 2010-08-02 | 429981 | 190693 | 122512 | 3638 | -2698 | 866 | 0 | 0 | 0 | 45391 | 0 | -7895 | 0 | 0 | 782488 | NaT | 0.000 | 2010-08-02 |
4 | 2010-08-03 | 7966 | 2010-08-03 | 433955 | 182201 | 125603 | 3594 | -4137 | 5358 | 4 | 0 | 0 | 45788 | 0 | -7593 | 0 | 0 | 784771 | NaT | 0.000 | 2010-08-02 |
# Weekly aggregation
all_data_weekly_agg = all_data.groupby(['week_start_date'])[['Number of Bicycle Hires','Net Supply','Wind','holiday_flag','Oil','Gas','Coal','Nuclear','Hydro','Biomass']].sum().reset_index()
all_data_weekly_agg = all_data_weekly_agg[(all_data_weekly_agg.week_start_date >= '2010-08-02') & (all_data_weekly_agg.week_start_date < '2021-04-05')]
all_data_weekly_agg.tail()
week_start_date | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | Oil | Gas | Coal | Nuclear | Hydro | Biomass | |
---|---|---|---|---|---|---|---|---|---|---|---|
548 | 2021-01-25 | 89862 | 5729779 | 1159736 | 0.000 | 0 | 2347211 | 231920 | 935919 | 53618 | 445087 |
549 | 2021-02-01 | 96027 | 5614122 | 1475041 | 0.000 | 0 | 2113964 | 161547 | 871946 | 39360 | 370132 |
550 | 2021-02-08 | 78706 | 6019601 | 1685621 | 0.000 | 0 | 2200012 | 227474 | 863094 | 31088 | 448276 |
551 | 2021-02-15 | 146239 | 5123511 | 1686256 | 0.000 | 0 | 1473505 | 144520 | 805809 | 96079 | 412448 |
552 | 2021-02-22 | 189834 | 5005908 | 921676 | 0.000 | 0 | 2089236 | 52005 | 720337 | 117181 | 443975 |
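An alternative to grouping on a derived column is resampling directly on the daily index; a sketch, assuming Monday-start weeks labelled by their first day:
# Sketch: weekly sums labelled by the Monday starting each week
cols = ['Number of Bicycle Hires', 'Net Supply', 'Wind', 'holiday_flag']
weekly = (all_data.set_index('Day')[cols]
          .resample('W-MON', closed='left', label='left').sum().reset_index())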
all_data_weekly_agg[1:].describe()
Number of Bicycle Hires | Net Supply | Wind | holiday_flag | Oil | Gas | Coal | Nuclear | Hydro | Biomass | |
---|---|---|---|---|---|---|---|---|---|---|
count | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 | 551.000 |
mean | 181133.209 | 5465831.740 | 530140.341 | 0.156 | 207.817 | 2012965.004 | 1195372.577 | 1171935.414 | 68999.978 | 201628.272 |
std | 55697.534 | 782731.845 | 370262.719 | 0.423 | 1369.211 | 513284.746 | 1053870.308 | 179564.534 | 33362.476 | 144890.191 |
min | 40461.000 | 3563468.000 | 38826.000 | 0.000 | 0.000 | 772756.000 | 0.000 | 542422.000 | 13011.000 | 0.000 |
25% | 142569.000 | 4901843.500 | 242832.500 | 0.000 | 0.000 | 1623279.000 | 166239.500 | 1055058.500 | 40051.500 | 81995.000 |
50% | 173771.000 | 5423785.000 | 442213.000 | 0.000 | 0.000 | 1960075.000 | 1036107.000 | 1199365.000 | 65351.000 | 179465.000 |
75% | 225287.000 | 5990978.500 | 742132.500 | 0.000 | 0.000 | 2388277.500 | 2132900.000 | 1311847.500 | 91900.500 | 333117.500 |
max | 362925.000 | 7662840.000 | 1944082.000 | 2.000 | 23115.000 | 3484628.000 | 3933653.000 | 1546201.000 | 164810.000 | 526591.000 |
fig = px.line(all_data, x='Day', y="Number of Bicycle Hires", title='Daily Number of Bicycle Hires')
fig.show()
fig = px.line(all_data_weekly_agg, x='week_start_date', y="Number of Bicycle Hires", title='Weekly Number of Bicycle Hires')
fig.show()
fig = px.line(all_data, x='SD', y="Net Supply", title='Daily Net Supply Electrical Power')
fig.show()
fig = px.line(all_data_weekly_agg, x='week_start_date', y="Net Supply", title='Weekly Net Supply Electrical Power')
fig.show()
all_data_weekly_agg.iloc[:, 1:].corr()
Number of Bicycle Hires | Net Supply | Wind | holiday_flag | Oil | Gas | Coal | Nuclear | Hydro | Biomass | |
---|---|---|---|---|---|---|---|---|---|---|
Number of Bicycle Hires | 1.000 | -0.732 | -0.073 | -0.204 | -0.194 | -0.150 | -0.483 | -0.147 | -0.413 | 0.248 |
Net Supply | -0.732 | 1.000 | -0.185 | -0.119 | 0.291 | 0.203 | 0.765 | 0.361 | 0.310 | -0.430 |
Wind | -0.073 | -0.185 | 1.000 | -0.025 | -0.137 | -0.113 | -0.525 | -0.410 | 0.447 | 0.660 |
holiday_flag | -0.204 | -0.119 | -0.025 | 1.000 | -0.056 | -0.143 | -0.041 | 0.100 | 0.036 | -0.005 |
Oil | -0.194 | 0.291 | -0.137 | -0.056 | 1.000 | 0.129 | 0.248 | 0.074 | -0.032 | -0.185 |
Gas | -0.150 | 0.203 | -0.113 | -0.143 | 0.129 | 1.000 | -0.256 | -0.050 | -0.132 | 0.111 |
Coal | -0.483 | 0.765 | -0.525 | -0.041 | 0.248 | -0.256 | 1.000 | 0.352 | 0.085 | -0.765 |
Nuclear | -0.147 | 0.361 | -0.410 | 0.100 | 0.074 | -0.050 | 0.352 | 1.000 | -0.057 | -0.365 |
Hydro | -0.413 | 0.310 | 0.447 | 0.036 | -0.032 | -0.132 | 0.085 | -0.057 | 1.000 | 0.159 |
Biomass | 0.248 | -0.430 | 0.660 | -0.005 | -0.185 | 0.111 | -0.765 | -0.365 | 0.159 | 1.000 |
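Net Supply has the strongest (negative) linear association with hires (-0.732), which motivates the reduced feature set kept below. Pearson correlation assumes a linear relationship, so a rank-based check is a quick robustness test (a sketch):
# Sketch: Spearman rank correlation as a robustness check on the Pearson values
all_data_weekly_agg.iloc[:, 1:].corr(method='spearman')[['Number of Bicycle Hires']]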
all_data_weekly_agg = all_data_weekly_agg[['week_start_date','Number of Bicycle Hires','Net Supply','Wind','holiday_flag']]
all_data_weekly_agg = all_data_weekly_agg[all_data_weekly_agg['week_start_date'] >= '2012-01-02']
train_end_date = '2018-06-03'
def setup_train_test(data_df = all_data_weekly_agg, train_end_date = '2018-06-03', feat_name = 'Number of Bicycle Hires'):
    """Split the input data frame data_df into train and test sets.

    Training uses all weeks before train_end_date; the remaining weeks form
    the out-of-sample test set.
    data_df must have week_start_date and feat_name as columns.
    Returns train_df and test_df with columns ds and y, where ds corresponds
    to week_start_date and y corresponds to feat_name.
    """
    sel = data_df.week_start_date < train_end_date
    train_df = data_df.loc[sel].rename(columns={'week_start_date':'ds', feat_name:'y'})
    test_df = data_df.loc[~sel].rename(columns={'week_start_date':'ds', feat_name:'y'})
    return train_df, test_df
def prophet_fittransform(train_df, test_df):
    """Initialise a Prophet model, fit it on train_df, and generate forecasts.

    Inputs train_df and test_df are data frames with columns 'ds' and 'y'.
    Returns fcst_train_df (in-sample fit) and future_df (out-of-sample
    forecast), each with the following columns:
    'ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
    'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
    'yearly', 'yearly_lower', 'yearly_upper', 'multiplicative_terms',
    'multiplicative_terms_lower', 'multiplicative_terms_upper', 'yhat'
    """
    m = Prophet()
    m.fit(train_df)
    fcst_train_df = m.predict(train_df)
    # Forecast four years of weekly points; freq="W" lands on Sundays, so shift one day to Mondays
    future = m.make_future_dataframe(periods = 52*4, freq = "W", include_history = False)
    future.ds = future.ds + pd.DateOffset(days=1)
    future_df = m.predict(future)
    return fcst_train_df, future_df
def prophet_exogenous_fittransform(train_df, test_df):
    """Initialise a Prophet model with exogenous regressors, fit it on
    train_df, and generate forecasts for test_df.

    Inputs train_df and test_df are data frames with columns 'ds', 'y' and
    the regressor columns 'Net Supply', 'Wind' and 'holiday_flag'.
    Returns fcst_train_df and future_df with the same columns as
    prophet_fittransform, plus one component per regressor.
    """
    m = Prophet()
    m.add_regressor('Net Supply')
    m.add_regressor('Wind')
    m.add_regressor('holiday_flag')
    m.fit(train_df)
    fcst_train_df = m.predict(train_df)
    test_df['holiday_flag'].fillna(0, inplace=True)  # future weeks have no recorded holidays yet
    future_df = m.predict(test_df)
    return fcst_train_df, future_df
train_df, test_df = setup_train_test(all_data_weekly_agg, feat_name = 'Number of Bicycle Hires');
fcst_train_df, future_df = prophet_fittransform(train_df, test_df);
fcst_train_df.tail(1)
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
334 | 2018-05-28 | 205674.066 | 204569.499 | 268317.522 | 205674.066 | 205674.066 | 32448.857 | 32448.857 | 32448.857 | 32448.857 | 32448.857 | 32448.857 | 0.000 | 0.000 | 0.000 | 238122.923 |
future_df.head(1)
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-06-04 | 205785.430 | 208586.191 | 272578.220 | 205785.430 | 205785.430 | 33550.908 | 33550.908 | 33550.908 | 33550.908 | 33550.908 | 33550.908 | 0.000 | 0.000 | 0.000 | 239336.338 |
all_df = pd.merge(fcst_train_df[['ds','yhat_lower','yhat','yhat_upper']].append(future_df[['ds','yhat_lower','yhat','yhat_upper']]), train_df.append(test_df), on='ds', how='outer', indicator=True).sort_values(by = 'ds')
all_df[all_df['_merge'] == 'both'].tail()
ds | yhat_lower | yhat | yhat_upper | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|
473 | 2021-01-25 | 142140.137 | 174821.595 | 204773.968 | 89862.000 | 5729779.000 | 1159736.000 | 0.000 | both |
474 | 2021-02-01 | 134033.487 | 167645.156 | 200903.099 | 96027.000 | 5614122.000 | 1475041.000 | 0.000 | both |
475 | 2021-02-08 | 132889.431 | 164903.434 | 196646.538 | 78706.000 | 6019601.000 | 1685621.000 | 0.000 | both |
476 | 2021-02-15 | 138297.982 | 169944.940 | 201417.275 | 146239.000 | 5123511.000 | 1686256.000 | 0.000 | both |
477 | 2021-02-22 | 145912.575 | 178920.109 | 211556.887 | 189834.000 | 5005908.000 | 921676.000 | 0.000 | both |
all_df.tail(1)
ds | yhat_lower | yhat | yhat_upper | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|
542 | 2022-05-23 | 230947.231 | 261068.561 | 293231.810 | NaN | NaN | NaN | NaN | left_only |
all_df.head(1)
ds | yhat_lower | yhat | yhat_upper | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|
0 | 2012-01-02 | 58395.670 | 91833.433 | 124634.229 | 88068.000 | 6449422.000 | 403325.000 | 1.000 | both |
prophet_predictions = all_df[['ds','yhat']].rename(columns={'yhat':'prophet_forecast'})
prophet_predictions.head()
ds | prophet_forecast | |
---|---|---|
0 | 2012-01-02 | 91833.433 |
1 | 2012-01-09 | 110588.773 |
2 | 2012-01-16 | 125294.257 |
3 | 2012-01-23 | 127627.864 |
4 | 2012-01-30 | 121147.356 |
# Hide in-sample predictions so the plot shows only the out-of-sample forecast
all_df.loc[all_df['ds'] < train_end_date, ['yhat', 'yhat_lower', 'yhat_upper']] = np.nan
all_df_melt = pd.melt(all_df[['ds','y','yhat']], id_vars=['ds'], value_vars=all_df[['ds','y','yhat']].columns[1:3])
all_df_melt[(all_df_melt['variable'] == 'y') & (all_df_melt['value'] > 0)].tail()
ds | variable | value | |
---|---|---|---|
473 | 2021-01-25 | y | 89862.000 |
474 | 2021-02-01 | y | 96027.000 |
475 | 2021-02-08 | y | 78706.000 |
476 | 2021-02-15 | y | 146239.000 |
477 | 2021-02-22 | y | 189834.000 |
fig = px.line(all_df_melt, x='ds', y="value", color='variable', title='Actuals vs Forecast')
fig.show()
# To score the exogenous model out of sample we first need future values of the
# regressors themselves, so Net Supply and Wind are each forecast with Prophet.
train_df, test_df = setup_train_test(all_data_weekly_agg, feat_name = 'Net Supply');
fcst_train_df, future_df = prophet_fittransform(train_df, test_df);
net_supply_df = fcst_train_df.append(future_df)[['ds','yhat']].rename(columns={'yhat':'Net Supply Forecast'})
net_supply_df.head()
ds | Net Supply Forecast | |
---|---|---|
0 | 2012-01-02 | 6426779.663 |
1 | 2012-01-09 | 6791505.989 |
2 | 2012-01-16 | 7053955.255 |
3 | 2012-01-23 | 7082898.467 |
4 | 2012-01-30 | 6961743.168 |
train_df, test_df = setup_train_test(all_data_weekly_agg, feat_name = 'Wind');
fcst_train_df, future_df = prophet_fittransform(train_df, test_df);
wind_df = fcst_train_df.append(future_df)[['ds','yhat']].rename(columns={'yhat':'Wind Forecast'})
wind_df.head()
ds | Wind Forecast | |
---|---|---|
0 | 2012-01-02 | 439172.815 |
1 | 2012-01-09 | 391057.719 |
2 | 2012-01-16 | 374984.594 |
3 | 2012-01-23 | 391115.221 |
4 | 2012-01-30 | 412263.634 |
wind_df.tail()
ds | Wind Forecast | |
---|---|---|
203 | 2022-04-25 | 1226838.891 |
204 | 2022-05-02 | 1220680.546 |
205 | 2022-05-09 | 1190385.426 |
206 | 2022-05-16 | 1144714.717 |
207 | 2022-05-23 | 1105112.537 |
all_data_weekly_agg.tail()
week_start_date | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | |
---|---|---|---|---|---|
548 | 2021-01-25 | 89862 | 5729779 | 1159736 | 0.000 |
549 | 2021-02-01 | 96027 | 5614122 | 1475041 | 0.000 |
550 | 2021-02-08 | 78706 | 6019601 | 1685621 | 0.000 |
551 | 2021-02-15 | 146239 | 5123511 | 1686256 | 0.000 |
552 | 2021-02-22 | 189834 | 5005908 | 921676 | 0.000 |
all_data_weekly_agg_exog = all_df[['ds']].merge(wind_df,on='ds',how='left')\
                                         .merge(net_supply_df,on='ds',how='left')\
                                         .merge(all_data_weekly_agg.rename(columns={'week_start_date':'ds'}),on='ds',how='left')
all_data_weekly_agg_exog.tail()
ds | Wind Forecast | Net Supply Forecast | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | |
---|---|---|---|---|---|---|---|
538 | 2022-04-25 | 1226838.891 | 4707844.244 | NaN | NaN | NaN | NaN |
539 | 2022-05-02 | 1220680.546 | 4651678.701 | NaN | NaN | NaN | NaN |
540 | 2022-05-09 | 1190385.426 | 4527937.751 | NaN | NaN | NaN | NaN |
541 | 2022-05-16 | 1144714.717 | 4392453.905 | NaN | NaN | NaN | NaN |
542 | 2022-05-23 | 1105112.537 | 4325390.245 | NaN | NaN | NaN | NaN |
# Beyond the training cut-off, substitute forecast values for the actual regressors
all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Net Supply'] = all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Net Supply Forecast']
all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Wind'] = all_data_weekly_agg_exog.loc[all_data_weekly_agg_exog['ds'] >= train_end_date, 'Wind Forecast']
all_data_weekly_agg_exog.tail()
ds | Wind Forecast | Net Supply Forecast | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | |
---|---|---|---|---|---|---|---|
538 | 2022-04-25 | 1226838.891 | 4707844.244 | NaN | 4707844.244 | 1226838.891 | NaN |
539 | 2022-05-02 | 1220680.546 | 4651678.701 | NaN | 4651678.701 | 1220680.546 | NaN |
540 | 2022-05-09 | 1190385.426 | 4527937.751 | NaN | 4527937.751 | 1190385.426 | NaN |
541 | 2022-05-16 | 1144714.717 | 4392453.905 | NaN | 4392453.905 | 1144714.717 | NaN |
542 | 2022-05-23 | 1105112.537 | 4325390.245 | NaN | 4325390.245 | 1105112.537 | NaN |
train_df.tail()
ds | Number of Bicycle Hires | Net Supply | y | holiday_flag | |
---|---|---|---|---|---|
405 | 2018-04-30 | 227037 | 4766576 | 693956 | 0.000 |
406 | 2018-05-07 | 250491 | 4466088 | 421530 | 1.000 |
407 | 2018-05-14 | 266541 | 4483364 | 460511 | 0.000 |
408 | 2018-05-21 | 252114 | 4443748 | 575373 | 0.000 |
409 | 2018-05-28 | 249072 | 4545699 | 209128 | 1.000 |
train_df, test_df = setup_train_test(all_data_weekly_agg_exog.rename(columns={'ds':'week_start_date'}), feat_name = 'Number of Bicycle Hires');
fcst_train_df, future_df = prophet_exogenous_fittransform(train_df, test_df);
all_df = pd.merge(fcst_train_df[['ds','yhat_lower','yhat','yhat_upper']].append(future_df[['ds','yhat_lower','yhat','yhat_upper']]), train_df.append(test_df), on='ds', how='outer', indicator=True).sort_values(by = 'ds')
all_df[all_df['_merge'] == 'both'].tail()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | |
---|---|---|---|---|---|---|---|---|---|---|---|
538 | 2022-04-25 | 188495.295 | 216549.781 | 246400.555 | 1226838.891 | 4707844.244 | NaN | 4707844.244 | 1226838.891 | 0.000 | both |
539 | 2022-05-02 | 197412.133 | 225206.220 | 253193.347 | 1220680.546 | 4651678.701 | NaN | 4651678.701 | 1220680.546 | 0.000 | both |
540 | 2022-05-09 | 205919.757 | 234841.964 | 263557.229 | 1190385.426 | 4527937.751 | NaN | 4527937.751 | 1190385.426 | 0.000 | both |
541 | 2022-05-16 | 213556.027 | 242159.073 | 269296.515 | 1144714.717 | 4392453.905 | NaN | 4392453.905 | 1144714.717 | 0.000 | both |
542 | 2022-05-23 | 217361.077 | 245212.330 | 271235.146 | 1105112.537 | 4325390.245 | NaN | 4325390.245 | 1105112.537 | 0.000 | both |
# Hide in-sample predictions so the plot shows only the out-of-sample forecast
all_df.loc[all_df['ds'] < train_end_date, ['yhat', 'yhat_lower', 'yhat_upper']] = np.nan
all_df_melt = pd.melt(all_df[['ds','y','yhat']], id_vars=['ds'], value_vars=all_df[['ds','y','yhat']].columns[1:3])
all_df_melt[(all_df_melt['variable'] == 'y') & (all_df_melt['value'] > 0)].tail()
ds | variable | value | |
---|---|---|---|
473 | 2021-01-25 | y | 89862.000 |
474 | 2021-02-01 | y | 96027.000 |
475 | 2021-02-08 | y | 78706.000 |
476 | 2021-02-15 | y | 146239.000 |
477 | 2021-02-22 | y | 189834.000 |
fig = px.line(all_df_melt, x='ds', y="value", color='variable', title='Actuals vs Forecast')
fig.show()
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(12).mean()
    rolstd = timeseries.rolling(12).std()
    # Plot rolling statistics
    fig = plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    # Perform the augmented Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
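The ADF test's null hypothesis is a unit root; the KPSS test has the opposite null (stationarity), so agreement between the two strengthens the conclusion. A complementary sketch:
# Sketch: KPSS test as a complement to the ADF test above
from statsmodels.tsa.stattools import kpss
def test_kpss(timeseries):
    stat, pvalue, lags, crit = kpss(timeseries.dropna(), regression='c', nlags='auto')
    print('KPSS statistic: %.3f, p-value: %.3f' % (stat, pvalue))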
# 52-week seasonal difference to remove yearly seasonality before testing
all_data_weekly_agg['seasonal_difference'] = all_data_weekly_agg['Number of Bicycle Hires'] - all_data_weekly_agg['Number of Bicycle Hires'].shift(52)
all_data_weekly_agg.head()
week_start_date | Number of Bicycle Hires | Net Supply | Wind | holiday_flag | seasonal_difference | |
---|---|---|---|---|---|---|
75 | 2012-01-02 | 88068 | 6449422 | 403325 | 1.000 | NaN |
76 | 2012-01-09 | 127540 | 6720281 | 228344 | 0.000 | NaN |
77 | 2012-01-16 | 117606 | 6829803 | 350157 | 0.000 | NaN |
78 | 2012-01-23 | 118809 | 6818013 | 185454 | 0.000 | NaN |
79 | 2012-01-30 | 100319 | 7327197 | 179558 | 0.000 | NaN |
test_stationarity(all_data_weekly_agg['seasonal_difference'].iloc[53:])
Results of Dickey-Fuller Test:
Test Statistic                  -5.656
p-value                          0.000
#Lags Used                       4.000
Number of Observations Used    420.000
Critical Value (1%)             -3.446
Critical Value (5%)             -2.868
Critical Value (10%)            -2.570
dtype: float64
fig = plt.figure(figsize=(18,16))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(all_data_weekly_agg.iloc[53:420]['seasonal_difference'], lags=170, ax=ax1)
fig = plt.figure(figsize=(18,16))
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(all_data_weekly_agg.iloc[53:420]['seasonal_difference'], lags=170, ax=ax2)
# Seasonal period must be an integer; 52.25 would be silently truncated to 52
mod = sm.tsa.statespace.SARIMAX(all_df.rename(columns={'y':'Number of Bicycle Hires'}).iloc[:335]['Number of Bicycle Hires'], trend='c', order=(2,0,0), seasonal_order=(3,1,0,52))
results = mod.fit()
print(results.summary())
                                     SARIMAX Results
==========================================================================================
Dep. Variable:        Number of Bicycle Hires   No. Observations:                      335
Model:          SARIMAX(2, 0, 0)x(3, 1, 0, 52)   Log Likelihood                 -3297.077
Date:                         Sun, 12 Sep 2021   AIC                             6608.153
Time:                                 20:01:22   BIC                             6633.671
Sample:                                      0   HQIC                            6618.385
                                         - 335
Covariance Type:                           opg
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept    786.2430   2986.142      0.263      0.792   -5066.488    6638.973
ar.L1          0.5861      0.079      7.412      0.000       0.431       0.741
ar.L2          0.2194      0.078      2.807      0.005       0.066       0.373
ar.S.L52      -0.5744      0.079     -7.230      0.000      -0.730      -0.419
ar.S.L104     -0.1669      0.102     -1.632      0.103      -0.367       0.034
ar.S.L156     -0.0914      0.076     -1.202      0.229      -0.240       0.058
sigma2      1.018e+09      0.028    3.7e+10      0.000    1.02e+09    1.02e+09
===================================================================================
Ljung-Box (L1) (Q):                   2.31   Jarque-Bera (JB):                 3.78
Prob(Q):                              0.13   Prob(JB):                         0.15
Heteroskedasticity (H):               0.82   Skew:                             0.17
Prob(H) (two-sided):                  0.35   Kurtosis:                         3.46
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.12e+26. Standard errors may be unstable.
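The (2,0,0)x(3,1,0,52) order above was chosen manually from the ACF/PACF plots; a small AIC grid search over the non-seasonal terms is one way to sanity-check that choice. A sketch, with the seasonal part held fixed:
# Sketch: AIC grid search over non-seasonal (p, q), seasonal part held fixed
import itertools
series = all_df.rename(columns={'y':'Number of Bicycle Hires'}).iloc[:335]['Number of Bicycle Hires']
best_aic, best_order = np.inf, None
for p, q in itertools.product(range(3), range(3)):
    try:
        res = sm.tsa.statespace.SARIMAX(series, trend='c', order=(p, 0, q),
                                        seasonal_order=(3, 1, 0, 52)).fit(disp=False)
        if res.aic < best_aic:
            best_aic, best_order = res.aic, (p, 0, q)
    except Exception:
        continue
print('Best non-seasonal order by AIC:', best_order)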
len(all_df)
543
all_df['sarima_forecast'] = results.predict(start = 336, end= len(all_df), dynamic= True)
all_df.head()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | sarima_forecast | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012-01-02 | NaN | NaN | NaN | 439172.815 | 6426779.663 | 88068.000 | 6449422.000 | 403325.000 | 1.000 | both | NaN |
1 | 2012-01-09 | NaN | NaN | NaN | 391057.719 | 6791505.989 | 127540.000 | 6720281.000 | 228344.000 | 0.000 | both | NaN |
2 | 2012-01-16 | NaN | NaN | NaN | 374984.594 | 7053955.255 | 117606.000 | 6829803.000 | 350157.000 | 0.000 | both | NaN |
3 | 2012-01-23 | NaN | NaN | NaN | 391115.221 | 7082898.467 | 118809.000 | 6818013.000 | 185454.000 | 0.000 | both | NaN |
4 | 2012-01-30 | NaN | NaN | NaN | 412263.634 | 6961743.168 | 100319.000 | 7327197.000 | 179558.000 | 0.000 | both | NaN |
subset_all_df = all_df[['ds','y','sarima_forecast']].rename(columns={'y':'Number of Bicycle Hires'})
all_data_weekly_agg_melt = pd.melt(subset_all_df, id_vars=['ds'], value_vars=subset_all_df.columns[1:3])
fig = px.line(all_data_weekly_agg_melt, x='ds', y="value", color='variable', title='Weekly Number of Bicycle Hires')
fig.show()
exogenous = sm.add_constant(all_df.iloc[:335][['Net Supply','Wind','holiday_flag']])
mod = sm.tsa.statespace.SARIMAX(all_df.rename(columns={'y':'Number of Bicycle Hires'}).iloc[:335]['Number of Bicycle Hires'], order=(2,0,0), seasonal_order=(3,1,0,52), exog = exogenous)
results = mod.fit()
print(results.summary())
                                     SARIMAX Results
==========================================================================================
Dep. Variable:        Number of Bicycle Hires   No. Observations:                      335
Model:          SARIMAX(2, 0, 0)x(3, 1, 0, 52)   Log Likelihood                 -3281.621
Date:                         Sun, 12 Sep 2021   AIC                             6583.242
Time:                                 20:06:12   BIC                             6619.696
Sample:                                      0   HQIC                            6597.859
                                         - 335
Covariance Type:                           opg
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const         1.285e-05   1.13e+04   1.14e-09      1.000   -2.21e+04    2.21e+04
Net Supply       0.0028      0.003      0.915      0.360      -0.003       0.009
Wind            -0.0152      0.009     -1.704      0.088      -0.033       0.002
holiday_flag -3.194e+04   3636.525     -8.782      0.000   -3.91e+04   -2.48e+04
ar.L1            0.6079      0.055     11.127      0.000       0.501       0.715
ar.L2            0.2535      0.058      4.379      0.000       0.140       0.367
ar.S.L52        -0.6432      0.062    -10.374      0.000      -0.765      -0.522
ar.S.L104       -0.3208      0.079     -4.049      0.000      -0.476      -0.166
ar.S.L156       -0.2386      0.072     -3.294      0.001      -0.381      -0.097
sigma2        6.929e+08      0.134   5.17e+09      0.000    6.93e+08    6.93e+08
===================================================================================
Ljung-Box (L1) (Q):                   2.17   Jarque-Bera (JB):                 5.75
Prob(Q):                              0.14   Prob(JB):                         0.06
Heteroskedasticity (H):               0.83   Skew:                             0.09
Prob(H) (two-sided):                  0.36   Kurtosis:                         3.67
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.57e+24. Standard errors may be unstable.
exogenous = sm.add_constant(all_df.iloc[336:][['Net Supply','Wind','holiday_flag']])
all_df['sarimax_forecast'] = results.predict(start = 336, end= len(all_df) - 2, dynamic= True, exog = exogenous)
all_df.head()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | sarima_forecast | sarimax_forecast | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012-01-02 | NaN | NaN | NaN | 439172.815 | 6426779.663 | 88068.000 | 6449422.000 | 403325.000 | 1.000 | both | NaN | NaN |
1 | 2012-01-09 | NaN | NaN | NaN | 391057.719 | 6791505.989 | 127540.000 | 6720281.000 | 228344.000 | 0.000 | both | NaN | NaN |
2 | 2012-01-16 | NaN | NaN | NaN | 374984.594 | 7053955.255 | 117606.000 | 6829803.000 | 350157.000 | 0.000 | both | NaN | NaN |
3 | 2012-01-23 | NaN | NaN | NaN | 391115.221 | 7082898.467 | 118809.000 | 6818013.000 | 185454.000 | 0.000 | both | NaN | NaN |
4 | 2012-01-30 | NaN | NaN | NaN | 412263.634 | 6961743.168 | 100319.000 | 7327197.000 | 179558.000 | 0.000 | both | NaN | NaN |
subset_all_df = all_df[['ds','y','sarimax_forecast']].rename(columns={'y':'Number of Bicycle Hires'})
all_data_weekly_agg_melt = pd.melt(subset_all_df, id_vars=['ds'], value_vars=subset_all_df.columns[1:3])
fig = px.line(all_data_weekly_agg_melt, x='ds', y="value", color='variable', title='Weekly Number of Bicycle Hires')
fig.show()
model_predictions = all_df[(all_df['ds'] > '2018-06-04') & (all_df['ds'] < '2021-02-22')]
model_predictions = pd.merge(model_predictions, prophet_predictions, how='left', on='ds')
model_predictions.head()
ds | yhat_lower | yhat | yhat_upper | Wind Forecast | Net Supply Forecast | y | Net Supply | Wind | holiday_flag | _merge | sarima_forecast | sarimax_forecast | prophet_forecast | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-06-11 | 215448.972 | 243150.015 | 270379.612 | 596148.738 | 4573046.836 | 255914.000 | 4573046.836 | 596148.738 | 0.000 | both | 268538.025 | 270662.818 | 243945.870 |
1 | 2018-06-18 | 217054.729 | 246460.565 | 272819.841 | 610299.236 | 4545541.651 | 283816.000 | 4545541.651 | 610299.236 | 0.000 | both | 255493.970 | 253671.885 | 251855.594 |
2 | 2018-06-25 | 224162.670 | 251311.331 | 280714.036 | 599463.769 | 4536237.346 | 297449.000 | 4536237.346 | 599463.769 | 0.000 | both | 256897.206 | 260177.879 | 260143.583 |
3 | 2018-07-02 | 229373.500 | 256531.273 | 282949.321 | 575149.318 | 4567812.563 | 292421.000 | 4567812.563 | 575149.318 | 0.000 | both | 288777.506 | 282597.913 | 265920.657 |
4 | 2018-07-09 | 233957.934 | 261807.400 | 289277.200 | 562519.209 | 4595342.697 | 285168.000 | 4595342.697 | 562519.209 | 0.000 | both | 263186.600 | 265599.456 | 269181.038 |
y = model_predictions['y']
yhat = model_predictions['prophet_forecast']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of Prophet without exogenous features:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = pd.DataFrame(columns = ['Model Name', 'MAE', 'MAPE', 'RMSE', 'R2'])
# append rows to an empty DataFrame
all_model_results = all_model_results.append({'Model Name' : 'Prophet model without exogenous features', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of Prophet without exogenous features:

MAE: 26477.698869256554
MAPE: 0.17219051305630884
MSE: 1273118983.354026
RMSE: 35680.79291935685
R-Squared: 0.627790672062585
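The same metric block repeats for each model below, so a small reusable helper would remove the duplication. A sketch (the name evaluate_model is hypothetical; it mirrors the block above):
# Sketch: reusable evaluation helper mirroring the metric block above
def evaluate_model(name, y, yhat, results_df):
    mae = metrics.mean_absolute_error(y, yhat)
    mape = metrics.mean_absolute_percentage_error(y, yhat)
    rmse = np.sqrt(metrics.mean_squared_error(y, yhat))
    r2 = metrics.r2_score(y, yhat)
    print(name, '- MAE:', mae, 'MAPE:', mape, 'RMSE:', rmse, 'R2:', r2)
    return results_df.append({'Model Name': name, 'MAE': mae, 'MAPE': mape,
                              'RMSE': rmse, 'R2': r2}, ignore_index=True)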
y = model_predictions['y']
yhat = model_predictions['yhat']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of Prophet with exogenous features:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = all_model_results.append({'Model Name' : 'Prophet model with exogenous features', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of Prophet with exogenous features:

MAE: 23066.04699962848
MAPE: 0.14170796846144773
MSE: 1094680829.524918
RMSE: 33085.96121506701
R-Squared: 0.6799588874324884
y = model_predictions['y']
yhat = model_predictions['sarima_forecast']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of SARIMA:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = all_model_results.append({'Model Name' : 'SARIMA', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of SARIMA:

MAE: 23898.16807298402
MAPE: 0.15087089273006088
MSE: 1134326759.5986726
RMSE: 33679.76780796852
R-Squared: 0.6683679951583588
y = model_predictions['y']
yhat = model_predictions['sarimax_forecast']
mae = metrics.mean_absolute_error(y, yhat)
mape = metrics.mean_absolute_percentage_error(y, yhat)
mse = metrics.mean_squared_error(y, yhat)
rmse = np.sqrt(mse) #mse**(0.5)
r2 = metrics.r2_score(y,yhat)
print("Results of SARIMAX:\n")
print("MAE:",mae)
print("MAPE:",mape)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)
all_model_results = all_model_results.append({'Model Name' : 'SARIMAX', 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2},
ignore_index = True)
Results of SARIMAX:

MAE: 25249.51330256687
MAPE: 0.1517589216379606
MSE: 1123821570.4851322
RMSE: 33523.448069748614
R-Squared: 0.6714392944091996
all_model_results.head()
Model Name | MAE | MAPE | RMSE | R2 | |
---|---|---|---|---|---|
0 | Prophet model without exogenous features | 26477.699 | 0.172 | 35680.793 | 0.628 |
1 | Prophet model with exogenous features | 23066.047 | 0.142 | 33085.961 | 0.680 |
2 | SARIMA | 23898.168 | 0.151 | 33679.768 | 0.668 |
3 | SARIMAX | 25249.513 | 0.152 | 33523.448 | 0.671 |
cv_dates = ['2017-06-05','2018-06-03','2018-12-31','2019-06-03','2019-12-30']
cv_results = pd.DataFrame(columns = ['Date', 'MAE', 'MAPE', 'RMSE', 'R2'])
for date in cv_dates:
    train_df, test_df = setup_train_test(all_data_weekly_agg_exog.rename(columns={'ds':'week_start_date'}), train_end_date = date, feat_name = 'Number of Bicycle Hires')
    fcst_train_df, future_df = prophet_exogenous_fittransform(train_df, test_df)
    all_df = pd.merge(fcst_train_df[['ds','yhat_lower','yhat','yhat_upper']].append(future_df[['ds','yhat_lower','yhat','yhat_upper']]), train_df.append(test_df), on='ds', how='outer', indicator=True).sort_values(by = 'ds')
    # Evaluate each fold on the 52 weeks after its training cut-off
    model_predictions = all_df[(all_df['ds'] > date) & (all_df['ds'] < pd.to_datetime(date) + pd.DateOffset(days=365))]
    y = model_predictions['y']
    yhat = model_predictions['yhat']
    # Compute the metrics before appending, so each row reflects its own fold
    mae = metrics.mean_absolute_error(y, yhat)
    mape = metrics.mean_absolute_percentage_error(y, yhat)
    mse = metrics.mean_squared_error(y, yhat)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y, yhat)
    cv_results = cv_results.append({'Date' : date, 'MAE' : mae, 'MAPE' : mape, 'RMSE' : rmse, 'R2' : r2}, ignore_index = True)
cv_results
Date | MAE | MAPE | RMSE | R2 | |
---|---|---|---|---|---|
0 | 2017-06-05 | 25249.513 | 0.152 | 33523.448 | 0.671 |
1 | 2018-06-03 | 19145.478 | 0.097 | 24375.423 | 0.786 |
2 | 2018-12-31 | 18403.028 | 0.089 | 23517.073 | 0.771 |
3 | 2019-06-03 | 19643.959 | 0.102 | 24980.060 | 0.663 |
4 | 2019-12-30 | 29203.078 | 0.191 | 42366.462 | 0.415 |
print("Mean of 5-fold rolling window cross-validation results:")
cv_results['Model Name'] = 'Prophet model with exogenous features'
cv_results.drop(['Date'], axis=1).groupby(['Model Name']).mean()
Mean of 5-fold rolling window cross-validation results:
MAE | MAPE | RMSE | R2 | |
---|---|---|---|---|
Model Name | ||||
Prophet model with exogenous features | 22329.011 | 0.126 | 29752.493 | 0.661 |
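fbprophet also ships its own simulated-historical-forecast utilities, which automate the loop above. A sketch, assuming the fitted model object m were exposed by the fit functions (it is currently local to them); the initial/period/horizon values are illustrative:
# Sketch: fbprophet's built-in cross-validation as an alternative to the manual loop
from fbprophet.diagnostics import cross_validation, performance_metrics
cv_df = cross_validation(m, initial='1460 days', period='180 days', horizon='365 days')
print(performance_metrics(cv_df)[['horizon', 'mape', 'rmse']].tail())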