import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn.preprocessing as pre
import sklearn.metrics as metric
import sklearn.linear_model as lm
import sklearn as skl
import statsmodels.tsa as tsa
import seaborn as sns
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', 90)
raw_data = pd.read_csv("Test_data.csv")
raw_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   date      92 non-null     object
 1   sales     92 non-null     float64
 2   m_tv      92 non-null     float64
 3   m_rd      92 non-null     float64
 4   m_online  92 non-null     float64
 5   price     92 non-null     float64
 6   promo     92 non-null     int64
 7   holidays  30 non-null     float64
dtypes: float64(6), int64(1), object(1)
memory usage: 5.9+ KB
raw_data.describe()
  | sales | m_tv | m_rd | m_online | price | promo | holidays |
---|---|---|---|---|---|---|---|
count | 92.000000 | 92.000000 | 92.000000 | 92.000000 | 92.000000 | 92.000000 | 30.0 |
mean | 5.184400 | 0.560531 | 0.418528 | 0.209329 | -0.001423 | 10.532609 | 1.0 |
std | 0.108638 | 0.064835 | 0.038475 | 0.019127 | 0.060020 | 4.031336 | 0.0 |
min | 4.857496 | 0.366586 | 0.294887 | 0.142494 | -0.140803 | 2.000000 | 1.0 |
25% | 5.116671 | 0.514974 | 0.396311 | 0.198650 | -0.040600 | 8.000000 | 1.0 |
50% | 5.201595 | 0.578128 | 0.426288 | 0.212973 | 0.001642 | 10.000000 | 1.0 |
75% | 5.265476 | 0.611251 | 0.442458 | 0.224060 | 0.035743 | 13.000000 | 1.0 |
max | 5.385093 | 0.656982 | 0.484950 | 0.236512 | 0.151495 | 22.000000 | 1.0 |
raw_data.head(5)
  | date | sales | m_tv | m_rd | m_online | price | promo | holidays |
---|---|---|---|---|---|---|---|---|
0 | 3/26/2017 | 5.206100 | 0.596845 | 0.471012 | 0.227368 | -0.128877 | 9 | NaN |
1 | 4/2/2017 | 5.385093 | 0.618830 | 0.482700 | 0.226155 | -0.078913 | 15 | NaN |
2 | 4/9/2017 | 5.230386 | 0.606462 | 0.468500 | 0.210219 | -0.016652 | 9 | NaN |
3 | 4/16/2017 | 5.078445 | 0.547290 | 0.452152 | 0.197514 | 0.054442 | 14 | 1.0 |
4 | 4/23/2017 | 5.216146 | 0.469708 | 0.434500 | 0.200274 | 0.121796 | 8 | NaN |
It looks like Sundays are marked as holidays as well.
# Parse the date column and use it as a weekly DatetimeIndex
raw_data.index = pd.to_datetime(raw_data.date,format="%m/%d/%Y")
raw_data.index.freq = "W"
raw_data = raw_data.drop("date",axis=1)
# Visualize missingness; only the holidays flag has gaps, so fill them with 0 (no holiday)
sns.heatmap(raw_data.isnull(),yticklabels=False,cbar=False,cmap="viridis");
raw_data = raw_data.fillna(0)
EDA - Exploratory Data Analysis¶
Plot the sales series¶
ax = raw_data.sales.plot(figsize=(20,6))
for x in raw_data.query('holidays==1').index:
ax.axvline(x=x, color='k', alpha = 0.3);
ax.autoscale()
Vertical lines mark the holidays.
Explore seasonality¶
# Decompose the weekly sales series ('period' replaces the deprecated 'freq' keyword)
season = tsa.seasonal.seasonal_decompose(raw_data.sales, period=7)
fig = season.plot()
fig.set_figheight(8)
fig.set_figwidth(12)
The seasonal component explains only a very small portion of the data (about 0.025m of sales), so we can assume there is no strong seasonality in the sales series.
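To back this up, we can compare the amplitude of the seasonal component with the spread of the series itself. A minimal sketch reusing the `season` decomposition from above:

# Peak-to-trough amplitude of the seasonal component vs. the overall sales range
seasonal_range = season.seasonal.max() - season.seasonal.min()
sales_range = raw_data.sales.max() - raw_data.sales.min()
print("Seasonal peak-to-trough: %.4f" % seasonal_range)
print("Share of the sales range: %.1f%%" % (100 * seasonal_range / sales_range))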
# Augmented Dickey-Fuller test: is the sales series stationary?
result = tsa.stattools.adfuller(raw_data.sales,autolag='AIC')
result[1]  # p-value
3.942673135297742e-05
The p-value is well below 0.01, so the result is significant at the 99% confidence level: the sales series shows no unit root and can be treated as stationary.
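For reference, the full ADF result also carries the test statistic and critical values; a quick sketch unpacking the `result` tuple returned above:

# adfuller returns (statistic, p-value, lags used, n obs, critical values, icbest)
adf_stat, p_value, used_lag, n_obs, crit_values, _ = result
print("ADF statistic: %.4f" % adf_stat)
print("p-value: %.2e" % p_value)
for level, value in crit_values.items():
    print("Critical value (%s): %.4f" % (level, value))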
fig, ax = plt.subplots(2,3,figsize = (12,8))
# fig.set_figheight(8)
# fig.set_figwidth(17)
sns.distplot(raw_data.sales,ax=ax[0][0])
sns.distplot(raw_data.m_tv,ax=ax[0][1])
sns.distplot(raw_data.promo,ax=ax[0][2])
sns.distplot(raw_data.m_rd,ax=ax[1][0])
sns.distplot(raw_data.m_online,ax=ax[1][1])
sns.distplot(raw_data.price,ax=ax[1][2])
The feature and sales distributions look roughly normal but are slightly skewed to the left.
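We can put a number on that impression with the sample skewness (negative values mean a longer left tail); a quick check:

# Negative skewness indicates a left-skewed distribution
print(raw_data.drop("holidays", axis=1).skew())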
Check whether there are any outliers¶
sns.boxplot(x="variable",y="value",data=pd.melt(raw_data.drop(["promo","holidays"],axis=1)))
fig, ax = plt.subplots(2,3,figsize = (12,8))
# fig.set_figheight(8)
# fig.set_figwidth(17)
sns.boxplot(raw_data.sales,ax=ax[0][0] ,orient="v")
sns.boxplot(raw_data.m_tv,ax=ax[0][1] ,orient="v" )
sns.boxplot(raw_data.promo,ax=ax[0][2] ,orient="v" )
sns.boxplot(raw_data.m_rd,ax=ax[1][0] ,orient="v" )
sns.boxplot(raw_data.m_online,ax=ax[1][1] ,orient="v" )
sns.boxplot(raw_data.price,ax=ax[1][2] ,orient="v")
There aren't many outliers; let's keep them for now.
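To quantify this, we can count points outside the usual 1.5×IQR whiskers that the boxplots draw; a minimal sketch:

# Count observations outside Q1 - 1.5*IQR and Q3 + 1.5*IQR for each column
cols = raw_data.drop(["promo", "holidays"], axis=1)
q1, q3 = cols.quantile(0.25), cols.quantile(0.75)
iqr = q3 - q1
outlier_counts = ((cols < q1 - 1.5 * iqr) | (cols > q3 + 1.5 * iqr)).sum()
print(outlier_counts)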
Train Test Split¶
## Train Test Split
train = raw_data[:-20]
test = raw_data[-20:]
x_train = train.drop("sales",axis=1)
y_train = train.sales
x_test = test.drop("sales",axis=1)
y_test = test.sales
Start with a simple model using all available features¶
model = skl.linear_model.LinearRegression().fit(x_train,y_train)
pd.DataFrame(model.coef_,x_train.columns,columns=['Coefficient'])
prediction = model.predict(x_train)
print ("R Square is %.4f"%metric.r2_score(y_train,prediction))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_train,prediction)))
R Square is 0.6440
RMSE is 0.0644
prediction_test = model.predict(x_test)
print ("R Square is %.4f"%metric.r2_score(y_test,prediction_test))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_test,prediction_test)))
R Square is 0.6680
RMSE is 0.0600
sns.residplot(y_train,prediction-y_train);
The residual plot shows some small clusters, which suggests there is information in the series that the current features do not explain.
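One quick way to check whether that leftover structure is serial correlation is the Durbin-Watson statistic (values near 2 indicate little first-order autocorrelation); a small sketch on the in-sample residuals:

from statsmodels.stats.stattools import durbin_watson

# Durbin-Watson near 2 suggests the residuals are not strongly autocorrelated
residuals = y_train - prediction
print("Durbin-Watson: %.3f" % durbin_watson(residuals))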
# Look at whether all features are significant or not
ip = sm.add_constant(x_train)
dp = y_train
sm.OLS(dp,ip).fit().summary()
Dep. Variable: | sales | R-squared: | 0.644 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.611 |
Method: | Least Squares | F-statistic: | 19.60 |
Date: | Sun, 20 Dec 2020 | Prob (F-statistic): | 6.54e-13 |
Time: | 23:00:11 | Log-Likelihood: | 95.323 |
No. Observations: | 72 | AIC: | -176.6 |
Df Residuals: | 65 | BIC: | -160.7 |
Df Model: | 6 | |
Covariance Type: | nonrobust | |
  | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 4.2199 | 0.122 | 34.455 | 0.000 | 3.975 | 4.464 |
m_tv | 0.9603 | 0.152 | 6.313 | 0.000 | 0.657 | 1.264 |
m_rd | 0.6732 | 0.233 | 2.893 | 0.005 | 0.209 | 1.138 |
m_online | 0.6775 | 0.502 | 1.350 | 0.182 | -0.325 | 1.680 |
price | 0.5054 | 0.144 | 3.518 | 0.001 | 0.218 | 0.792 |
promo | 0.0012 | 0.002 | 0.593 | 0.555 | -0.003 | 0.005 |
holidays | -0.0195 | 0.019 | -1.027 | 0.308 | -0.057 | 0.018 |
Omnibus: | 0.269 | Durbin-Watson: | 2.146 |
---|---|---|---|
Prob(Omnibus): | 0.874 | Jarque-Bera (JB): | 0.359 |
Skew: | 0.137 | Prob(JB): | 0.836 |
Kurtosis: | 2.789 | Cond. No. | 750. |
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Keep only the features that were significant in the OLS summary
x_train_v2 = x_train.drop(["m_online","promo","holidays"],axis=1)
model_v2 = skl.linear_model.LinearRegression().fit(x_train_v2,y_train)
prediction_v2 = model_v2.predict(x_train_v2)
print ("In-sample valuation\n")
print ("R Square is %.4f"%metric.r2_score(y_train,prediction_v2))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_train,prediction_v2)))
In-sample evaluation

R Square is 0.6230
RMSE is 0.0663
## Out of sample test V2
x_test_v2 = x_test.drop(["m_online","promo","holidays"],axis=1)
prediction_v2_test = model_v2.predict(x_test_v2)
print ("Out-of-sample valuation")
print ("R Square is %.4f"%metric.r2_score(y_test,prediction_v2_test))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_test,prediction_v2_test)))
Out-of-sample evaluation
R Square is 0.6750
RMSE is 0.0594
Try removing outliers, since we are not modeling this purely as a time series¶
# Drop the low-end observations flagged by the boxplots (rough thresholds)
raw_data_noout = raw_data[raw_data.sales >= 4.9]
raw_data_noout = raw_data_noout[raw_data_noout.m_tv >= 0.4]
raw_data_noout = raw_data_noout[raw_data_noout.m_rd >= 0.32]
## Train Test Split
train = raw_data_noout[:-20]
test = raw_data_noout[-20:]
x_train_noout = train.drop("sales",axis=1)
y_train_noout = train.sales
# V2 independent variables
x_train_v2_noout = x_train_noout.drop(["m_online","promo","holidays"],axis=1)
model_v2_noout = skl.linear_model.LinearRegression().fit(x_train_v2_noout,y_train_noout)
print (pd.DataFrame(model_v2_noout.coef_,x_train_v2_noout.columns,columns=['Coefficient']))
# Score against the original training set (outliers included) so the numbers are comparable with V2
prediction_v2_noout = model_v2_noout.predict(x_train_v2)
print ("In-sample evaluation\n")
print ("R Square is %.4f"%metric.r2_score(y_train,prediction_v2_noout))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_train,prediction_v2_noout)))
       Coefficient
m_tv      0.854639
m_rd      0.889650
price     0.378369
In-sample evaluation

R Square is 0.6152
RMSE is 0.0669
No significant improvement after removing outliers
Standardizing variables¶
# Standardize every column except the binary holidays flag
norm = pre.StandardScaler().fit_transform(raw_data.drop(["holidays"],axis=1))
raw_data_norm = pd.DataFrame(norm,columns=raw_data.drop(["holidays"],axis=1).columns).set_index(raw_data.index)
raw_data_norm = raw_data_norm.join(raw_data[["holidays"]])
fig, ax = plt.subplots(2,3,figsize = (12,8))
# fig.set_figheight(8)
# fig.set_figwidth(17)
sns.distplot(raw_data_norm.sales,ax=ax[0][0])
sns.distplot(raw_data_norm.m_tv,ax=ax[0][1])
sns.distplot(raw_data_norm.promo,ax=ax[0][2])
sns.distplot(raw_data_norm.m_rd,ax=ax[1][0])
sns.distplot(raw_data_norm.m_online,ax=ax[1][1])
sns.distplot(raw_data_norm.price,ax=ax[1][2])
train = raw_data_norm[:-20]
test = raw_data_norm[-20:]
x_train_norm = train.drop("sales",axis=1)
y_train_norm = train.sales
# V2 independent variables
x_train_v2_norm = x_train_norm.drop(["m_online","promo","holidays"],axis=1)
x_test_norm = test.drop("sales",axis=1)
x_test_norm = x_test_norm.drop(["m_online","promo","holidays"],axis=1)
y_test_norm = test.sales
# In sample
model_v2_norm = skl.linear_model.LinearRegression().fit(x_train_v2_norm,y_train_norm)
print (pd.DataFrame(model_v2_norm.coef_,x_train_v2_norm.columns,columns=['Coefficient']))
prediction_v2_norm = model_v2_norm.predict(x_train_v2_norm)
print ("In-sample valuation\n")
print ("R Square is %.4f"%metric.r2_score(y_train_norm,prediction_v2_norm))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_train_norm,prediction_v2_norm)))
       Coefficient
m_tv      0.602596
m_rd      0.288124
price     0.249482
In-sample evaluation

R Square is 0.6230
RMSE is 0.6132
# Out of sample
prediction_v2_norm = model_v2_norm.predict(x_test_norm)
print ("out-of-sample valuation\n")
print ("R Square is %.4f"%metric.r2_score(y_test_norm,prediction_v2_norm))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_test_norm,prediction_v2_norm)))
Out-of-sample evaluation

R Square is 0.6750
RMSE is 0.5497
plot_data = pd.DataFrame({"Real":y_test_norm,"Prediction":prediction_v2_norm})
plot_data.plot()
# prediction_v2_norm now holds the test-set predictions, so recompute the in-sample ones for the residual plot
in_sample_pred_norm = model_v2_norm.predict(x_train_v2_norm)
sns.residplot(y_train_norm, in_sample_pred_norm - y_train_norm);
Standardizing the data did not give us any improvement either.
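One caveat: because sales itself was standardized, the RMSE above is in standard-deviation units and is not directly comparable with the earlier models. A minimal sketch converting it back to the original scale (StandardScaler uses the population standard deviation, ddof=0):

# Rescale the standardized out-of-sample RMSE back to the original sales units
sales_std = raw_data.sales.std(ddof=0)  # matches StandardScaler's scaling of the sales column
rmse_norm = np.sqrt(metric.mean_squared_error(y_test_norm, prediction_v2_norm))
print("Out-of-sample RMSE in original units: %.4f" % (rmse_norm * sales_std))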
ARMA Model¶
Since we don't have any other features gathered at this point, it's not a bad idea to look at the sales series itself. We already know that the sales series is stationary and that the holiday variable is not significant, so we can use an ARMA model to fit the sales time series.
# Train Test Split
train = raw_data[:-20]
test = raw_data[-20:]
train_arma = train.sales
test_arma = test.sales
By comparing different orders, we found that ARMA(2,2) works best.
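As a rough illustration of that comparison, candidate orders can be ranked by AIC; a sketch using the same legacy `tsa.arima_model.ARIMA` API as below (the small search range is an assumption):

# Rank small ARMA(p, q) candidates on the training series by AIC (lower is better)
aic_by_order = {}
for p in range(3):
    for q in range(3):
        try:
            fit = tsa.arima_model.ARIMA(train_arma, order=(p, 0, q)).fit(disp=0)
            aic_by_order[(p, q)] = fit.aic
        except Exception:
            continue  # some orders fail to converge
print(sorted(aic_by_order.items(), key=lambda kv: kv[1])[:3])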
model_arma = tsa.arima_model.ARIMA(train_arma,order=(2,0,2)).fit()
model_arma.summary()
Dep. Variable: | sales | No. Observations: | 72 |
---|---|---|---|
Model: | ARMA(2, 2) | Log Likelihood | 68.623 |
Method: | css-mle | S.D. of innovations | 0.090 |
Date: | Sun, 20 Dec 2020 | AIC | -125.246 |
Time: | 23:11:19 | BIC | -111.586 |
Sample: | 03-26-2017 to 08-05-2018 | HQIC | -119.808 |
  | coef | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 5.1917 | 0.012 | 434.728 | 0.000 | 5.168 | 5.215 |
ar.L1.sales | 1.1234 | 0.024 | 47.128 | 0.000 | 1.077 | 1.170 |
ar.L2.sales | -0.9838 | 0.018 | -53.400 | 0.000 | -1.020 | -0.948 |
ma.L1.sales | -1.0280 | 0.060 | -17.274 | 0.000 | -1.145 | -0.911 |
ma.L2.sales | 0.9998 | 0.081 | 12.296 | 0.000 | 0.840 | 1.159 |
  | Real | Imaginary | Modulus | Frequency |
---|---|---|---|---|
AR.1 | 0.5709 | -0.8310j | 1.0082 | -0.1542 |
AR.2 | 0.5709 | +0.8310j | 1.0082 | 0.1542 |
MA.1 | 0.5141 | -0.8579j | 1.0001 | -0.1641 |
MA.2 | 0.5141 | +0.8579j | 1.0001 | 0.1641 |
predictions_arima = model_arma.predict(start=len(train_arma),
end=len(train_arma)+len(test_arma)-1,
dynamic=False,
typ='levels')
print ("R Square is %.4f"%metric.r2_score(test_arma,predictions_arima))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(test_arma,predictions_arima)))
R Square is 0.1870
RMSE is 0.0939
plot_data = pd.DataFrame({"Real":test_arma,"Prediction":predictions_arima})
plot_data.plot()
# Predict over the whole sample (train + test) so the ARMA output can be used as a feature later
predictions_arima = model_arma.predict(start=0,
end=len(train_arma)+len(test_arma)-1,
dynamic=False,
typ='levels')
predictions_arima
2017-03-26    5.191661
2017-04-02    5.195336
2017-04-09    5.244598
2017-04-16    5.191784
2017-04-23    5.120293
                 ...
2018-11-25    5.145701
2018-12-02    5.100121
2018-12-09    5.134042
2018-12-16    5.216988
2018-12-23    5.276798
Freq: W-SUN, Length: 92, dtype: float64
Add a smoothing feature and the ARMA prediction to the V2 model¶
raw_data_ = raw_data.copy()
raw_data_["sale_pred"] = predictions_arima
# Use .copy() so the rolling-mean columns below don't trigger SettingWithCopyWarning
train = raw_data_[:-20].copy()
test = raw_data_[-20:].copy()
# 2-week moving average of sales as a smoothing feature
train.loc[:,"sales_MA"] = train.sales.rolling(2).mean()
test.loc[:,"sales_MA"] = test.sales.rolling(2).mean()
train = train.dropna()
test = test.dropna()
x_train_v3 = train.drop("sales",axis=1).drop(["m_online","promo","holidays"],axis=1)
y_train_v3 = train.sales
x_test_v3 = test.drop("sales",axis=1).drop(["m_online","promo","holidays"],axis=1)
y_test_v3 = test.sales
model_v3 = skl.linear_model.LinearRegression().fit(x_train_v3,y_train_v3)
prediction_v3 = model_v3.predict(x_train_v3)
print ("In-sample valuation\n")
print ("R Square is %.4f"%metric.r2_score(y_train_v3,prediction_v3))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_train_v3,prediction_v3)))
In-sample evaluation

R Square is 0.7654
RMSE is 0.0526
prediction_v3 = model_v3.predict(x_test_v3)
print ("Out-of-sample valuation\n")
print ("R Square is %.4f"%metric.r2_score(y_test_v3,prediction_v3))
print ("RMSE is %.4f"%np.sqrt(metric.mean_squared_error(y_test_v3,prediction_v3)))
Out-of-sample evaluation

R Square is 0.7220
RMSE is 0.0558
ip = sm.add_constant(x_train_v3)
dp = y_train_v3
sm.OLS(dp,ip).fit().summary()
Dep. Variable: | sales | R-squared: | 0.765 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.747 |
Method: | Least Squares | F-statistic: | 42.42 |
Date: | Sun, 20 Dec 2020 | Prob (F-statistic): | 3.43e-19 |
Time: | 21:25:34 | Log-Likelihood: | 108.32 |
No. Observations: | 71 | AIC: | -204.6 |
Df Residuals: | 65 | BIC: | -191.1 |
Df Model: | 5 | |
Covariance Type: | nonrobust | |
  | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 3.5799 | 1.017 | 3.519 | 0.001 | 1.548 | 5.611 |
m_tv | 0.6535 | 0.180 | 3.629 | 0.001 | 0.294 | 1.013 |
m_rd | 0.6625 | 0.206 | 3.213 | 0.002 | 0.251 | 1.074 |
price | 0.0917 | 0.125 | 0.732 | 0.467 | -0.159 | 0.342 |
sale_pred | -0.5319 | 0.193 | -2.757 | 0.008 | -0.917 | -0.147 |
sales_MA | 0.7183 | 0.122 | 5.896 | 0.000 | 0.475 | 0.962 |
Omnibus: | 0.682 | Durbin-Watson: | 2.326 |
---|---|---|---|
Prob(Omnibus): | 0.711 | Jarque-Bera (JB): | 0.666 |
Skew: | -0.223 | Prob(JB): | 0.717 |
Kurtosis: | 2.837 | Cond. No. | 1.19e+03 |
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.19e+03. This might indicate that there are strong multicollinearity or other numerical problems.
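The large condition number hints at collinearity, which is plausible here since both sale_pred and sales_MA are derived from sales. Variance inflation factors are one way to check; a minimal sketch on the V3 design matrix:

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF well above ~5-10 is usually read as problematic collinearity
X = sm.add_constant(x_train_v3)
vif = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X.columns,
)
print(vif)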
sns.residplot(y_test_v3,prediction_v3-y_test_v3);
plot_data = pd.DataFrame({"Real":y_test_v3,"Prediction":prediction_v3})
plot_data.plot();