Ethereum analysis and forecasting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import matplotlib.dates as mdates
import scipy.stats
import pylab
import lightgbm as lgb
from lightgbm import LGBMRegressor
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from dateutil.parser import parse
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("ETH-USD.csv")
df.describe()
| | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|
| count | 2032.000000 | 2032.000000 | 2032.000000 | 2032.000000 | 2032.000000 | 2.032000e+03 |
| mean | 1172.894714 | 1209.016718 | 1132.495742 | 1173.463282 | 1173.463282 | 1.253992e+10 |
| std | 1154.144130 | 1188.670880 | 1114.491666 | 1153.653971 | 1153.653971 | 1.047928e+10 |
| min | 84.279694 | 85.342743 | 82.829887 | 84.308296 | 84.308296 | 6.217330e+08 |
| 25% | 220.315762 | 226.314495 | 213.840340 | 220.328876 | 220.328876 | 4.845131e+09 |
| 50% | 623.848511 | 648.783997 | 596.979004 | 630.632447 | 630.632447 | 9.944210e+09 |
| 75% | 1808.816773 | 1840.087402 | 1763.558655 | 1810.730591 | 1810.730591 | 1.749615e+10 |
| max | 4810.071289 | 4891.704590 | 4718.039063 | 4812.087402 | 4812.087402 | 8.448291e+10 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2032 entries, 0 to 2031
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2032 non-null object
1 Open 2032 non-null float64
2 High 2032 non-null float64
3 Low 2032 non-null float64
4 Close 2032 non-null float64
5 Adj Close 2032 non-null float64
6 Volume 2032 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 111.2+ KB
df.head()
| | Date | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|---|
| 0 | 2017-11-09 | 308.644989 | 329.451996 | 307.056000 | 320.884003 | 320.884003 | 893249984 |
| 1 | 2017-11-10 | 320.670990 | 324.717987 | 294.541992 | 299.252991 | 299.252991 | 885985984 |
| 2 | 2017-11-11 | 298.585999 | 319.453003 | 298.191986 | 314.681000 | 314.681000 | 842300992 |
| 3 | 2017-11-12 | 314.690002 | 319.153015 | 298.513000 | 307.907990 | 307.907990 | 1613479936 |
| 4 | 2017-11-13 | 307.024994 | 328.415009 | 307.024994 | 316.716003 | 316.716003 | 1041889984 |
df.tail()
| | Date | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|---|
| 2027 | 2023-05-29 | 1909.297485 | 1926.421753 | 1879.077515 | 1893.078125 | 1893.078125 | 5884674572 |
| 2028 | 2023-05-30 | 1893.093140 | 1916.574951 | 1883.934692 | 1901.026611 | 1901.026611 | 5363439784 |
| 2029 | 2023-05-31 | 1901.098267 | 1907.035400 | 1852.094727 | 1874.130493 | 1874.130493 | 5984512548 |
| 2030 | 2023-06-01 | 1873.914673 | 1887.705322 | 1846.227417 | 1862.201416 | 1862.201416 | 5640027197 |
| 2031 | 2023-06-02 | 1862.266724 | 1909.625732 | 1851.964722 | 1906.706543 | 1906.706543 | 6151811584 |
# find null values
df.isnull().sum()
Date 0
Open 0
High 0
Low 0
Close 0
Adj Close 0
Volume 0
dtype: int64
missing_val = df.isnull().sum()
missing_val_percent = (100 * missing_val) / len(df)
missing_val_table = pd.concat([missing_val, missing_val_percent], axis=1)
missing_val_table = missing_val_table.rename(columns={0: 'Missing Values', 1: 'Percentage'})
missing_val_table = missing_val_table.sort_values(by='Missing Values', ascending=False)
missing_val_table
| | Missing Values | Percentage |
|---|---|---|
| Date | 0 | 0.0 |
| Open | 0 | 0.0 |
| High | 0 | 0.0 |
| Low | 0 | 0.0 |
| Close | 0 | 0.0 |
| Adj Close | 0 | 0.0 |
| Volume | 0 | 0.0 |
data = df.copy()
df['Date'] = pd.to_datetime(df['Date'])
df.set_index(['Date'], inplace=True)
EDA
Open, Close, High, Low prices over time
cols_plot = ['Open', 'Close', 'High','Low']
axes = df[cols_plot].plot(figsize=(11, 9), subplots=True)
for ax in axes:
    ax.set_ylabel('Daily trade')
Volume over Time
fig = go.Figure([go.Scatter(x=df.index, y=df['Volume'])])
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    template='simple_white',
    title='Volume over time'
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title="Volume")
fig.show()
fig = go.Figure([go.Scatter(x=df.loc['2022', 'Volume'].index, y=df.loc['2022', 'Volume'])])
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    template='simple_white',
    title='Volume in 2022'
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title="Volume")
fig.show()
Seasonal decomposition
plt.rcParams.update({'figure.figsize': (10,10)})
y = df['Adj Close'].to_frame()
# Multiplicative Decomposition
result_mul = seasonal_decompose(y, model='multiplicative',period = 52)
# Additive Decomposition
result_add = seasonal_decompose(y, model='additive',period = 52)
# Plot
plt.rcParams.update({'figure.figsize': (10,10)})
result_mul.plot().suptitle('Multiplicative Decompose', fontsize=22)
result_add.plot().suptitle('Additive Decompose', fontsize=22)
plt.show()
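The decomposition results also expose the individual components (trend, seasonal, residual) as attributes, which is useful when you want to inspect them outside the default plot. A minimal sketch using the additive result from above:
# Collect the additive components into one DataFrame for inspection
components = pd.concat([result_add.trend, result_add.seasonal, result_add.resid], axis=1)
components.columns = ['trend', 'seasonal', 'resid']
print(components.dropna().head())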
Stationarity
def adf_test(series, title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(), autolag='AIC')
    labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
    out = pd.Series(result[0:4], index=labels)
    for key, val in result[4].items():
        out[f'critical value ({key})'] = val
    print(out.to_string(), '\n')
    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
    return out
adf_test(df['Adj Close'], title='Ethereum Data');
Augmented Dickey-Fuller Test: Ethereum Data
ADF test statistic -1.405220
p-value 0.579771
# lags used 17.000000
# observations 2014.000000
critical value (1%) -3.433601
critical value (5%) -2.862976
critical value (10%) -2.567535
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
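Since the level series is non-stationary, the usual next step is to difference it once and test again; this is what the d = 1 term in the ARIMA models below accounts for. A quick sketch, reusing the adf_test helper defined above:
# ADF test on the first difference of the adjusted close
adj_close_diff = df['Adj Close'].diff().dropna()
adf_test(adj_close_diff, title='Ethereum Data (first difference)');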
Plotting ACF and PACF
plt.rcParams.update({'figure.figsize': (20,6)})
sm.graphics.tsa.plot_acf(df['Adj Close'], lags=30,
                         title='Auto correlation of Adj Close', zero=False);
sm.graphics.tsa.plot_pacf(df['Adj Close'], lags=30,
                          title='Partial auto correlation of Adj Close', zero=False);
data.Date = pd.to_datetime(data.Date, format="%Y-%m-%d")
data["month"] = data.Date.dt.month
data["week"] = data.Date.dt.isocalendar().week.astype(int)  # dt.week is removed in newer pandas
data["day"] = data.Date.dt.day
data["day_of_week"] = data.Date.dt.dayofweek
data = data.rename(columns={'Adj Close': 'adj_close'})
df_train = data[data.Date < "2022"]
df_valid = data[data.Date >= "2022"]
exogenous_features = ["Open", "High", "Low", "Close", "Volume", "month", "week", "day", "day_of_week"]
df_valid['Date'].describe()
count 518
unique 518
top 2022-01-01 00:00:00
freq 1
first 2022-01-01 00:00:00
last 2023-06-02 00:00:00
Name: Date, dtype: object
model = DummyRegressor().fit(df_train[exogenous_features], df_train['adj_close'])
df_valid['Dummy_preds'] = model.predict(df_valid[exogenous_features])
print('RMSE:', np.sqrt(mean_squared_error(df_valid['adj_close'], df_valid['Dummy_preds'])))
print('MAE:', mean_absolute_error(df_valid['adj_close'], df_valid['Dummy_preds']))
df_valid[['adj_close', 'Dummy_preds']].plot();
RMSE: 1188.7560160602443
MAE: 981.8256955373223
model = LGBMRegressor().fit(df_train[exogenous_features], df_train['adj_close'])
df_valid['LGBM_preds'] = model.predict(df_valid[exogenous_features])
print('RMSE:', np.sqrt(mean_squared_error(df_valid['adj_close'], df_valid['LGBM_preds'])))
print('MAE:', mean_absolute_error(df_valid['adj_close'], df_valid['LGBM_preds']))
df_valid[['adj_close', 'LGBM_preds']].plot();
RMSE: 24.667576357416404
MAE: 19.52191702437567
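To get a feel for what the boosted trees rely on, you can rank the exogenous features by the model's split-based importances (a small sketch; feature_importances_ is the scikit-learn-style attribute exposed by LGBMRegressor):
# Rank exogenous features by how often LightGBM split on them
lgbm_importance = pd.Series(model.feature_importances_, index=exogenous_features)
lgbm_importance.sort_values().plot(kind='barh', title='LGBM feature importance');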
%%time
model = auto_arima(
    df_train.adj_close, exogenous=df_train[exogenous_features],
    trace=True, error_action="ignore", suppress_warnings=True
).fit(df_train.adj_close, exogenous=df_train[exogenous_features])
df_valid["ARIMAX_preds"] = model.predict(n_periods=len(df_valid),
                                         exogenous=df_valid[exogenous_features])
print('RMSE:', np.sqrt(mean_squared_error(df_valid['adj_close'], df_valid['ARIMAX_preds'])))
print('MAE:', mean_absolute_error(df_valid['adj_close'], df_valid['ARIMAX_preds']))
df_valid[['adj_close', 'ARIMAX_preds']].plot();
Performing stepwise search to minimize aic
ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=17393.975, Time=3.35 sec
ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=17408.280, Time=0.09 sec
ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=17395.662, Time=0.15 sec
ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=17395.970, Time=0.21 sec
ARIMA(0,1,0)(0,0,0)[0] : AIC=17407.568, Time=0.05 sec
ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=17392.527, Time=2.62 sec
ARIMA(0,1,2)(0,0,0)[0] intercept : AIC=17397.619, Time=0.69 sec
ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=17397.629, Time=0.22 sec
ARIMA(1,1,3)(0,0,0)[0] intercept : AIC=17390.939, Time=2.56 sec
ARIMA(0,1,3)(0,0,0)[0] intercept : AIC=17399.523, Time=1.08 sec
ARIMA(2,1,3)(0,0,0)[0] intercept : AIC=17356.784, Time=4.48 sec
ARIMA(3,1,3)(0,0,0)[0] intercept : AIC=17370.609, Time=5.51 sec
ARIMA(2,1,4)(0,0,0)[0] intercept : AIC=17358.991, Time=5.23 sec
ARIMA(1,1,4)(0,0,0)[0] intercept : AIC=17379.214, Time=3.57 sec
ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=17358.284, Time=4.45 sec
ARIMA(3,1,4)(0,0,0)[0] intercept : AIC=17360.151, Time=5.60 sec
ARIMA(2,1,3)(0,0,0)[0] : AIC=17356.338, Time=2.31 sec
ARIMA(1,1,3)(0,0,0)[0] : AIC=17390.591, Time=0.97 sec
ARIMA(2,1,2)(0,0,0)[0] : AIC=17393.532, Time=1.87 sec
ARIMA(3,1,3)(0,0,0)[0] : AIC=17369.305, Time=3.11 sec
ARIMA(2,1,4)(0,0,0)[0] : AIC=17357.924, Time=2.69 sec
ARIMA(1,1,2)(0,0,0)[0] : AIC=17398.000, Time=0.96 sec
ARIMA(1,1,4)(0,0,0)[0] : AIC=17378.590, Time=1.11 sec
ARIMA(3,1,2)(0,0,0)[0] : AIC=17396.234, Time=1.37 sec
ARIMA(3,1,4)(0,0,0)[0] : AIC=17359.593, Time=3.75 sec
Best model: ARIMA(2,1,3)(0,0,0)[0]
Total fit time: 58.012 seconds
RMSE: 1896.846491521281
MAE: 1776.197608212806
CPU times: total: 1min
Wall time: 1min
<AxesSubplot:>
model.summary()
| Dep. Variable: | y | No. Observations: | 1514 |
|---|---|---|---|
| Model: | SARIMAX(2, 1, 3) | Log Likelihood | -8672.169 |
| Date: | Sun, 04 Jun 2023 | AIC | 17356.338 |
| Time: | 19:18:50 | BIC | 17388.269 |
| Sample: | 0 - 1514 | HQIC | 17368.229 |
| Covariance Type: | opg | | |

| | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| ar.L1 | -1.8285 | 0.014 | -128.313 | 0.000 | -1.856 | -1.801 |
| ar.L2 | -0.9083 | 0.013 | -67.761 | 0.000 | -0.935 | -0.882 |
| ma.L1 | 1.7608 | 0.018 | 99.850 | 0.000 | 1.726 | 1.795 |
| ma.L2 | 0.7893 | 0.027 | 28.813 | 0.000 | 0.736 | 0.843 |
| ma.L3 | -0.0798 | 0.014 | -5.735 | 0.000 | -0.107 | -0.052 |
| sigma2 | 5570.9644 | 69.164 | 80.547 | 0.000 | 5435.405 | 5706.524 |

| Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 27411.90 |
|---|---|---|---|
| Prob(Q): | 1.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 13.60 | Skew: | -0.85 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 23.78 |
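One way to read the coefficient table, assuming statsmodels' standard ARIMA parameterization, is as the fitted difference equation

$(1 - \phi_1 B - \phi_2 B^2)(1 - B)\,y_t = (1 + \theta_1 B + \theta_2 B^2 + \theta_3 B^3)\,\varepsilon_t$

where $B$ is the backshift operator, $\phi_1 \approx -1.83$ and $\phi_2 \approx -0.91$ are the ar.L rows, $\theta_1 \approx 1.76$, $\theta_2 \approx 0.79$ and $\theta_3 \approx -0.08$ are the ma.L rows, and $\varepsilon_t$ is white noise with variance $\sigma^2 \approx 5571$.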
Error Analysis
df_valid['ARIMAX_error'] = df_valid['adj_close'] - df_valid['ARIMAX_preds']
df_valid['LGBM_error'] = df_valid['adj_close'] - df_valid['LGBM_preds']
ax = df_valid[['LGBM_error', 'ARIMAX_error']].plot()
ax.set_title('Errors', fontsize=18);
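The plot makes the scale difference between the two models obvious; summarising the same error columns numerically gives a more compact comparison (a small sketch using the columns created above):
# Numeric summary of the two error series
print(df_valid[['LGBM_error', 'ARIMAX_error']].describe())
print(df_valid[['LGBM_error', 'ARIMAX_error']].abs().mean())  # mean absolute error per model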
The full notebook is available on my Kaggle account:
https://www.kaggle.com/code/mixmore/ethereum-analysis-and-forecasting
This post is a walkthrough of that Kaggle example.