Ethereum analysis and forecasting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import matplotlib.dates as mdates
import scipy.stats
import pylab
import lightgbm as lgb
from lightgbm import LGBMRegressor
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from dateutil.parser import parse
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("ETH-USD.csv")
df.describe()
| | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|
| count | 2032.000000 | 2032.000000 | 2032.000000 | 2032.000000 | 2032.000000 | 2.032000e+03 |
| mean | 1172.894714 | 1209.016718 | 1132.495742 | 1173.463282 | 1173.463282 | 1.253992e+10 |
| std | 1154.144130 | 1188.670880 | 1114.491666 | 1153.653971 | 1153.653971 | 1.047928e+10 |
| min | 84.279694 | 85.342743 | 82.829887 | 84.308296 | 84.308296 | 6.217330e+08 |
| 25% | 220.315762 | 226.314495 | 213.840340 | 220.328876 | 220.328876 | 4.845131e+09 |
| 50% | 623.848511 | 648.783997 | 596.979004 | 630.632447 | 630.632447 | 9.944210e+09 |
| 75% | 1808.816773 | 1840.087402 | 1763.558655 | 1810.730591 | 1810.730591 | 1.749615e+10 |
| max | 4810.071289 | 4891.704590 | 4718.039063 | 4812.087402 | 4812.087402 | 8.448291e+10 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2032 entries, 0 to 2031
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2032 non-null object
1 Open 2032 non-null float64
2 High 2032 non-null float64
3 Low 2032 non-null float64
4 Close 2032 non-null float64
5 Adj Close 2032 non-null float64
6 Volume 2032 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 111.2+ KB
df.head()
| | Date | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|---|
| 0 | 2017-11-09 | 308.644989 | 329.451996 | 307.056000 | 320.884003 | 320.884003 | 893249984 |
| 1 | 2017-11-10 | 320.670990 | 324.717987 | 294.541992 | 299.252991 | 299.252991 | 885985984 |
| 2 | 2017-11-11 | 298.585999 | 319.453003 | 298.191986 | 314.681000 | 314.681000 | 842300992 |
| 3 | 2017-11-12 | 314.690002 | 319.153015 | 298.513000 | 307.907990 | 307.907990 | 1613479936 |
| 4 | 2017-11-13 | 307.024994 | 328.415009 | 307.024994 | 316.716003 | 316.716003 | 1041889984 |
df.tail()
| | Date | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|---|
| 2027 | 2023-05-29 | 1909.297485 | 1926.421753 | 1879.077515 | 1893.078125 | 1893.078125 | 5884674572 |
| 2028 | 2023-05-30 | 1893.093140 | 1916.574951 | 1883.934692 | 1901.026611 | 1901.026611 | 5363439784 |
| 2029 | 2023-05-31 | 1901.098267 | 1907.035400 | 1852.094727 | 1874.130493 | 1874.130493 | 5984512548 |
| 2030 | 2023-06-01 | 1873.914673 | 1887.705322 | 1846.227417 | 1862.201416 | 1862.201416 | 5640027197 |
| 2031 | 2023-06-02 | 1862.266724 | 1909.625732 | 1851.964722 | 1906.706543 | 1906.706543 | 6151811584 |
# find null values
df.isnull().sum()
Date 0
Open 0
High 0
Low 0
Close 0
Adj Close 0
Volume 0
dtype: int64
missing_val = df.isnull().sum()
missing_val_percent = (100 * missing_val) / len(df)
missing_val_table = pd.concat([missing_val, missing_val_percent], axis=1)
missing_val_table = missing_val_table.rename(columns={0: 'Missing Values', 1: 'Percentage'})
missing_val_table = missing_val_table.sort_values(by='Missing Values', ascending=False)
missing_val_table
| | Missing Values | Percentage |
|---|---|---|
| Date | 0 | 0.0 |
| Open | 0 | 0.0 |
| High | 0 | 0.0 |
| Low | 0 | 0.0 |
| Close | 0 | 0.0 |
| Adj Close | 0 | 0.0 |
| Volume | 0 | 0.0 |
data = df.copy()
df['Date'] = pd.to_datetime(df['Date'])
df.set_index(['Date'], inplace=True)
EDA
Open, Close, High, Low prices over time
cols_plot = ['Open', 'Close', 'High','Low']
axes = df[cols_plot].plot(figsize=(11, 9), subplots=True)
for ax in axes:
    ax.set_ylabel('Daily trade')
Volume over Time
fig = go.Figure([go.Scatter(x=df.index, y=df['Volume'])])
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    template='simple_white',
    title='Volume over time'
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title="Volume")
fig.show()
fig = go.Figure([go.Scatter(x=df.loc['2022', 'Volume'].index, y=df.loc['2022', 'Volume'])])
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    template='simple_white',
    title='Volume in 2022'
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title="Volume")
fig.show()
Seasonal decomposition
plt.rcParams.update({'figure.figsize': (10,10)})
y = df['Adj Close'].to_frame()
# Multiplicative Decomposition
result_mul = seasonal_decompose(y, model='multiplicative',period = 52)
# Additive Decomposition
result_add = seasonal_decompose(y, model='additive',period = 52)
# Plot
plt.rcParams.update({'figure.figsize': (10,10)})
result_mul.plot().suptitle('Multiplicative Decompose', fontsize=22)
result_add.plot().suptitle('Additive Decompose', fontsize=22)
plt.show()
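The decomposition results also expose the individual components (trend, seasonal, residual) as attributes, which is useful when you want to inspect them outside the default plot. A minimal sketch using the additive result from above:
# Collect the additive components into one DataFrame for inspection
components = pd.concat([result_add.trend, result_add.seasonal, result_add.resid], axis=1)
components.columns = ['trend', 'seasonal', 'resid']
print(components.dropna().head())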
Stationarity
def adf_test(series, title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(), autolag='AIC')
    labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
    out = pd.Series(result[0:4], index=labels)
    for key, val in result[4].items():
        out[f'critical value ({key})'] = val
    print(out.to_string(), '\n')
    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
    return out
adf_test(df['Adj Close'], title='Ethereum Data');
Augmented Dickey-Fuller Test: Ethereum Data
ADF test statistic -1.405220
p-value 0.579771
# lags used 17.000000
# observations 2014.000000
critical value (1%) -3.433601
critical value (5%) -2.862976
critical value (10%) -2.567535
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
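Since the level series is non-stationary, the usual next step is to difference it once and test again; this is what the d = 1 term in the ARIMA models below accounts for. A quick sketch, reusing the adf_test helper defined above:
# ADF test on the first difference of the adjusted close
adj_close_diff = df['Adj Close'].diff().dropna()
adf_test(adj_close_diff, title='Ethereum Data (first difference)');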
Plotting ACF and PACF
plt.rcParams.update({'figure.figsize': (20,6)})
sm.graphics.tsa.plot_acf(df['Adj Close'], lags=30,
                         title='Auto correlation of Adj Close', zero=False);
sm.graphics.tsa.plot_pacf(df['Adj Close'], lags=30,
                          title='Partial auto correlation of Adj Close', zero=False);
data.Date = pd.to_datetime(data.Date, format="%Y-%m-%d")
data["month"] = data.Date.dt.month
data["week"] = data.Date.dt.isocalendar().week.astype(int)  # dt.week is removed in newer pandas
data["day"] = data.Date.dt.day
data["day_of_week"] = data.Date.dt.dayofweek
data = data.rename(columns={'Adj Close': 'adj_close'})
df_train = data[data.Date < "2022"]
df_valid = data[data.Date >= "2022"]
exogenous_features = ["Open", "High", "Low", "Close", "Volume", "month", "week", "day", "day_of_week"]
df_valid['Date'].describe()
count 518
unique 518
top 2022-01-01 00:00:00
freq 1
first 2022-01-01 00:00:00
last 2023-06-02 00:00:00
Name: Date, dtype: object
model = DummyRegressor().fit(df_train[exogenous_features], df_train['adj_close'])
df_valid['Dummy_preds'] = model.predict(df_valid[exogenous_features])
print('RMSE:', np.sqrt(mean_squared_error(df_valid['adj_close'], df_valid['Dummy_preds'])))
print('MAE:', mean_absolute_error(df_valid['adj_close'], df_valid['Dummy_preds']))
df_valid[['adj_close', 'Dummy_preds']].plot();
RMSE: 1188.7560160602443
MAE: 981.8256955373223
model = LGBMRegressor().fit(df_train[exogenous_features], df_train['adj_close'])
df_valid['LGBM_preds'] = model.predict(df_valid[exogenous_features])
print('RMSE:', np.sqrt(mean_squared_error(df_valid['adj_close'], df_valid['LGBM_preds'])))
print('MAE:', mean_absolute_error(df_valid['adj_close'], df_valid['LGBM_preds']))
df_valid[['adj_close', 'LGBM_preds']].plot();
RMSE: 24.667576357416404
MAE: 19.52191702437567
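To get a feel for what the boosted trees rely on, you can rank the exogenous features by the model's split-based importances (a small sketch; feature_importances_ is the scikit-learn-style attribute exposed by LGBMRegressor):
# Rank exogenous features by how often LightGBM split on them
lgbm_importance = pd.Series(model.feature_importances_, index=exogenous_features)
lgbm_importance.sort_values().plot(kind='barh', title='LGBM feature importance');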
%%time
model = auto_arima(
    df_train.adj_close, exogenous=df_train[exogenous_features],
    trace=True, error_action="ignore", suppress_warnings=True
).fit(df_train.adj_close, exogenous=df_train[exogenous_features])
df_valid["ARIMAX_preds"] = model.predict(n_periods=len(df_valid),
                                         exogenous=df_valid[exogenous_features])
print('RMSE:', np.sqrt(mean_squared_error(df_valid['adj_close'], df_valid['ARIMAX_preds'])))
print('MAE:', mean_absolute_error(df_valid['adj_close'], df_valid['ARIMAX_preds']))
df_valid[['adj_close', 'ARIMAX_preds']].plot();
Performing stepwise search to minimize aic
ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=17393.975, Time=3.35 sec
ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=17408.280, Time=0.09 sec
ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=17395.662, Time=0.15 sec
ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=17395.970, Time=0.21 sec
ARIMA(0,1,0)(0,0,0)[0] : AIC=17407.568, Time=0.05 sec
ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=17392.527, Time=2.62 sec
ARIMA(0,1,2)(0,0,0)[0] intercept : AIC=17397.619, Time=0.69 sec
ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=17397.629, Time=0.22 sec
ARIMA(1,1,3)(0,0,0)[0] intercept : AIC=17390.939, Time=2.56 sec
ARIMA(0,1,3)(0,0,0)[0] intercept : AIC=17399.523, Time=1.08 sec
ARIMA(2,1,3)(0,0,0)[0] intercept : AIC=17356.784, Time=4.48 sec
ARIMA(3,1,3)(0,0,0)[0] intercept : AIC=17370.609, Time=5.51 sec
ARIMA(2,1,4)(0,0,0)[0] intercept : AIC=17358.991, Time=5.23 sec
ARIMA(1,1,4)(0,0,0)[0] intercept : AIC=17379.214, Time=3.57 sec
ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=17358.284, Time=4.45 sec
ARIMA(3,1,4)(0,0,0)[0] intercept : AIC=17360.151, Time=5.60 sec
ARIMA(2,1,3)(0,0,0)[0] : AIC=17356.338, Time=2.31 sec
ARIMA(1,1,3)(0,0,0)[0] : AIC=17390.591, Time=0.97 sec
ARIMA(2,1,2)(0,0,0)[0] : AIC=17393.532, Time=1.87 sec
ARIMA(3,1,3)(0,0,0)[0] : AIC=17369.305, Time=3.11 sec
ARIMA(2,1,4)(0,0,0)[0] : AIC=17357.924, Time=2.69 sec
ARIMA(1,1,2)(0,0,0)[0] : AIC=17398.000, Time=0.96 sec
ARIMA(1,1,4)(0,0,0)[0] : AIC=17378.590, Time=1.11 sec
ARIMA(3,1,2)(0,0,0)[0] : AIC=17396.234, Time=1.37 sec
ARIMA(3,1,4)(0,0,0)[0] : AIC=17359.593, Time=3.75 sec
Best model: ARIMA(2,1,3)(0,0,0)[0]
Total fit time: 58.012 seconds
RMSE: 1896.846491521281
MAE: 1776.197608212806
CPU times: total: 1min
Wall time: 1min
<AxesSubplot:>
model.summary()
| Dep. Variable: | y | No. Observations: | 1514 |
|---|---|---|---|
| Model: | SARIMAX(2, 1, 3) | Log Likelihood | -8672.169 |
| Date: | Sun, 04 Jun 2023 | AIC | 17356.338 |
| Time: | 19:18:50 | BIC | 17388.269 |
| Sample: | 0 - 1514 | HQIC | 17368.229 |
| Covariance Type: | opg | | |

| | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| ar.L1 | -1.8285 | 0.014 | -128.313 | 0.000 | -1.856 | -1.801 |
| ar.L2 | -0.9083 | 0.013 | -67.761 | 0.000 | -0.935 | -0.882 |
| ma.L1 | 1.7608 | 0.018 | 99.850 | 0.000 | 1.726 | 1.795 |
| ma.L2 | 0.7893 | 0.027 | 28.813 | 0.000 | 0.736 | 0.843 |
| ma.L3 | -0.0798 | 0.014 | -5.735 | 0.000 | -0.107 | -0.052 |
| sigma2 | 5570.9644 | 69.164 | 80.547 | 0.000 | 5435.405 | 5706.524 |

| Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 27411.90 |
|---|---|---|---|
| Prob(Q): | 1.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 13.60 | Skew: | -0.85 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 23.78 |
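One way to read the coefficient table, assuming statsmodels' standard ARIMA parameterization, is as the fitted difference equation

$(1 - \phi_1 B - \phi_2 B^2)(1 - B)\,y_t = (1 + \theta_1 B + \theta_2 B^2 + \theta_3 B^3)\,\varepsilon_t$

where $B$ is the backshift operator, $\phi_1 \approx -1.83$ and $\phi_2 \approx -0.91$ are the ar.L rows, $\theta_1 \approx 1.76$, $\theta_2 \approx 0.79$ and $\theta_3 \approx -0.08$ are the ma.L rows, and $\varepsilon_t$ is white noise with variance $\sigma^2 \approx 5571$.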
Error Analysis
df_valid['ARIMAX_error'] = df_valid['adj_close'] - df_valid['ARIMAX_preds']
df_valid['LGBM_error'] = df_valid['adj_close'] - df_valid['LGBM_preds']
ax = df_valid[['LGBM_error', 'ARIMAX_error']].plot()
ax.set_title('Errors', fontsize=18);
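The plot makes the scale difference between the two models obvious; summarising the same error columns numerically gives a more compact comparison (a small sketch using the columns created above):
# Numeric summary of the two error series
print(df_valid[['LGBM_error', 'ARIMAX_error']].describe())
print(df_valid[['LGBM_error', 'ARIMAX_error']].abs().mean())  # mean absolute error per model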
The full notebook is available on my Kaggle account:
https://www.kaggle.com/code/mixmore/ethereum-analysis-and-forecasting
This post is a walkthrough of that Kaggle example.