The following is the code used to generate the figures in Chapters 6 and 7 of the e-book Statistics with Python.
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler
import FinanceDataReader as fdr
import yfinance as yf
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
#fig 611
st=pd.Timestamp(2024,1, 1)
et=pd.Timestamp(2024, 5, 30)
code=["^KS11", "^KQ11", "^DJI", "KRW=X"]
nme=['kos','kq','dj','WonDol']
da=pd.DataFrame()
for i in code:
    x=yf.download(i, st, et)['Close']
    x1=x.pct_change()
    da=pd.concat([da, x1], axis=1)
da.columns=nme
da1=da.dropna()
da1.index=range(len(da1))
da2=pd.melt(da1, value_vars=['kos', 'kq', 'dj', 'WonDol'], var_name="idx", value_name="val")
model=ols("val~C(idx)", data=da2).fit()
res=model.resid
plt.figure(figsize=(3,2))
varQQplot1=stats.probplot(res, plot=plt)
plt.show()
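# Added note (not a book figure): a minimal check of the one-way model fitted
# above; sm.stats.anova_lm prints the standard ANOVA table for it.
print(sm.stats.anova_lm(model))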
#fig 612: QQ plot of the residuals from the model refit after excluding outliers from the fig 611 data
from statsmodels.stats.outliers_influence import outlier_test
outl=outlier_test(model)                    # columns: student_resid, unadj_p, bonf(p)
out2=np.where(outl.iloc[:,2]<1)[0]          # rows whose Bonferroni p-value is below 1
da3=da2.drop(out2)
model1=ols("val~C(idx)", data=da3).fit()
res1=model1.resid
plt.figure(figsize=(3,2))
varQQplot1=stats.probplot(res1, plot=plt)
plt.show()
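# Added note: out2 above drops every observation whose Bonferroni-adjusted
# p-value is below 1, i.e. anything flagged at all; a stricter, more
# conventional sketch would test at the usual 0.05 level:
# out2=np.where(outl.iloc[:,2]<0.05)[0]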
#fig 721
st=pd.Timestamp(2021,1, 1)
et=pd.Timestamp(2024, 5, 10)
nd=yf.download('^IXIC',st, et)[["Open","Close"]]
nd.columns=["Open", "Close"]
nd.index=range(len(nd))
X=nd.values[:,0].reshape(-1,1)
y=nd.values[:,1].reshape(-1,1)
from sklearn.preprocessing import StandardScaler
xScaler=StandardScaler().fit(X)
X_n=xScaler.transform(X)
yScaler=StandardScaler().fit(y)
y_n=yScaler.transform(y)
plt.figure(figsize=(4,3))
plt.scatter(X_n, y_n, label="Data")
plt.plot(X_n, 0.998*X_n, color="red", label="Regression line")
plt.legend(loc="best", frameon=False)
plt.xlabel("Open", weight="bold")
plt.ylabel('Close', weight="bold")
plt.show()
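# Added sketch (an assumption, not from the book): the hard-coded 0.998 slope
# above appears to be the OLS slope of the standardized data, which can be
# computed directly rather than typed in:
slope=LinearRegression().fit(X_n, y_n).coef_[0][0]
print(slope)    # approximately 0.998 for this data window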
#data used for fig 752 through fig 758
st=pd.Timestamp(2021,1, 1)
et=pd.Timestamp(2024, 5, 10)
nd=yf.download('^IXIC',st, et)[["Open","Close"]]
nd.columns=["Open", "Close"]
nd.index=range(len(nd))
X=nd.values[:,0].reshape(-1,1)
y=nd.values[:,1].reshape(-1,1)
from sklearn.preprocessing import StandardScaler
xScaler=StandardScaler().fit(X)
X_n=xScaler.transform(X)
yScaler=StandardScaler().fit(y)
y_n=yScaler.transform(y)
#fig 752
mod = LinearRegression()
mod.fit(X_n, y_n)
pre=mod.predict(X_n)
err=pre-y_n
plt.figure(figsize=(3,2))
errorRe=stats.probplot(err.ravel(), plot=plt)
plt.show()
#fig753
X_n0=sm.add_constant(X_n)
reg=sm.OLS(y_n, X_n0).fit()
influence=reg.get_influence()
infSummary=influence.summary_frame()
hat=infSummary["hat_diag"]
plt.figure(figsize=(6, 2))
plt.stem(hat)
plt.axhline(np.mean(hat), c='g', ls="--")
plt.axhline(np.mean(hat)*2, c='brown', ls="--")
plt.title("leverage")
plt.show()
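# Added note: for an OLS fit the hat values average exactly (p+1)/n, so the
# brown line at 2*mean(hat) is the usual 2(p+1)/n leverage cutoff.
p, n=X_n0.shape[1]-1, len(X_n0)
print(np.isclose(hat.mean(), (p+1)/n))    # True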
#fig754
out1id=np.where(hat>2*hat.mean())[0]
X_n0Hat=np.delete(X_n0, out1id, axis=0)
y_nHat=np.delete(y_n, out1id, axis=0)
reg_hat=sm.OLS(y_nHat, X_n0Hat).fit()
plt.figure(figsize=(6, 2))
plt.subplots_adjust(wspace=0.5)
plt.subplot(1,2,1)
stats.probplot(reg.resid, plot=plt)
plt.title("a) Probability plot")
plt.subplot(1,2,2)
stats.probplot(reg_hat.resid, plot=plt)
plt.title("b) Probability plot")
plt.show()
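# Added note: fig 754 compares residual normality before (a) and after (b)
# removing the high-leverage rows found above.
print(len(out1id), "high-leverage observations removed")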
#fig 755
res_std=infSummary["student_resid"]
plt.figure(figsize=(6, 3))
plt.subplots_adjust(hspace=0.6)
plt.subplot(2,1,1)
plt.stem(reg.resid)
plt.title("a) Residual")
plt.subplot(2,1,2)
plt.stem(res_std)
plt.axhline(3, c='g', ls='--')
plt.axhline(-3, c='g', ls='--')
plt.title("b) Studenized Residual")
plt.show()
#fig 756
out_stres=np.where(np.abs(res_std)>3)[0]      # |studentized residual| > 3
ind_stres=np.delete(X_n0, out_stres, axis=0)
de_stres=np.delete(y_n, out_stres, axis=0)
reg_stres=sm.OLS(de_stres, ind_stres).fit()
plt.figure(figsize=(3,2))
fig=stats.probplot(reg_stres.resid, plot=plt)
plt.show()
#fig 757
cook=infSummary["cooks_d"]
cd_ref=4/(len(reg.resid)-2-1)                 # rule-of-thumb cutoff for Cook's distance
cd_idx=np.where(cook>cd_ref)[0]
ind_cd=np.delete(X_n0, cd_idx, axis=0)
de_cd=np.delete(y_n, cd_idx, axis=0)
reg_cd=sm.OLS(de_cd, ind_cd).fit()
plt.figure(figsize=(3,2))
fig=stats.probplot(reg_cd.resid, plot=plt)
plt.show()
#fig 758
dffi=infSummary["dffits"]
dff_ref=2*np.sqrt(3/(len(reg.resid)-2+1))     # rule-of-thumb DFFITS cutoff
dff_idx=np.where(dffi>dff_ref)[0]             # the conventional test uses np.abs(dffi)
ind_dff=np.delete(X_n0, dff_idx, axis=0)
de_dff=np.delete(y_n, dff_idx, axis=0)
reg_dff=sm.OLS(de_dff, ind_dff).fit()
plt.figure(figsize=(3,2))
fig=stats.probplot(reg_dff.resid, plot=plt)
plt.show()
#data used for fig 761 ~ fig 763
st=pd.Timestamp(2023,1, 10)
et=pd.Timestamp(2024, 5, 30)
code=["^DJI", "^IXIC", "^spx", "^VIX","DX-Y.NYB","^sox"]
nme=["dj","nd","sp", "vix", "dollar","sox" ]
da={}
for i, j in zip(nme, code):
    da[i]=yf.download(j, st, et)["Close"]
da2=pd.concat(da.values(), axis=1)
da2.columns=da.keys()
da2.index=pd.DatetimeIndex(da2.index.date)
da2=da2.ffill()
ind=da2.values[:-1,:-1]
de=da2.values[1:,-1].reshape(-1,1)
final=da2.values[-1, :-1].reshape(1,-1)
indScaler=StandardScaler().fit(ind)
deScaler=StandardScaler().fit(de)
indNor=indScaler.transform(ind)
finalNor=indScaler.transform(final)
deNor=deScaler.transform(de)
da3=pd.DataFrame(np.c_[indNor, deNor.reshape(-1,1)])
da3.columns=da2.columns
form='sox~dj+nd+sp+vix+dollar'
reg=ols(form, data=da3).fit()
mod=LinearRegression().fit(indNor, deNor)
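# Added note: reg (statsmodels) and mod (scikit-learn) fit the same linear
# model; reg supplies the residual diagnostics in fig 762 and fig 763, while
# mod supplies the fitted line plotted in fig 761.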
#fig 761
plt.figure(figsize=(4, 3))
plt.plot(deNor, c="g", ls="--", label="data")
plt.plot(mod.predict(indNor), c="b", label="regression")
plt.legend(loc="best", frameon=False)
plt.show()
#fig 762
err=reg.resid
plt.figure(figsize=(4,3))
stats.probplot(err, plot=plt, rvalue=True)
plt.show()
#fig763
fig=plt.figure(figsize=(4, 3))
plt.stem(reg.resid_pearson)
plt.axhline(3, c="g", ls="--")
plt.axhline(-3, c="g", ls="--")
plt.title("studented Residuals")
plt.show()
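# Added sketch (an assumption, not a book figure): finalNor is prepared above
# but never used in the figures; presumably it feeds a one-step-ahead forecast
# of sox, which would look like this:
pred_n=mod.predict(finalNor)                  # standardized prediction
pred=deScaler.inverse_transform(pred_n)       # back to the original scale
print(pred)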