The following code produces the figures included in Chapters 6 and 7 of the e-book Statistics with Python.
# Third-party imports shared by all figure scripts below.
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler
import FinanceDataReader as fdr
import yfinance as yf
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")
# fig 6.11: QQ plot of the residuals of a one-way ANOVA model fitted on
# daily returns of KOSPI, KOSDAQ, Dow Jones and the KRW/USD rate.
st = pd.Timestamp(2024, 1, 1)
et = pd.Timestamp(2024, 5, 30)
code = ["^KS11", "^KQ11", "^DJI", "KRW=X"]
nme = ['kos', 'kq', 'dj', 'WonDol']

# Download each close series and convert to daily percentage returns.
da = pd.DataFrame()
for ticker in code:
    close = yf.download(ticker, st, et)['Close']
    da = pd.concat([da, close.pct_change()], axis=1)
da.columns = nme

da1 = da.dropna()
da1.index = range(len(da1))

# Long format: one row per (index name, return value) pair.
da2 = pd.melt(da1, value_vars=['kos', 'kq', 'dj', 'WonDol'],
              var_name="idx", value_name="val")

# One-way ANOVA of return on market index; QQ plot of its residuals.
model = ols("val~C(idx)", data=da2).fit()
res = model.resid
plt.figure(figsize=(3, 2))
varQQplot1 = stats.probplot(res, plot=plt)
plt.show()
# fig 6.12: QQ plot of the residuals after excluding outliers (flagged by
# the Bonferroni outlier test) from the fig 6.11 model's data.
from statsmodels.stats.outliers_influence import outlier_test

outl = outlier_test(model)
# Column 2 holds the Bonferroni-adjusted p-value; rows where it is below 1
# are treated as outliers and dropped.
out2 = np.where(outl.iloc[:, 2] < 1)[0]
da3 = da2.drop(out2)

# Refit the one-way ANOVA on the cleaned data and inspect its residuals.
model1 = ols("val~C(idx)", data=da3).fit()
res1 = model1.resid
plt.figure(figsize=(3, 2))
varQQplot1 = stats.probplot(res1, plot=plt)
plt.show()
# fig 7.21: scatter plot of standardized NASDAQ Open vs Close with a
# regression line of slope 0.998.
st = pd.Timestamp(2021, 1, 1)
et = pd.Timestamp(2024, 5, 10)
nd = yf.download('^IXIC', st, et)[["Open", "Close"]]
nd.columns = ["Open", "Close"]
nd.index = range(len(nd))
X = nd.values[:, 0].reshape(-1, 1)
y = nd.values[:, 1].reshape(-1, 1)
# StandardScaler is imported at the top of the file; the redundant local
# re-import that was here has been removed.
xScaler = StandardScaler().fit(X)
X_n = xScaler.transform(X)
yScaler = StandardScaler().fit(y)
y_n = yScaler.transform(y)
plt.figure(figsize=(4, 3))
plt.scatter(X_n, y_n, label="Data")
# NOTE(review): 0.998 is a hard-coded slope, presumably taken from a fit
# shown elsewhere in the book — confirm it matches the fitted model.
plt.plot(X_n, 0.998 * X_n, color="red", label="Regression line")
plt.legend(loc="best", frameon=False)
plt.xlabel("Open", weight="bold")
plt.ylabel('Close', weight="bold")
plt.show()
# Data used by fig 7.52 – fig 7.58: standardized NASDAQ Open (predictor)
# and Close (response) series.
st = pd.Timestamp(2021, 1, 1)
et = pd.Timestamp(2024, 5, 10)
nd = yf.download('^IXIC', st, et)[["Open", "Close"]]
nd.columns = ["Open", "Close"]
nd.index = range(len(nd))
X = nd.values[:, 0].reshape(-1, 1)
y = nd.values[:, 1].reshape(-1, 1)
# StandardScaler is imported at the top of the file; the redundant local
# re-import that was here has been removed.
xScaler = StandardScaler().fit(X)
X_n = xScaler.transform(X)
yScaler = StandardScaler().fit(y)
y_n = yScaler.transform(y)
# fig 7.52: normal probability plot of the prediction errors of a linear
# regression of standardized Close on standardized Open.
mod = LinearRegression()
mod.fit(X_n, y_n)
err = mod.predict(X_n) - y_n
plt.figure(figsize=(3, 2))
errorRe = stats.probplot(err.ravel(), plot=plt)
plt.show()
# fig 7.53: stem plot of the leverage (hat-matrix diagonal) of every
# observation in the OLS fit of Close on Open.
X_n0 = sm.add_constant(X_n)
reg = sm.OLS(y_n, X_n0).fit()
influence = reg.get_influence()
infSummary = influence.summary_frame()
hat = infSummary["hat_diag"]

plt.figure(figsize=(6, 2))
plt.stem(hat)
# Reference lines: mean leverage, and the common 2*mean rule-of-thumb cutoff.
plt.axhline(hat.mean(), c='g', ls="--")
plt.axhline(hat.mean() * 2, c='brown', ls="--")
plt.title("leverage")
plt.show()
# fig 7.54: residual QQ plots before (a) and after (b) removing
# high-leverage points (leverage > 2 * mean leverage).
out1id = np.where(hat > 2 * hat.mean())[0]
X_n0Hat = np.delete(X_n0, out1id, axis=0)
y_nHat = np.delete(y_n, out1id, axis=0)
reg_hat = sm.OLS(y_nHat, X_n0Hat).fit()

plt.figure(figsize=(6, 2))
plt.subplots_adjust(wspace=0.5)
panels = [(reg.resid, "a) Probability plot"),
          (reg_hat.resid, "b) Probability plot")]
for pos, (resid, ttl) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    stats.probplot(resid, plot=plt)
    plt.title(ttl)
plt.show()
# fig 7.55: raw residuals (a) vs internally studentized residuals (b),
# with the |t| = 3 outlier cutoff marked on the studentized panel.
res_std = infSummary["student_resid"]
plt.figure(figsize=(6, 3))
plt.subplots_adjust(hspace=0.6)
plt.subplot(2, 1, 1)
plt.stem(reg.resid)
plt.title("a) Residual")
plt.subplot(2, 1, 2)
plt.stem(res_std)
plt.axhline(3, c='g', ls='--')
plt.axhline(-3, c='g', ls='--')
# Fixed typo in the original title ("Studenized" -> "Studentized").
plt.title("b) Studentized Residual")
plt.show()
# fig 7.56: residual QQ plot after dropping observations whose
# studentized residual exceeds 3 in absolute value.
flagged = np.where(np.abs(res_std) > 3)[0]
X_kept = np.delete(X_n0, flagged, axis=0)
y_kept = np.delete(y_n, flagged, axis=0)
reg_stres = sm.OLS(y_kept, X_kept).fit()
plt.figure(figsize=(3, 2))
fig = stats.probplot(reg_stres.resid, plot=plt)
plt.show()
# fig 7.57: residual QQ plot after removing influential observations
# by Cook's distance, using the 4 / (n - p - 1) rule-of-thumb cutoff.
cook = infSummary["cooks_d"]
cd_ref = 4 / (len(reg.resid) - 2 - 1)
influential = np.where(cook > cd_ref)[0]
X_kept = np.delete(X_n0, influential, axis=0)
y_kept = np.delete(y_n, influential, axis=0)
reg_cd = sm.OLS(y_kept, X_kept).fit()
plt.figure(figsize=(3, 2))
fig = stats.probplot(reg_cd.resid, plot=plt)
plt.show()
# fig 7.58: residual QQ plot after removing influential observations
# identified by DFFITS.
dffi = infSummary["dffits"]
dff_ref = 2 * np.sqrt(3 / (len(reg.resid) - 2 + 1))
# Bug fix: DFFITS is a signed statistic, so influential points must be
# selected by absolute value. The original comparison `dffi > dff_ref`
# missed observations with large *negative* DFFITS.
dff_idx = np.where(np.abs(dffi) > dff_ref)[0]
ind_dff = np.delete(X_n0, dff_idx, axis=0)
de_dff = np.delete(y_n, dff_idx, axis=0)
reg_dff = sm.OLS(de_dff, ind_dff).fit()
plt.figure(figsize=(3, 2))
fig = stats.probplot(reg_dff.resid, plot=plt)
plt.show()
# Data used in fig 7.61 – fig 7.63: next-day SOX close regressed on five
# same-day market indicators (all standardized).
st = pd.Timestamp(2023, 1, 10)
et = pd.Timestamp(2024, 5, 30)
code = ["^DJI", "^IXIC", "^spx", "^VIX", "DX-Y.NYB", "^sox"]
nme = ["dj", "nd", "sp", "vix", "dollar", "sox"]

# Download each close series keyed by its short name.
da = {name: yf.download(tick, st, et)["Close"] for name, tick in zip(nme, code)}
da2 = pd.concat(da.values(), axis=1)
da2.columns = da.keys()
da2.index = pd.DatetimeIndex(da2.index.date)
da2 = da2.ffill()

# Predictors come from day t, the response (sox) from day t+1; `final`
# holds the most recent predictor row (for a next-day prediction).
ind = da2.values[:-1, :-1]
de = da2.values[1:, -1].reshape(-1, 1)
final = da2.values[-1, :-1].reshape(1, -1)

indScaler = StandardScaler().fit(ind)
deScaler = StandardScaler().fit(de)
indNor = indScaler.transform(ind)
finalNor = indScaler.transform(final)
deNor = deScaler.transform(de)

# Combined standardized frame for the formula API, plus a sklearn fit of
# the same model.
da3 = pd.DataFrame(np.c_[indNor, deNor.reshape(-1, 1)])
da3.columns = da2.columns
form = 'sox~dj+nd+sp+vix+dollar'
reg = ols(form, data=da3).fit()
mod = LinearRegression().fit(indNor, deNor)
# fig 7.61: observed standardized SOX returns vs the regression fit.
plt.figure(figsize=(4, 3))
plt.plot(deNor, c="g", ls="--", label="data")
plt.plot(mod.predict(indNor), c="b", label="regression")
plt.legend(loc="best", frameon=False)
plt.show()
# fig 7.62: normal probability plot (with r value) of the regression
# residuals from the fig 7.61 model.
err = reg.resid
plt.figure(figsize=(4, 3))
stats.probplot(err, plot=plt, rvalue=True)
plt.show()
# fig 7.63: stem plot of the standardized (Pearson) residuals with the
# |t| = 3 outlier cutoff.
fig = plt.figure(figsize=(4, 3))
plt.stem(reg.resid_pearson)
plt.axhline(3, c="g", ls="--")
plt.axhline(-3, c="g", ls="--")
# Fixed typo in the original title ("studented" -> "Studentized").
plt.title("Studentized Residuals")
plt.show()
Comments
Write a comment