다음 그래프들은 전자책 파이썬과 함께하는 통계이야기 6 장과 7장에 수록된 그림들의 코드들입니다.
import numpy as np import pandas as pd from scipy import stats from sklearn.preprocessing import StandardScaler import FinanceDataReader as fdr import yfinance as yf import statsmodels.api as sm from statsmodels.formula.api import ols from sklearn.linear_model import LinearRegression import matplotlib.pyplot as plt import seaborn as sns sns.set_style("darkgrid")
#fig 611 st=pd.Timestamp(2024,1, 1) et=pd.Timestamp(2024, 5, 30) code=["^KS11", "^KQ11", "^DJI", "KRW=X"] nme=['kos','kq','dj','WonDol'] da=pd.DataFrame() for i in code: x=yf.download(i,st, et)['Close'] x1=x.pct_change() da=pd.concat([da, x1], axis=1) da.columns=nme da1=da.dropna() da1.index=range(len(da1)) da2=pd.melt(da1, value_vars=['kos', 'kq', 'dj', 'WonDol'], var_name="idx", value_name="val") model=ols("val~C(idx)", data=da2).fit() res=model.resid plt.figure(figsize=(3,2)) varQQplot1=stats.probplot(res, plot=plt) plt.show()
#fig 612, fig 611 데이터에서 이상치 제외한 모델의 잔차에 대한 QQ plot from statsmodels.stats.outliers_influence import outlier_test outl=outlier_test(model) out2=np.where(outl.iloc[:,2]<1)[0] da3=da2.drop(out2) model1=ols("val~C(idx)", data=da3).fit() res1=model1.resid plt.figure(figsize=(3,2)) varQQplot1=stats.probplot(res1, plot=plt) plt.show()
#fig 711 h=np.linspace(0, 5) f=0.1*9.8*h plt.figure(figsize=(4,3)) plt.plot(h, f, color="g", label="F=mgh\nm:0.1 kg") plt.xlabel("h(m)") plt.ylabel("F(N)") plt.legend(loc="best") plt.show()
#fig 712 np.random.seed(3) x=np.linspace(-1, 5, 100) y=0.3*x+np.random.rand(100) y1=0.56+0.4*x y2=0.45+0.32*x y3=0.2+0.44*x y4=0.7+0.2*x col=["brown",'b','r','k'] plt.figure(figsize=(4,3)) plt.scatter(x, y, color="g", s=20) for i, j in enumerate([y1, y2, y3, y4]): plt.plot(x, j, color=col[i]) plt.xlabel("x") plt.ylabel("y") plt.show()
#fig 721 st=pd.Timestamp(2021,1, 1) et=pd.Timestamp(2024, 5, 10) kos=fdr.DataReader('KS11',st, et)[["Open","Close"]] kos.index=range(len(kos)) X=kos.values[:,0].reshape(-1,1) y=kos.values[:,1].reshape(-1,1) #독립변수 정규화(표준화) xScaler=StandardScaler().fit(X) X_n=xScaler.transform(X) #반응변수 정규화(표준화) yScaler=StandardScaler().fit(y) y_n=yScaler.transform(y) plt.figure(figsize=(4,2)) plt.scatter(X_n, y_n, label="Data") plt.plot(X_n, 0.998*X_n, color="red", label="Regression line") plt.legend(loc="best", frameon=False) plt.xlabel("Open", weight="bold") plt.ylabel('Close', weight="bold") plt.show()
#fig 731 x=np.linspace(-2, 2, 100) y=x**2 x1=np.linspace(-2, 0, 50) y1=-2*x1-1 x2=np.linspace(0, 2, 50) y2=2*x2-1 plt.figure(figsize=(4,3)) plt.plot(x, y, color="g", label=r"SSE=$(y_i-bx_i)^2$") plt.plot(x1, y1, color="brown", ls="--", label=r"$\frac{d\;SSE}{dx}>0$" ) plt.plot(x2, y2, color="b", ls="--", label=r"$\frac{d\;SSE}{dx}<0$" ) plt.hlines(0, -2, 2, ls="--", color="k", label=r"$\frac{d\;SSE}{dx}=0$") plt.xlabel("x") plt.ylabel("SSE") plt.legend(loc="best",prop={'size':10}, labelcolor='linecolor', frameon=False) plt.show()
#fig 751 x=np.linspace(-1, 2, 100) y=x+0.5 plt.figure(figsize=(4,3)) plt.plot(x, y, color="g", label="regression") plt.hlines(1.7, -1, 2, color="k", ls="--", label="mean line") plt.scatter(0.25, 1.7, s=20, color="k", label=r"$\bar{y}$") plt.scatter(0.25, 0.75, s=20, color="r", label=r"$\hat{y}$") plt.scatter(0.25, 0, s=20, color="b", label=r"$y_{obs}$") plt.vlines(0.25, 0.75, 1.7, color="r", ls="--") plt.vlines(0.25, 0, 0.75, color="b", ls="--") plt.vlines(0.18, 0, 1.7, color="k", ls="--", alpha=0.7) plt.text(0.3, 1.2, r"SSReg=$(\bar{y}-\hat{y})^2$", color="r", weight="bold", size=8) plt.text(0.3, 0.25, r"SSE=$(y_{obs}-\hat{y})^2$", color="b", weight="bold", size=8) plt.text(-0.9, 0.7, r"SST=$(y_{obs}-\bar{y})^2$", color="k", weight="bold", size=8) plt.xlabel("x") plt.ylabel("y") plt.xlim(-1.1, 2.5) plt.legend(loc="lower right", prop={'size':8}, frameon=False) plt.show()
#fig 752 ~ fig 758에 사용되는 데이터 st=pd.Timestamp(2021,1, 1) et=pd.Timestamp(2024, 5, 10) kos=fdr.DataReader('KS11',st, et)[["Open","Close"]] kos.index=range(len(kos)) X=kos.values[:,0].reshape(-1,1) y=kos.values[:,1].reshape(-1,1) #독립변수 정규화(표준화) xScaler=StandardScaler().fit(X) X_n=xScaler.transform(X) #반응변수 정규화(표준화) yScaler=StandardScaler().fit(y) y_n=yScaler.transform(y)
#fig 752 mod = LinearRegression() mod.fit(X_n, y_n) pre=mod.predict(X_n) err=pre-y_n plt.figure(figsize=(3,2)) errorRe=stats.probplot(err.ravel(), plot=plt) plt.show()
#fig753 X_n0=sm.add_constant(X_n) reg=sm.OLS(y_n, X_n0).fit() influence=reg.get_influence() infSummary=influence.summary_frame() hat=infSummary["hat_diag"] plt.figure(figsize=(6, 2)) plt.stem(hat) plt.axhline(np.mean(hat), c='g', ls="--", label=r"$\mu_{lv}$") plt.axhline(np.mean(hat)*2, c='brown', ls="--", label=r"$2\mu_{lv}$") plt.xlabel('Data index') plt.ylabel("Leverage (lv)") plt.legend(loc="best", labelcolor="linecolor") plt.title("leverage") plt.show()
#fig754 out1id=np.where(hat>2*hat.mean())[0] X_n0Hat=np.delete(X_n0, out1id, axis=0) y_nHat=np.delete(y_n, out1id, axis=0) reg_hat=sm.OLS(y_nHat, X_n0Hat).fit() plt.figure(figsize=(6, 2)) plt.subplots_adjust(wspace=0.5) plt.subplot(1,2,1) stats.probplot(reg.resid, plot=plt) plt.title("a) Probability plot") plt.subplot(1,2,2) stats.probplot(reg_hat.resid, plot=plt) plt.title("b) Probability plot") plt.show()
res_std=infSummary["student_resid"] plt.figure(figsize=(6, 3)) plt.subplots_adjust(hspace=0.6) plt.subplot(2,1,1) plt.stem(reg.resid) plt.ylabel("Resid.") plt.title("a) Residual") plt.subplot(2,1,2) plt.stem(res_std) plt.axhline(3, c='g', ls='--') plt.axhline(-3, c='g', ls='--') plt.xlabel("Data index") plt.ylabel("Student Resid.") plt.yticks([-3, 0, 3]) plt.title("b) Studenized Residual") plt.show()
#fig 756 out_stres=np.where(np.abs(res_std)>3)[0] ind_stres=np.delete(X_n0, out_stres, axis=0) de_stres=np.delete(y_n, out_stres, axis=0) reg_stres=sm.OLS(de_stres, ind_stres).fit() plt.figure(figsize=(3,2)) fig=stats.probplot(reg_stres.resid, plot=plt) plt.show()
#fig757 cook=infSummary["cooks_d"] cd_ref=4/(len(reg.resid)-2-1) cd_idx=np.where(cook>cd_ref)[0] ind_cd=np.delete(X_n0, cd_idx, axis=0) de_cd=np.delete(y_n, cd_idx, axis=0) reg_cd=sm.OLS(de_cd, ind_cd).fit() plt.figure(figsize=(3,2)) fig=stats.probplot(reg_cd.resid, plot=plt) plt.show()
#fig758 dffi=infSummary["dffits"] dff_ref=2*np.sqrt(3/(len(reg.resid)-2+1)) dff_idx=np.where(dffi>dff_ref)[0] ind_dff=np.delete(X_n0, dff_idx, axis=0) de_dff=np.delete(y_n, dff_idx, axis=0) reg_dff=sm.OLS(de_dff, ind_dff).fit() plt.figure(figsize=(3,2)) fig=stats.probplot(reg_dff.resid, plot=plt) plt.show()
#fig 761 ~ fig 763에 사용된 데이터 st=pd.Timestamp(2023,1, 10) et=pd.Timestamp(2024, 5, 30) code=["^KS11", "^KQ11", "122630.KS", "114800.KS","KRW=X","005930.KS"] nme=["kos","kq","kl", "ki", "WonDol","sam" ] da=pd.DataFrame() for i, j in zip(nme,code): x=yf.download(j,st, et)["Close"] da=pd.concat([da, x], axis=1 ) da.columns=nme da.index=pd.DatetimeIndex(da.index.date) da2=da.ffill() ind=da2.values[:-1,:-1] de=da2.values[1:,-1].reshape(-1,1) final=da2.values[-1, :-1].reshape(1,-1) indScaler=StandardScaler().fit(ind) deScaler=StandardScaler().fit(de) indNor=indScaler.transform(ind) finalNor=indScaler.transform(final) deNor=deScaler.transform(de) da3=pd.DataFrame(np.c_[indNor, deNor.reshape(-1,1)]) da3.columns=da2.columns form='sam~kos+kq+kl+ki+WonDol' reg=ols(form, data=da3).fit() re=reg.summary() mod=LinearRegression().fit(indNor, deNor)
#fig 761 plt.figure(figsize=(4, 3)) plt.plot(deNor, c="g", ls="--", label="data") plt.plot(mod.predict(indNor), c="b", label="regression") plt.xlabel("Data index") plt.ylabel("Standardized response variables") plt.legend(loc="best", frameon=False) plt.show()
#fig762 err=reg.resid plt.figure(figsize=(4,3)) stats.probplot(err, plot=plt, rvalue=True) plt.show()
#fig763 fig=plt.figure(figsize=(4, 3)) plt.stem(reg.resid_pearson) plt.axhline(3, c="g", ls="--") plt.axhline(-3, c="g", ls="--") plt.xlabel("Data index") plt.ylabel("Student Resid.") plt.yticks([-3, 0, 3, 6]) plt.title("student Resid.") plt.show()
댓글
댓글 쓰기