선형회귀 모델에 의한 주가 예측
sklearn.linear_model.LinearRegression() 클래스를 사용합니다.
import numpy as np
import pandas as pd
import pandas_ta as ta
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
# Download daily prices for ticker "000660" over the study window and keep
# only the OHLCV columns used by the rest of the notebook.
trgnme = "000660"
st, et = pd.Timestamp(2023, 1, 1), pd.Timestamp(2025, 5, 20)
trg = fdr.DataReader(trgnme, st, et)
df = trg[["Open", "High", "Low", "Close", "Volume"]]
# Show the most recent row as a quick sanity check of the download.
df.tail(1)
| Date | Open | High | Low | Close | Volume |
|---|---|---|---|---|---|
| 2025-05-20 | 202500 | 208000 | 201500 | 202000 | 2820546 |
# Extended feature table: OHLCV plus two EMAs (lengths 5 and 20) and the
# first three columns returned by pandas_ta's bbands().
# NOTE(review): the names bbl/bbm/bbu suggest lower/middle/upper bands --
# confirm against the column order of the installed pandas_ta version.
data = df.copy()
data["ema5"] = df.ta.ema(5)
data["ema20"] = df.ta.ema(20)
bands = df.ta.bbands()
data[['bbl', 'bbm', 'bbu']] = bands.iloc[:, :3]
# The indicators are undefined for their warm-up rows; drop those rows.
data.dropna(inplace=True)
data.head(1).round(0)
| Date | Open | High | Low | Close | Volume | ema5 | ema20 | bbl | bbm | bbu |
|---|---|---|---|---|---|---|---|---|---|---|
| 2023-01-31 | 90800 | 90800 | 86800 | 88500 | 5185088 | 89697 | 85675 | 88292 | 90880 | 93467 |
# Walk-through for a single target: every column except Close is a feature,
# and each feature row is paired with the Close of the following row.
ind = df.drop(columns="Close").values
de = df["Close"].values

# The last feature row has no following Close, so it is excluded from
# fitting and kept (scaled) as the input for the final prediction.
scalerX = StandardScaler()
X = scalerX.fit_transform(ind[:-1, :])
Xfinal = scalerX.transform(ind[-1:, :])

# Targets are shifted by one row relative to the features.
scalery = StandardScaler()
y = scalery.fit_transform(de[1:].reshape(-1, 1)).flatten()

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=7)
reg = LinearRegression().fit(Xtr, ytr)

# R^2 on the train and test partitions.
pd.DataFrame([reg.score(Xtr, ytr), reg.score(Xte, yte)], index=["train", "test"])
|  | 0 |
|---|---|
| train | 0.987542 |
| test | 0.986368 |
위 과정을 Open, High, Low, Close를 각각 추정하는 데 적용합니다. 이를 위해 위 과정을 함수 lrPredict()로 작성합니다. 또한 당일의 시가(Open)를 반영하여 예측값을 보정하기 위한 함수 consider_Op()를 작성합니다.
# 선형회귀에 의한 예측을 위한 함수
def lrPredict(data, deCol="Close", testSize=0.3, randomState=7):
    """Fit a linear regression mapping each row to the next row's ``deCol``.

    All columns of ``data`` (including ``deCol`` itself) are used as
    features. Feature row ``i`` is paired with the ``deCol`` value of row
    ``i + 1``; the last row therefore has no label and is used only as the
    input for the returned prediction. Features and target are standardised
    before fitting, and the prediction is converted back to original units.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table that contains the column ``deCol``.
        Assumes rows are ordered in time -- TODO confirm with callers.
    deCol : str
        Name of the column to predict.
    testSize : float
        Passed to ``train_test_split`` as ``test_size``.
    randomState : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(pre_o, R2)`` where ``pre_o`` is a one-element array holding the
        prediction made from the last row of ``data``, and ``R2`` is a
        one-column DataFrame (index ``"train"``/``"test"``) with the model
        scores rounded to four decimals.
    """
    features = data.values
    target = data[deCol].values

    # Hold out the last row: it has no "next row" label to learn from.
    x_scaler = StandardScaler()
    X = x_scaler.fit_transform(features[:-1, :])
    X_last = x_scaler.transform(features[-1:, :])

    # Shift the target by one row so that X[i] lines up with target[i + 1].
    y_scaler = StandardScaler()
    y = y_scaler.fit_transform(target[1:].reshape(-1, 1)).flatten()

    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=testSize, random_state=randomState
    )
    model = LinearRegression().fit(Xtr, ytr)

    # Undo the target standardisation so the forecast is in original units.
    scaled_forecast = model.predict(X_last).reshape(-1, 1)
    pre_o = y_scaler.inverse_transform(scaled_forecast).flatten()

    scores = [model.score(Xtr, ytr), model.score(Xte, yte)]
    R2 = pd.DataFrame(scores, index=["train", "test"]).round(4)
    return (pre_o, R2)
#당일 시가를 고려하여 위 함수의 결과를 수정하는 함수
def consider_Op(result, op):
    """Shift every forecast row so that its ``Open`` equals ``op``.

    For each row, the gap between the forecast ``Open`` and ``op`` is
    subtracted from all columns of that row. The differences between the
    columns within a row are unchanged; only the level moves.

    Parameters
    ----------
    result : pandas.DataFrame
        Forecasts, one row per forecast; must contain an ``"Open"`` column.
    op : scalar
        Observed opening value that every row is re-anchored to.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``result``. The
        input is not modified. An empty input yields an empty frame that
        keeps its columns.

    Raises
    ------
    KeyError
        If ``result`` has no ``"Open"`` column.
    """
    # Per-row offset between the forecast open and the observed open.
    open_op = result["Open"] - op
    # One vectorised row-wise subtraction (aligned on the index) instead of
    # growing a frame with pd.concat inside a loop and transposing it back.
    return result.sub(open_op, axis=0)
# Forecast the next row of each price column twice: once from the plain
# OHLCV table (df) and once from the indicator-augmented table (data).
R2 = pd.DataFrame()  # placeholder kept from the original; not filled here
result_lr1, result_lr2 = {}, {}
for i in ("Open", "High", "Low", "Close"):
    # lrPredict returns (prediction, R2 table); only the prediction is kept.
    result_lr1[i] = lrPredict(df, deCol=i)[0]
    result_lr2[i] = lrPredict(data, deCol=i)[0]

# Each dict holds one-element arrays, so each becomes a single-row frame.
result_lr1 = pd.DataFrame(result_lr1)
result_lr2 = pd.DataFrame(result_lr2)

# Stack the two rows and label them by the feature set that produced them.
result_lr = pd.concat([result_lr1, result_lr2])
result_lr.index = ['OHLC', 'addIndex']
result_lr.round(0)
|  | Open | High | Low | Close |
|---|---|---|---|---|
| OHLC | 201555.0 | 204977.0 | 198732.0 | 202090.0 |
| addIndex | 201750.0 | 205197.0 | 199246.0 | 202364.0 |
# Opening value to re-anchor the forecasts on.
op = 204500
# Shift both forecast rows so that their Open equals op, then display.
adjusted = consider_Op(result_lr, op)
adjusted.round(0)
|  | Open | High | Low | Close |
|---|---|---|---|---|
| OHLC | 204500.0 | 207922.0 | 201677.0 | 205035.0 |
| addIndex | 204500.0 | 207947.0 | 201996.0 | 205114.0 |
댓글
댓글 쓰기