一、简介
使用决策树、逻辑回归(LogisticRegression,代码输出中标注为“线性回归”)、支持向量机等机器学习的方法进行股票价格预测。
二、获取数据的方法
打开大智慧的股票界面,右键->复制数据,然后粘贴到Excel中即可。
然后在指标窗格切换指标,再复制到Excel中即可。
三、知识点
1.classification_report
其中列表左边的一列为分类的标签名(label),
precision、recall、f1-score 三列分别为各个类别的精确率、召回率及 F1 值,右边 support 列为每个标签的出现次数。
avg / total 行为各列的均值(support 列为总和);在新版 sklearn 中,该行被 accuracy、macro avg、weighted avg 三行取代(见下文的运行结果)。
2.MinMaxScaler (归一化)
关于使用sklearn进行数据预处理,有归一化/标准化/正则化三种方法。
MinMaxScaler就是将属性缩放到一个指定的最小值和最大值(通常是0到1)之间。

别人的测试结果

通过以上数据可以看出,除归一化处理的效果不好之外,其他三个方式都能有效提升模型性能。
3.zip:将多个可迭代对象中位置对应的元素打包成元组,返回由这些元组组成的序列(Python 3 中返回的是迭代器,可用 list() 转为列表)。

四、结果



五、源代码
import pandas as pd
import time
# Load the dataset from the Excel workbook.
data = pd.read_excel('data.xlsx')
# Separate the target column from the feature columns, then split.
from sklearn.model_selection import train_test_split
y = data['label'].values  # the 'label' column is the prediction target
X = data.drop(columns=['label']).values  # every remaining column is a feature
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
split_shapes = (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print('X_train的数据结构:{0}; \nX_test的数据结构:{1}; \ny_train的数据结构:{2}; \ny_test的数据结构:{3};'.format(
    *split_shapes))
# Decision-tree baseline on the raw (unscaled) features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
clf2 = DecisionTreeClassifier()
fit_started = time.time()
clf2.fit(X_train, y_train)
train_score = clf2.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time() - fit_started))
# Evaluate on the held-out split and print the per-class metrics.
predict_started = time.time()
pred = clf2.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time() - predict_started))
print('模型预测能力性能报告:')
print(clf2)
test_report = classification_report(y_test, pred)
print(test_report)
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline on the raw (unscaled) features.
# The timer is started before the estimator is constructed, as in the original listing.
fit_started = time.time()
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)
train_score = clf1.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time() - fit_started))
# Evaluate on the held-out split and print the per-class metrics.
predict_started = time.time()
pred = clf1.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time() - predict_started))
print('模型预测能力性能报告:')
print(clf1)
test_report = classification_report(y_test, pred)
print(test_report)
# Support-vector-machine baseline on the raw (unscaled) features.
from sklearn.svm import SVC
clf3 = SVC()
t = time.time()
clf3.fit(X_train, y_train)
# Score the model that was just fitted (the original scored clf1 here, so the
# SVM's training score was really the logistic-regression score).
train_score = clf3.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
# Evaluate on the held-out split and print the per-class metrics.
t0 = time.time()
pred = clf3.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))
print('模型预测能力性能报告:')
print(clf3)
print(classification_report(y_test, pred))
# Feature scaling. The scaler learns min/max from the training split only and
# the same fitted scaler is applied to the test split; fitting a second scaler
# on the test data would put train and test on different scales.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Re-fit and re-evaluate every baseline model on the scaled features.
model_name = ['决策树', '线性回归', '向量机']
for clf, name in zip([clf2, clf1, clf3], model_name):
    t = time.time()  # start before fit so the reported time covers training
    clf.fit(X_train, y_train)
    # Score the model of this iteration (the original always scored clf1).
    train_score = clf.score(X_train, y_train)
    print('训练分数:{0}'.format(train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 = time.time()
    pred = clf.predict(X_test)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))
    print('模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test, pred))
    print("*"*100)
六、所有源代码

import pandas as pd
import numpy as np
import time
# Load the dataset from the Excel workbook.
data = pd.read_excel('data.xlsx')
# Separate the target column from the feature columns, then split.
from sklearn.model_selection import train_test_split
y = data['label'].values  # labels kept separately; .values yields a NumPy ndarray
X = data.drop(columns=['label']).values  # every remaining column is a feature
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
split_shapes = (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print('X_train的数据结构:{0}; \nX_test的数据结构:{1}; \ny_train的数据结构:{2}; \ny_test的数据结构:{3};'.format(
    *split_shapes))
# Decision-tree baseline on the raw (unscaled) features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
clf2 = DecisionTreeClassifier()
fit_started = time.time()
clf2.fit(X_train, y_train)
train_score = clf2.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time() - fit_started))
# Evaluate on the held-out split and print the per-class metrics.
predict_started = time.time()
pred = clf2.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time() - predict_started))
print('决策树模型预测能力性能报告:')
print(clf2)
test_report = classification_report(y_test, pred)
print(test_report)
# Logistic-regression baseline on the raw (unscaled) features.
# NOTE: the printed label below calls it "线性回归" (linear regression),
# but the estimator is LogisticRegression.
from sklearn.linear_model import LogisticRegression
fit_started = time.time()  # started before construction, as in the original listing
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)
train_score = clf1.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time() - fit_started))
# Evaluate on the held-out split and print the per-class metrics.
predict_started = time.time()
pred = clf1.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time() - predict_started))
print('线性回归模型预测能力性能报告:')
print(clf1)
test_report = classification_report(y_test, pred)
print(test_report)
# Support-vector-machine baseline on the raw (unscaled) features.
from sklearn.svm import SVC
clf3 = SVC()
t = time.time()
clf3.fit(X_train, y_train)
# Score the model that was just fitted (the original scored clf1 here, so the
# SVM's training score was really the logistic-regression score).
train_score = clf3.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
# Evaluate on the held-out split and print the per-class metrics.
t0 = time.time()
pred = clf3.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))
print('向量机模型预测能力性能报告:')
print(clf3)
print(classification_report(y_test, pred))
# Feature engineering starts here; its two main steps are data preprocessing
# followed by feature selection.
# Feature scaling. The scaler learns min/max from the training split only and
# the same fitted scaler is applied to the test split; fitting a second scaler
# on the test data would put train and test on different scales.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Re-fit and re-evaluate every baseline model on the scaled features.
model_name = ['决策树', '线性回归', '向量机']
for clf, name in zip([clf2, clf1, clf3], model_name):
    t = time.time()  # start before fit so the reported time covers training
    clf.fit(X_train, y_train)
    # Score the model of this iteration (the original always scored clf1).
    train_score = clf.score(X_train, y_train)
    print('模型名称:{0},训练分数:{1}'.format(name, train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 = time.time()
    pred = clf.predict(X_test)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))
    print(name+'模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test, pred))
    print("*"*100)
# Feature column names, used to report which features each selector keeps.
X_name = data.drop(columns=['label']).columns
print('所有的标签的名称是:{}'.format(X_name))
# Automatic selection with a fixed ratio: keep 50% of the features.
from sklearn.feature_selection import SelectPercentile
select1 = SelectPercentile(percentile=50)
select1.fit(X_train, y_train)
mask1 = select1.get_support()
percentile_features = X_name[mask1]
print('使用固定比例50%的自动选择模型自动选出的特征分别是{0},一共有{1}个'.format(percentile_features, len(percentile_features)))
# Iterative selection: recursive feature elimination driven by a random forest,
# keeping 21 features.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
ranking_forest = RandomForestClassifier(n_estimators=100, random_state=42)
select = RFE(ranking_forest, n_features_to_select=21)
select.fit(X_train, y_train)
mask = select.get_support()
rfe_features = X_name[mask]
print('使用迭代特征模型自动选出的特征分别是{0},一共有{1}个'.format(rfe_features, len(rfe_features)))
# Keep only the features chosen by BOTH selectors: element-wise AND of the
# RFE mask and the SelectPercentile mask, stored as a list of Python bools.
new_mask = [bool(by_rfe and by_percentile) for by_rfe, by_percentile in zip(mask, mask1)]
common_features = X_name[new_mask]
print('模型自动选出的特征分别是{0},一共有{1}个'.format(common_features, len(common_features)))
# Build the reduced train/test matrices for each feature-selection strategy:
# RFE (mask), SelectPercentile (mask1) and their intersection (new_mask).
X_train_mask, X_test_mask = select.transform(X_train), select.transform(X_test)
X_train_mask1, X_test_mask1 = select1.transform(X_train), select1.transform(X_test)
X_train_new_mask, X_test_new_mask = X_train[:, new_mask], X_test[:, new_mask]
# Report the shape of every reduced matrix.
for template, matrix in (
    ('X_train_mask的维度是{}', X_train_mask),
    ('X_test_mask的维度是{}', X_test_mask),
    ('X_train_mask1的维度是{}', X_train_mask1),
    ('X_test_mask1的维度是{}', X_test_mask1),
    ('X_train_new_mask的维度是{}', X_train_new_mask),
    ('X_test_new_mask的维度是{}', X_test_new_mask),
):
    print(template.format(matrix.shape))
# Evaluate every model on each of the three reduced feature sets.
def _fit_and_report(clf, name, features_train, features_test, report_header, separator=False):
    """Fit ``clf`` on one feature subset and print its scores and test report.

    Args:
        clf: estimator to (re-)fit; it is fitted in place.
        name: display name printed in front of ``report_header``.
        features_train: training feature matrix for this subset.
        features_test: test feature matrix for this subset.
        report_header: text printed after ``name`` as the report title.
        separator: when True, print a line of 50 dashes after the report.

    Uses the module-level ``y_train`` / ``y_test`` labels.
    """
    t = time.time()  # start before fit so the reported time covers training
    clf.fit(features_train, y_train)
    train_score = clf.score(features_train, y_train)
    print('训练分数:{0}'.format(train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 = time.time()
    pred = clf.predict(features_test)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))
    print(name+report_header)
    print(clf)
    print(classification_report(y_test, pred))
    if separator:
        print("-"*50)


# (train matrix, test matrix, report title, print dashed separator?)
# Only the SelectPercentile run printed a separator in the original listing;
# that is kept so the console output stays the same.
feature_sets = [
    (X_train_mask, X_test_mask, '使用X_train_mask,模型预测能力性能报告:', False),
    (X_train_mask1, X_test_mask1, '使用X_train_mask1,模型预测能力性能报告:', True),
    (X_train_new_mask, X_test_new_mask, '使用X_train_new_mask,模型预测能力性能报告:', False),
]
for features_train, features_test, report_header, separator in feature_sets:
    for clf, name in zip([clf2, clf1, clf3], model_name):
        _fit_and_report(clf, name, features_train, features_test, report_header, separator)
# Tune the support vector machine (the best-performing model so far) with a
# 5-fold cross-validated grid search on the intersected feature set.
from sklearn.model_selection import GridSearchCV
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
rbf_grid = {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values}
linear_grid = {'kernel': ['linear'], 'C': c_values}
param_grid = [rbf_grid, linear_grid]
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_new_mask, y_train)
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print('使用选出的macd,KDJ等指标,来尝试提高模型精度。最好的参数:{0}\n最好分数:{1}'.format(best_params, best_score))
# Grid-search the decision tree directly; many extended algorithms build on it.
from sklearn.model_selection import GridSearchCV
entropy_thresholds = np.linspace(0, 1, 100)
gini_thresholds = np.linspace(0, 0.2, 100)
# Four independent sub-grids: impurity threshold per criterion, tree depth,
# and minimum samples required to split.
param_grid = [
    dict(criterion=['entropy'], min_impurity_decrease=entropy_thresholds),
    dict(criterion=['gini'], min_impurity_decrease=gini_thresholds),
    dict(max_depth=np.arange(2, 10)),
    dict(min_samples_split=np.arange(2, 30, 2)),
]
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train_new_mask, y_train)
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print('使用决策树算法,最好的参数:{0}\n最好分数:{1}'.format(best_params, best_score))
# Random forest with 1000 trees on the intersected feature set.
rfc = RandomForestClassifier(n_estimators=1000, criterion='gini')
fit_started = time.time()
rfc.fit(X_train_new_mask, y_train)
train_score = rfc.score(X_train_new_mask, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time() - fit_started))
# Evaluate on the held-out split and print the per-class metrics.
predict_started = time.time()
pred = rfc.predict(X_test_new_mask)
print('预测结束,用时:{0}秒'.format(time.time() - predict_started))
print('使用随机森林模型模型预测能力性能报告:')
print(rfc)
test_report = classification_report(y_test, pred)
print(test_report)
# Combine several models with a hard-voting ensemble (mlxtend).
from mlxtend.classifier import EnsembleVoteClassifier
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(n_estimators=1000, criterion='gini')
clf3 = SVC(kernel='rbf', C=0.1, gamma=1, probability=True)
clf4 = DecisionTreeClassifier()
base_estimators = [clf1, clf2, clf3, clf4]
eclf1 = EnsembleVoteClassifier(clfs=base_estimators, voting='hard', verbose=1)
fit_started = time.time()
eclf1.fit(X_train_new_mask, y_train)
train_score = eclf1.score(X_train_new_mask, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time() - fit_started))
# Evaluate on the held-out split and print the per-class metrics.
predict_started = time.time()
pred = eclf1.predict(X_test_new_mask)
print('预测结束,用时:{0}秒'.format(time.time() - predict_started))
print('使用vote算法对模型再次进行优化后,模型预测能力性能报告:')
print(eclf1)
test_report = classification_report(y_test, pred)
print(test_report)
# Print the predicted class probabilities for the first 20 test rows only;
# suppress=True turns off scientific notation in the printed array.
np.set_printoptions(suppress=True)
first_rows = X_test_new_mask[:20]
print(eclf1.predict_proba(first_rows))
运行的结果:
I:\Ml\008zhihu>python 001.py
X_train的数据结构:(1944, 46);
X_test的数据结构:(834, 46);
y_train的数据结构:(1944,);
y_test的数据结构:(834,);
训练分数:1.0
训练共用时间:0.09360027313232422秒
预测结束,用时:0.0秒
决策树模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
precision recall f1-score support
0 0.66 0.67 0.66 419
1 0.66 0.64 0.65 415
accuracy 0.66 834
macro avg 0.66 0.66 0.66 834
weighted avg 0.66 0.66 0.66 834
训练分数:0.5339506172839507
训练共用时间:0.015600204467773438秒
预测结束,用时:0.0秒
线性回归模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
precision recall f1-score support
0 0.51 0.51 0.51 419
1 0.51 0.51 0.51 415
accuracy 0.51 834
macro avg 0.51 0.51 0.51 834
weighted avg 0.51 0.51 0.51 834
训练分数:0.5339506172839507
训练共用时间:0.32760071754455566秒
预测结束,用时:0.09360003471374512秒
向量机模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support
0 0.51 0.58 0.54 419
1 0.51 0.44 0.47 415
accuracy 0.51 834
macro avg 0.51 0.51 0.50 834
weighted avg 0.51 0.51 0.50 834
模型名称:决策树,训练分数:0.4876543209876543
训练共用时间:0.0秒
预测结束,用时:0.0秒
决策树模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
precision recall f1-score support
0 0.59 0.36 0.45 419
1 0.53 0.74 0.62 415
accuracy 0.55 834
macro avg 0.56 0.55 0.53 834
weighted avg 0.56 0.55 0.53 834
********************************************************************************
********************
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:939
: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regressio
n
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
模型名称:线性回归,训练分数:0.7680041152263375
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
precision recall f1-score support
0 0.80 0.70 0.75 419
1 0.73 0.83 0.78 415
accuracy 0.76 834
macro avg 0.77 0.77 0.76 834
weighted avg 0.77 0.76 0.76 834
********************************************************************************
********************
模型名称:向量机,训练分数:0.7680041152263375
训练共用时间:0.0秒
预测结束,用时:0.07800030708312988秒
向量机模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support
0 0.77 0.68 0.72 419
1 0.71 0.80 0.75 415
accuracy 0.74 834
macro avg 0.74 0.74 0.74 834
weighted avg 0.74 0.74 0.74 834
********************************************************************************
********************
所有的标签的名称是:Index(['成交额', '成交笔数', 'MA1', 'MA2', 'MA3', 'MA4', 'MA
5', 'MA6', 'MID', 'UPPER',
'LOWER', 'AR', 'BR', 'BIAS1', 'BIAS2', 'BIAS3', 'unknow1', 'CJBS', 'CR',
'MA1.1', 'MA2.1', 'MA3.1', 'PDI', 'MDI', 'ADX', 'ADXR', 'K', 'D', 'K.1',
'D.1', 'J', 'DIFF', 'DEA', 'MACD', 'unknow2', 'unknow3', 'RSI1', 'RSI2',
'RSI3', 'WR1', 'WR2', 'JCS', 'JCM', 'JCL', 'DDD', 'AMA'],
dtype='object')
使用固定比例50%的自动选择模型自动选出的特征分别是Index(['成交额', 'AR', 'BR', 'B
IAS1', 'BIAS2', 'BIAS3', 'unknow1', 'CR', 'PDI',
'MDI', 'K', 'D', 'K.1', 'D.1', 'J', 'DIFF', 'MACD', 'unknow3', 'RSI1',
'RSI2', 'RSI3', 'WR1', 'WR2'],
dtype='object'),一共有23个
使用迭代特征模型自动选出的特征分别是Index(['AR', 'BIAS1', 'BIAS2', 'BIAS3', 'unk
now1', 'CJBS', 'CR', 'PDI', 'MDI',
'ADX', 'K', 'D', 'K.1', 'D.1', 'J', 'MACD', 'RSI1', 'RSI2', 'WR1',
'WR2', 'JCL'],
dtype='object'),一共有21个
模型自动选出的特征分别是Index(['AR', 'BIAS1', 'BIAS2', 'BIAS3', 'unknow1', 'CR',
'PDI', 'MDI', 'K',
'D', 'K.1', 'D.1', 'J', 'MACD', 'RSI1', 'RSI2', 'WR1', 'WR2'],
dtype='object'),一共有18个
X_train_mask的维度是(1944, 21)
X_test_mask的维度是(834, 21)
X_train_mask1的维度是(1944, 23)
X_test_mask1的维度是(834, 23)
X_train_new_mask的维度是(1944, 18)
X_test_new_mask的维度是(834, 18)
训练分数:1.0
训练共用时间:0.015599966049194336秒
预测结束,用时:0.0秒
决策树使用X_train_mask,模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
precision recall f1-score support
0 0.61 0.47 0.53 419
1 0.57 0.70 0.63 415
accuracy 0.58 834
macro avg 0.59 0.58 0.58 834
weighted avg 0.59 0.58 0.58 834
训练分数:0.7602880658436214
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归使用X_train_mask,模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
precision recall f1-score support
0 0.80 0.70 0.75 419
1 0.73 0.83 0.78 415
accuracy 0.76 834
macro avg 0.77 0.76 0.76 834
weighted avg 0.77 0.76 0.76 834
训练分数:0.7834362139917695
训练共用时间:0.5616011619567871秒
预测结束,用时:0.23400020599365234秒
向量机使用X_train_mask,模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support
0 0.80 0.70 0.75 419
1 0.73 0.82 0.77 415
accuracy 0.76 834
macro avg 0.77 0.76 0.76 834
weighted avg 0.77 0.76 0.76 834
训练分数:1.0
训练共用时间:0.0秒
预测结束,用时:0.0秒
决策树使用X_train_mask1,模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
precision recall f1-score support
0 0.59 0.50 0.54 419
1 0.56 0.64 0.60 415
accuracy 0.57 834
macro avg 0.57 0.57 0.57 834
weighted avg 0.57 0.57 0.57 834
--------------------------------------------------
训练分数:0.7659465020576132
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归使用X_train_mask1,模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
precision recall f1-score support
0 0.80 0.72 0.76 419
1 0.74 0.82 0.78 415
accuracy 0.77 834
macro avg 0.77 0.77 0.77 834
weighted avg 0.77 0.77 0.77 834
--------------------------------------------------
训练分数:0.7870370370370371
训练共用时间:0.4836008548736572秒
预测结束,用时:0.23400044441223145秒
向量机使用X_train_mask1,模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support
0 0.80 0.72 0.76 419
1 0.75 0.82 0.78 415
accuracy 0.77 834
macro avg 0.77 0.77 0.77 834
weighted avg 0.77 0.77 0.77 834
--------------------------------------------------
训练分数:1.0
训练共用时间:0.015599966049194336秒
预测结束,用时:0.0秒
决策树使用X_train_new_mask,模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
precision recall f1-score support
0 0.59 0.49 0.54 419
1 0.56 0.66 0.61 415
accuracy 0.57 834
macro avg 0.58 0.57 0.57 834
weighted avg 0.58 0.57 0.57 834
训练分数:0.7587448559670782
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归使用X_train_new_mask,模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
precision recall f1-score support
0 0.80 0.70 0.75 419
1 0.73 0.83 0.78 415
accuracy 0.76 834
macro avg 0.77 0.76 0.76 834
weighted avg 0.77 0.76 0.76 834
训练分数:0.7818930041152263
训练共用时间:0.10920047760009766秒
预测结束,用时:0.04680013656616211秒
向量机使用X_train_new_mask,模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support
0 0.79 0.74 0.77 419
1 0.75 0.81 0.78 415
accuracy 0.77 834
macro avg 0.77 0.77 0.77 834
weighted avg 0.77 0.77 0.77 834
使用选出的macd,KDJ等指标,来尝试提高模型精度。最好的参数:{'C': 100, 'gamma': 1,
'kernel': 'rbf'}
最好分数:0.7844724776720643
使用决策树算法,最好的参数:{'criterion': 'gini', 'min_impurity_decrease': 0.0020
2020202020202}
最好分数:0.693422203376355
训练分数:1.0
训练共用时间:13.275623083114624秒
预测结束,用时:0.28080058097839355秒
使用随机森林模型模型预测能力性能报告:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=1000,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
precision recall f1-score support
0 0.87 0.26 0.41 419
1 0.56 0.96 0.71 415
accuracy 0.61 834
macro avg 0.72 0.61 0.56 834
weighted avg 0.72 0.61 0.56 834
Fitting 4 classifiers...
Fitting clf1: logisticregression (1/4)
Fitting clf2: randomforestclassifier (2/4)
Fitting clf3: svc (3/4)
Fitting clf4: decisiontreeclassifier (4/4)
训练分数:0.8955761316872428
训练共用时间:21.24723744392395秒
预测结束,用时:0.3120005130767822秒
使用vote算法对模型再次进行优化后,模型预测能力性能报告:
EnsembleVoteClassifier(clfs=[LogisticRegression(C=1.0, class_weight=None,
dual=False, fit_intercept=True,
intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None,
penalty='l2', random_state=None,
solver='lbfgs', tol=0.0001,
verbose=0, warm_start=False),
RandomForestClassifier(bootstrap=True,
ccp_alpha=0.0,
class_weight=None,
criterion='gini...
shrinking=True, tol=0.001, verbose=False),
DecisionTreeClassifier(ccp_alpha=0.0,
class_weight=None,
criterion='gini',
max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0
,
presort='deprecated',
random_state=None,
splitter='best')],
refit=True, verbose=1, voting='hard', weights=None)
precision recall f1-score support
0 0.79 0.73 0.76 419
1 0.75 0.80 0.77 415
accuracy 0.77 834
macro avg 0.77 0.77 0.77 834
weighted avg 0.77 0.77 0.77 834
[[0.29904357 0.70095643]
[0.38052022 0.61947978]
[0.04736433 0.95263567]
[0.70955564 0.29044436]
[0.79065533 0.20934467]
[0.11818586 0.88181414]
[0.12765114 0.87234886]
[0.29053959 0.70946041]
[0.11113169 0.88886831]
[0.7328489 0.2671511 ]
[0.58150786 0.41849214]
[0.38384207 0.61615793]
[0.35686211 0.64313789]
[0.25780496 0.74219504]
[0.28229374 0.71770626]
[0.56169932 0.43830068]
[0.37888313 0.62111687]
[0.40678589 0.59321411]
[0.18140309 0.81859691]
[0.34179083 0.65820917]]
参考:https://zhuanlan.zhihu.com/p/54853160