Home >  > 机器学习预测股票(多种方法)

机器学习预测股票(多种方法)

0

一、简介

使用决策树, 线性回归, 向量机等机器学习的方法进行股票价格预测。

二、获取数据的方法
打开大智慧的股票界面,右键->复制数据,然后粘贴到Excel中即可。
然后在指标窗格切换指标,再复制到Excel中即可。

三、知识点
1.classification_report
其中列表左边的一列为分类的标签名(label),
precision recall f1-score三列分别为各个类别的精确度/召回率及 F1值.右边support列为每个标签的出现次数.
avg / total行为各列的均值(support列为总和)。

2.MinMaxScaler (归一化)
关于使用sklearn进行数据预处理,有归一化/标准化/正则化三种方法

MinMaxScaler就是将属性缩放到一个指定的最大和最小值(通常是1-0)之间。

别人的测试结果

通过以上数据可以看出,除归一化处理的效果不好之外,其他三个方式都能有效提升模型性能。

3.zip 打包为元组的列表

四、结果

五、源代码

import pandas as pd
import time 

# 加载数据集
data = pd.read_excel('data.xlsx')

# 拆分数据集
from sklearn.model_selection import train_test_split

y = data['label'].values #标签单独存给Y
X = data.drop(['label'],axis=1).values #其他数据都是X
# 拆分数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


print('X_train的数据结构:{0}; \nX_test的数据结构:{1}; \ny_train的数据结构:{2}; \ny_test的数据结构:{3};'.format(
X_train.shape, X_test.shape, y_train.shape, y_test.shape))


# 生成决策树模型的结果
from sklearn.tree import DecisionTreeClassifier

clf2 = DecisionTreeClassifier()
t =time.time()
clf2.fit(X_train, y_train)
train_score = clf2.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
from sklearn.metrics import classification_report
# 模型结果验证及各性能指标
t0 =time.time()
pred = clf2.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))
print('模型预测能力性能报告:')
print(clf2)
print(classification_report(y_test,pred ))


from sklearn.linear_model import LogisticRegression
# 生成线性回归模型的结果
t =time.time()
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)
train_score = clf1.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
t0 =time.time()
pred = clf1.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))

print('模型预测能力性能报告:')
print(clf1)
print(classification_report(y_test,pred ))



# 生成向量机模型的结果
from sklearn.svm import SVC
clf3 = SVC()
t =time.time()
clf3.fit(X_train, y_train)
train_score = clf1.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
t0 =time.time()
pred = clf3.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))

print('模型预测能力性能报告:')
print(clf3)
print(classification_report(y_test,pred ))



# 特征缩放
from sklearn.preprocessing import MinMaxScaler
X_train = MinMaxScaler().fit_transform(X_train)
X_test = MinMaxScaler().fit_transform(X_test)

# 用for循环,批量带入基本模型中进行验证
model_name = ['决策树', '线性回归', '向量机']
for clf, name in zip([clf2, clf1, clf3], model_name):
    clf.fit(X_train, y_train)
    t =time.time()
    train_score = clf1.score(X_train, y_train)
    print('训练分数:{0}'.format(train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 =time.time()
    pred = clf.predict(X_test)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))

    print('模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test,pred ))
    print("*"*100)

六、所有源代码

import pandas as pd
import numpy as np
import time 

# 加载数据集
data = pd.read_excel('data.xlsx')

# 拆分数据集
from sklearn.model_selection import train_test_split

y = data['label'].values #标签单独存给Y,打印出来是个Ndarray(更高级的列表)
X = data.drop(['label'],axis=1).values #其他数据都是X
# 拆分数据,30%作为test数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


print('X_train的数据结构:{0}; \nX_test的数据结构:{1}; \ny_train的数据结构:{2}; \ny_test的数据结构:{3};'.format(
X_train.shape, X_test.shape, y_train.shape, y_test.shape))


# 生成决策树模型的结果
from sklearn.tree import DecisionTreeClassifier

clf2 = DecisionTreeClassifier()
t =time.time()
clf2.fit(X_train, y_train)
train_score = clf2.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
from sklearn.metrics import classification_report
# 模型结果验证及各性能指标
t0 =time.time()
pred = clf2.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))
print('决策树模型预测能力性能报告:')
print(clf2)
print(classification_report(y_test,pred ))



# 生成线性回归模型的结果
from sklearn.linear_model import LogisticRegression

t =time.time()
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)
train_score = clf1.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
t0 =time.time()
pred = clf1.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))

print('线性回归模型预测能力性能报告:')
print(clf1)
print(classification_report(y_test,pred ))



# 生成向量机模型的结果
from sklearn.svm import SVC
clf3 = SVC()
t =time.time()
clf3.fit(X_train, y_train)
train_score = clf1.score(X_train, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))
t0 =time.time()
pred = clf3.predict(X_test)
print('预测结束,用时:{0}秒'.format(time.time()-t0))

print('向量机模型预测能力性能报告:')
print(clf3)
print(classification_report(y_test,pred ))


#开始进行特征工程,特征工程的主要流程包括:数据预处理 -> 特征选择 两个主要内容。
# 特征缩放
from sklearn.preprocessing import MinMaxScaler
X_train = MinMaxScaler().fit_transform(X_train)
X_test = MinMaxScaler().fit_transform(X_test)

# 用for循环,批量带入基本模型中进行验证
model_name = ['决策树', '线性回归', '向量机']
for clf, name in zip([clf2, clf1, clf3], model_name):
    clf.fit(X_train, y_train)
    t =time.time()
    train_score = clf1.score(X_train, y_train)
    print('模型名称:{0},训练分数:{1}'.format(name,train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 =time.time()
    pred = clf.predict(X_test)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))

    print(name+'模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test,pred ))
    print("*"*100)

X_name = data.drop(['label'],axis=1).columns
print('所有的标签的名称是:{}'.format(X_name))

# 使用固定比例50%的自动选择
from sklearn.feature_selection import SelectPercentile

select1 = SelectPercentile(percentile=50)
select1.fit(X_train, y_train)
mask1 = select1.get_support()
print('使用固定比例50%的自动选择模型自动选出的特征分别是{0},一共有{1}个'.format(X_name[mask1], len(X_name[mask1])))



# 使用迭代特征进行选择——自动选择的高级模式
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

select = RFE(RandomForestClassifier(n_estimators=100,random_state=42), n_features_to_select=21)
select.fit(X_train, y_train)

mask = select.get_support()
print('使用迭代特征模型自动选出的特征分别是{0},一共有{1}个'.format(X_name[mask], len(X_name[mask])))


# 挑选共有项的特征
new_mask = []
for ii in list(range(0,len(mask))):
    if mask[ii] == True and mask1[ii]== True:
        i = True
    else:
        i = False

    new_mask.append(i)

print('模型自动选出的特征分别是{0},一共有{1}个'.format(X_name[new_mask], len(X_name[new_mask])))


# 按照自动选出来的特征组合出数据集
X_train_mask = select.transform(X_train)
X_test_mask = select.transform(X_test)

X_train_mask1 = select1.transform(X_train)
X_test_mask1 = select1.transform(X_test)

X_train_new_mask = X_train[:,new_mask]
X_test_new_mask = X_test[:,new_mask]

print('X_train_mask的维度是{}'.format(X_train_mask.shape))
print('X_test_mask的维度是{}'.format(X_test_mask.shape))
print('X_train_mask1的维度是{}'.format(X_train_mask1.shape))
print('X_test_mask1的维度是{}'.format(X_test_mask1.shape))
print('X_train_new_mask的维度是{}'.format(X_train_new_mask.shape))
print('X_test_new_mask的维度是{}'.format(X_test_new_mask.shape))    


#评估数据集及模型
for clf, name in zip([clf2, clf1, clf3], model_name):
    clf.fit(X_train_mask, y_train)
    t =time.time()
    train_score = clf.score(X_train_mask, y_train)
    print('训练分数:{0}'.format(train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 =time.time()
    pred = clf.predict(X_test_mask)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))

    print(name+'使用X_train_mask,模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test,pred ))


for clf, name in zip([clf2, clf1, clf3], model_name):
    clf.fit(X_train_mask1, y_train)
    t =time.time()
    train_score = clf.score(X_train_mask1, y_train)
    print('训练分数:{0}'.format(train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 =t =time.time()
    pred = clf.predict(X_test_mask1)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))

    print(name+'使用X_train_mask1,模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test,pred ))
    print("-"*50)    


for clf, name in zip([clf2, clf1, clf3], model_name):
    clf.fit(X_train_new_mask, y_train)
    t =time.time()
    train_score = clf.score(X_train_new_mask, y_train)
    print('训练分数:{0}'.format(train_score))
    print('训练共用时间:{0}秒'.format(time.time()-t))
    t0 =t =time.time()
    pred = clf.predict(X_test_new_mask)
    print('预测结束,用时:{0}秒'.format(time.time()-t0))

    print(name+'使用X_train_new_mask,模型预测能力性能报告:')
    print(clf)
    print(classification_report(y_test,pred ))    


#优化性能最好的向量机
from sklearn.model_selection import GridSearchCV

param_grid = [{'kernel':['rbf'],
              'C':[0.001, 0.01,0.1,1,10,100,1000],
              'gamma':[0.001, 0.01,0.1,1,10,100,1000]},
             {'kernel':['linear'], 
             'C':[0.001,0.01,0.1,1,10,100,1000]}]

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_new_mask, y_train)
print('使用选出的macd,KDJ等指标,来尝试提高模型精度。最好的参数:{0}\n最好分数:{1}'.format(grid_search.best_params_, grid_search.best_score_))   


# 直接上决策树算法,这个算法有很多扩展算法,直接选用这个
from sklearn.model_selection import GridSearchCV
entropy_thresholds = np.linspace(0, 1, 100)
gini_thresholds = np.linspace(0, 0.2, 100)

param_grid = [{'criterion': ['entropy'], 'min_impurity_decrease': entropy_thresholds},
              {'criterion': ['gini'], 'min_impurity_decrease': gini_thresholds},
              {'max_depth': np.arange(2,10)},
              {'min_samples_split': np.arange(2,30,2)}]

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train_new_mask, y_train)
print('使用决策树算法,最好的参数:{0}\n最好分数:{1}'.format(grid_search.best_params_, grid_search.best_score_))

#使用随机森林模型
rfc = RandomForestClassifier(n_estimators=1000, criterion='gini') # 生成1000个决策树进行交叉
t =time.time()
rfc.fit(X_train_new_mask, y_train)
train_score = rfc.score(X_train_new_mask, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))

t0 =time.time()
pred = rfc.predict(X_test_new_mask)
print('预测结束,用时:{0}秒'.format(time.time()-t0))

print('使用随机森林模型模型预测能力性能报告:')
print(rfc)
print(classification_report(y_test,pred ))


# 使用vote算法对模型再次进行优化
from mlxtend.classifier import EnsembleVoteClassifier

clf1 = LogisticRegression()
clf2 = RandomForestClassifier(n_estimators=1000, criterion='gini')
clf3 = SVC(kernel='rbf',C=0.1,gamma=1,probability=True)
clf4 = DecisionTreeClassifier()

eclf1 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],voting='hard', verbose=1)
t =time.time()
eclf1.fit(X_train_new_mask, y_train)
train_score = eclf1.score(X_train_new_mask, y_train)
print('训练分数:{0}'.format(train_score))
print('训练共用时间:{0}秒'.format(time.time()-t))

t0 =time.time()
pred = eclf1.predict(X_test_new_mask)
print('预测结束,用时:{0}秒'.format(time.time()-t0))

print('使用vote算法对模型再次进行优化后,模型预测能力性能报告:')
print(eclf1)
print(classification_report(y_test,pred ))

# 将每个标的的预测概率进行输出——只输出前20个
np.set_printoptions(suppress=True)
print(eclf1.predict_proba(X_test_new_mask[:20]))

运行的结果:

I:\Ml\008zhihu>python 001.py
X_train的数据结构:(1944, 46);
X_test的数据结构:(834, 46);
y_train的数据结构:(1944,);
y_test的数据结构:(834,);
训练分数:1.0
训练共用时间:0.09360027313232422秒
预测结束,用时:0.0秒
决策树模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
              precision    recall  f1-score   support

           0       0.66      0.67      0.66       419
           1       0.66      0.64      0.65       415

    accuracy                           0.66       834
   macro avg       0.66      0.66      0.66       834
weighted avg       0.66      0.66      0.66       834

训练分数:0.5339506172839507
训练共用时间:0.015600204467773438秒
预测结束,用时:0.0秒
线性回归模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

           0       0.51      0.51      0.51       419
           1       0.51      0.51      0.51       415

    accuracy                           0.51       834
   macro avg       0.51      0.51      0.51       834
weighted avg       0.51      0.51      0.51       834

训练分数:0.5339506172839507
训练共用时间:0.32760071754455566秒
预测结束,用时:0.09360003471374512秒
向量机模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.51      0.58      0.54       419
           1       0.51      0.44      0.47       415

    accuracy                           0.51       834
   macro avg       0.51      0.51      0.50       834
weighted avg       0.51      0.51      0.50       834

模型名称:决策树,训练分数:0.4876543209876543
训练共用时间:0.0秒
预测结束,用时:0.0秒
决策树模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
              precision    recall  f1-score   support

           0       0.59      0.36      0.45       419
           1       0.53      0.74      0.62       415

    accuracy                           0.55       834
   macro avg       0.56      0.55      0.53       834
weighted avg       0.56      0.55      0.53       834

********************************************************************************
********************
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:939
: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regressio
n
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
模型名称:线性回归,训练分数:0.7680041152263375
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

           0       0.80      0.70      0.75       419
           1       0.73      0.83      0.78       415

    accuracy                           0.76       834
   macro avg       0.77      0.77      0.76       834
weighted avg       0.77      0.76      0.76       834

********************************************************************************
********************
模型名称:向量机,训练分数:0.7680041152263375
训练共用时间:0.0秒
预测结束,用时:0.07800030708312988秒
向量机模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.77      0.68      0.72       419
           1       0.71      0.80      0.75       415

    accuracy                           0.74       834
   macro avg       0.74      0.74      0.74       834
weighted avg       0.74      0.74      0.74       834

********************************************************************************
********************
所有的标签的名称是:Index(['成交额', '成交笔数', 'MA1', 'MA2', 'MA3', 'MA4', 'MA
5', 'MA6', 'MID', 'UPPER',
       'LOWER', 'AR', 'BR', 'BIAS1', 'BIAS2', 'BIAS3', 'unknow1', 'CJBS', 'CR',
       'MA1.1', 'MA2.1', 'MA3.1', 'PDI', 'MDI', 'ADX', 'ADXR', 'K', 'D', 'K.1',
       'D.1', 'J', 'DIFF', 'DEA', 'MACD', 'unknow2', 'unknow3', 'RSI1', 'RSI2',
       'RSI3', 'WR1', 'WR2', 'JCS', 'JCM', 'JCL', 'DDD', 'AMA'],
      dtype='object')
使用固定比例50%的自动选择模型自动选出的特征分别是Index(['成交额', 'AR', 'BR', 'B
IAS1', 'BIAS2', 'BIAS3', 'unknow1', 'CR', 'PDI',
       'MDI', 'K', 'D', 'K.1', 'D.1', 'J', 'DIFF', 'MACD', 'unknow3', 'RSI1',
       'RSI2', 'RSI3', 'WR1', 'WR2'],
      dtype='object'),一共有23个
使用迭代特征模型自动选出的特征分别是Index(['AR', 'BIAS1', 'BIAS2', 'BIAS3', 'unk
now1', 'CJBS', 'CR', 'PDI', 'MDI',
       'ADX', 'K', 'D', 'K.1', 'D.1', 'J', 'MACD', 'RSI1', 'RSI2', 'WR1',
       'WR2', 'JCL'],
      dtype='object'),一共有21个
模型自动选出的特征分别是Index(['AR', 'BIAS1', 'BIAS2', 'BIAS3', 'unknow1', 'CR',
 'PDI', 'MDI', 'K',
       'D', 'K.1', 'D.1', 'J', 'MACD', 'RSI1', 'RSI2', 'WR1', 'WR2'],
      dtype='object'),一共有18个
X_train_mask的维度是(1944, 21)
X_test_mask的维度是(834, 21)
X_train_mask1的维度是(1944, 23)
X_test_mask1的维度是(834, 23)
X_train_new_mask的维度是(1944, 18)
X_test_new_mask的维度是(834, 18)
训练分数:1.0
训练共用时间:0.015599966049194336秒
预测结束,用时:0.0秒
决策树使用X_train_mask,模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
              precision    recall  f1-score   support

           0       0.61      0.47      0.53       419
           1       0.57      0.70      0.63       415

    accuracy                           0.58       834
   macro avg       0.59      0.58      0.58       834
weighted avg       0.59      0.58      0.58       834

训练分数:0.7602880658436214
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归使用X_train_mask,模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

           0       0.80      0.70      0.75       419
           1       0.73      0.83      0.78       415

    accuracy                           0.76       834
   macro avg       0.77      0.76      0.76       834
weighted avg       0.77      0.76      0.76       834

训练分数:0.7834362139917695
训练共用时间:0.5616011619567871秒
预测结束,用时:0.23400020599365234秒
向量机使用X_train_mask,模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.80      0.70      0.75       419
           1       0.73      0.82      0.77       415

    accuracy                           0.76       834
   macro avg       0.77      0.76      0.76       834
weighted avg       0.77      0.76      0.76       834

训练分数:1.0
训练共用时间:0.0秒
预测结束,用时:0.0秒
决策树使用X_train_mask1,模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
              precision    recall  f1-score   support

           0       0.59      0.50      0.54       419
           1       0.56      0.64      0.60       415

    accuracy                           0.57       834
   macro avg       0.57      0.57      0.57       834
weighted avg       0.57      0.57      0.57       834

--------------------------------------------------
训练分数:0.7659465020576132
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归使用X_train_mask1,模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       419
           1       0.74      0.82      0.78       415

    accuracy                           0.77       834
   macro avg       0.77      0.77      0.77       834
weighted avg       0.77      0.77      0.77       834

--------------------------------------------------
训练分数:0.7870370370370371
训练共用时间:0.4836008548736572秒
预测结束,用时:0.23400044441223145秒
向量机使用X_train_mask1,模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       419
           1       0.75      0.82      0.78       415

    accuracy                           0.77       834
   macro avg       0.77      0.77      0.77       834
weighted avg       0.77      0.77      0.77       834

--------------------------------------------------
训练分数:1.0
训练共用时间:0.015599966049194336秒
预测结束,用时:0.0秒
决策树使用X_train_new_mask,模型预测能力性能报告:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
              precision    recall  f1-score   support

           0       0.59      0.49      0.54       419
           1       0.56      0.66      0.61       415

    accuracy                           0.57       834
   macro avg       0.58      0.57      0.57       834
weighted avg       0.58      0.57      0.57       834

训练分数:0.7587448559670782
训练共用时间:0.0秒
预测结束,用时:0.0秒
线性回归使用X_train_new_mask,模型预测能力性能报告:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

           0       0.80      0.70      0.75       419
           1       0.73      0.83      0.78       415

    accuracy                           0.76       834
   macro avg       0.77      0.76      0.76       834
weighted avg       0.77      0.76      0.76       834

训练分数:0.7818930041152263
训练共用时间:0.10920047760009766秒
预测结束,用时:0.04680013656616211秒
向量机使用X_train_new_mask,模型预测能力性能报告:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.79      0.74      0.77       419
           1       0.75      0.81      0.78       415

    accuracy                           0.77       834
   macro avg       0.77      0.77      0.77       834
weighted avg       0.77      0.77      0.77       834

使用选出的macd,KDJ等指标,来尝试提高模型精度。最好的参数:{'C': 100, 'gamma': 1,
 'kernel': 'rbf'}
最好分数:0.7844724776720643
使用决策树算法,最好的参数:{'criterion': 'gini', 'min_impurity_decrease': 0.0020
2020202020202}
最好分数:0.693422203376355
训练分数:1.0
训练共用时间:13.275623083114624秒
预测结束,用时:0.28080058097839355秒
使用随机森林模型模型预测能力性能报告:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.87      0.26      0.41       419
           1       0.56      0.96      0.71       415

    accuracy                           0.61       834
   macro avg       0.72      0.61      0.56       834
weighted avg       0.72      0.61      0.56       834

Fitting 4 classifiers...
Fitting clf1: logisticregression (1/4)
Fitting clf2: randomforestclassifier (2/4)
Fitting clf3: svc (3/4)
Fitting clf4: decisiontreeclassifier (4/4)
训练分数:0.8955761316872428
训练共用时间:21.24723744392395秒
预测结束,用时:0.3120005130767822秒
使用vote算法对模型再次进行优化后,模型预测能力性能报告:
EnsembleVoteClassifier(clfs=[LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,

                                                penalty='l2', random_state=None,

                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                             RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini...
                                 shrinking=True, tol=0.001, verbose=False),
                             DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0
,
                                                    presort='deprecated',
                                                    random_state=None,
                                                    splitter='best')],
                       refit=True, verbose=1, voting='hard', weights=None)
              precision    recall  f1-score   support

           0       0.79      0.73      0.76       419
           1       0.75      0.80      0.77       415

    accuracy                           0.77       834
   macro avg       0.77      0.77      0.77       834
weighted avg       0.77      0.77      0.77       834

[[0.29904357 0.70095643]
 [0.38052022 0.61947978]
 [0.04736433 0.95263567]
 [0.70955564 0.29044436]
 [0.79065533 0.20934467]
 [0.11818586 0.88181414]
 [0.12765114 0.87234886]
 [0.29053959 0.70946041]
 [0.11113169 0.88886831]
 [0.7328489  0.2671511 ]
 [0.58150786 0.41849214]
 [0.38384207 0.61615793]
 [0.35686211 0.64313789]
 [0.25780496 0.74219504]
 [0.28229374 0.71770626]
 [0.56169932 0.43830068]
 [0.37888313 0.62111687]
 [0.40678589 0.59321411]
 [0.18140309 0.81859691]
 [0.34179083 0.65820917]]



参考:https://zhuanlan.zhihu.com/p/54853160

本文暂无标签

发表评论

*

*