机器学习模型训练测试完整步骤

机器学习模型训练测试完整步骤
文章的目的:这篇文章主要讲述模型的建立以及测试的完整步骤,重点是各个步骤的关系以及作用。
文章的前提:这篇文章所有代码是用python编写,用knn算法对经典数据MNIST data(手写数字)建立模型。这只是用一个实例来说明,不用太在乎算法和数据本身。
文章结构:文章会解释每一步的作用,最后会贴出全部python代码,本文的数据下载地址:(原文此处的链接已缺失)
一、读取数据(原始数据)
读取数据可以看做数据收集,得到的是数据的原始状态。这里是csv文件,具体读取方式有很多,这里采用pandas的方法。
def opencsv(train_path='data/train.csv', test_path='data/test.csv'):
    """Load the training and test CSV files with pandas.

    The first column of the training file is returned as the labels and the
    remaining columns as the features; every column of the test file is
    returned as features.

    Args:
        train_path: Path of the labelled training CSV.
        test_path: Path of the unlabelled test CSV.

    Returns:
        A ``(train_x, train_y, result_x)`` tuple of NumPy arrays: training
        features, training labels, and the features to predict.
    """
    data = pd.read_csv(train_path)
    data1 = pd.read_csv(test_path)
    values = data.values
    train_x = values[:, 1:]  # every column after the first is a feature
    train_y = values[:, 0]  # the first column is the label
    result_x = data1.values  # the test file has no label column
    return train_x, train_y, result_x
二、数据预处理
数据预处理是对数据提前进行处理和修正。主要包括:特征提取、特征降维、特征空值处理、特征转换(one-hot)、特征归一化;目标值空值处理、目标值转换(one-hot)。
(其中常用的是:降维、空值处理、one-hot转换、归一化。)
(这里我的数据没有经过预处理,因为原始的数据是图片像素数据;具体每一种数据的处理在sklearn上都有相应的方法)
三、交叉验证数据划分
为了模型测试,先选择交叉验证方法,提前划分好数据。
(注意:交叉验证会改变数据顺序,若原始数据对你有用,可以先进行交叉验证,再进行降维等预处理,接下来会演示)
def data_pro(x, y, test_size=0.1, random_state=33):
    """Split features and labels into a training part and a hold-out part.

    Args:
        x: Feature matrix.
        y: Labels, one per row of ``x``.
        test_size: Forwarded to ``train_test_split``; the default 0.1 is the
            value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split``; the default 33
            is the value that used to be hard-coded.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test
四、模型建⽴及测试
用处理好的数据建立训练模型,对模型的评价有很多指标,常用的有:得分(预测正确的比例)、查准率、查全率、F1指数。
# Train: the classifier is built with its default settings
# (the original note says this means k=5 neighbours).
knnClf = KNeighborsClassifier()
labels_1d = ravel(y_train)  # flatten the labels before fitting
knnClf.fit(x_train, labels_1d)

# Predict on the hold-out split, then print the accuracy and the
# per-class classification report.
y_predict = knnClf.predict(x_test)
test_score = knnClf.score(x_test, y_test)
print("score on the testdata:", test_score)
# print("score on the traindata:",knnClf.score(x_train,y_train))
print(classification_report(y_test, y_predict))
五、预测的可能性计算
计算每个样本预测结果的概率大小(取各类别概率中的最大值)。
# Confidence of each prediction: predict_proba returns one row of class
# probabilities per sample, and the row maximum is kept for each sample.
probablity = knnClf.predict_proba(x_test)
list_pro = probablity.max(axis=1).tolist()
六、结果保存
将编号、原始结果、预测结果、预测概率保存为csv文件。
# Output: one row per hold-out sample holding a 1-based id, the true label,
# the predicted label and the confidence of the prediction.
index = np.arange(1, len(y_test) + 1).reshape(-1, 1)
result = pd.DataFrame(
    np.column_stack((
        index,
        np.array(y_test).reshape(-1, 1),
        np.array(y_predict).reshape(-1, 1),
        np.array(list_pro).reshape(-1, 1),
    )),
    columns=['ImageId', 'test_label', 'predict_lable', 'probablity'],
)
result.to_csv('result/knn_result.csv', index=False, header=True, encoding='gbk')
七、错误分析
错误本身就是一个很重要的东西,将错误分类保存起来。当需要对具体错误类型分析的时候,可以逐个分析错误。(比如当对‘4’分类错误比较多的时候,可以加大‘4’的权重,使得其充分训练)
# Error analysis: flag every row whose prediction differs from the true label.
mismatch = result['test_label'] != result['predict_lable']
diff_index = mismatch.tolist()
print(diff_index)
diff = result[mismatch]
diff_x = x_test_original[mismatch.to_numpy()]

# Look at each error. The loop runs over the misclassified rows only:
# iterating over len(diff_index) (all rows) would run past the end of
# ``diff`` and raise IndexError.
for i in range(len(diff)):
    wrong = diff.iloc[i]
    print("test label is :", wrong['test_label'], 'predict label is :', wrong['predict_lable'])
    img = diff_x[i].reshape(28, 28)
    image_show(img)

diff.to_csv('result/knn_result_diff.csv', index=False, header=True, encoding='gbk')
全部代码:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import ravel, savetxt
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
def image_show(img):
    """Display ``img`` with matplotlib.

    Args:
        img: Image data accepted by ``plt.imshow``; the caller in this file
            passes a sample reshaped to 28x28.
    """
    plt.imshow(img)
    plt.show()
def opencsv(train_path='data/train.csv', test_path='data/test.csv'):
    """Load the training and test CSV files with pandas.

    The first column of the training file is returned as the labels and the
    remaining columns as the features; every column of the test file is
    returned as features.

    Args:
        train_path: Path of the labelled training CSV.
        test_path: Path of the unlabelled test CSV.

    Returns:
        A ``(train_x, train_y, result_x)`` tuple of NumPy arrays: training
        features, training labels, and the features to predict.
    """
    data = pd.read_csv(train_path)
    data1 = pd.read_csv(test_path)
    values = data.values
    train_x = values[:, 1:]  # every column after the first is a feature
    train_y = values[:, 0]  # the first column is the label
    result_x = data1.values  # the test file has no label column
    return train_x, train_y, result_x
def data_pro(x, y, test_size=0.1, random_state=33):
    """Split features and labels into a training part and a hold-out part.

    Args:
        x: Feature matrix.
        y: Labels, one per row of ``x``.
        test_size: Forwarded to ``train_test_split``; the default 0.1 is the
            value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split``; the default 33
            is the value that used to be hard-coded.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test
def knnClassify(x_train, x_test, y_train, y_test, x_test_images=None):
    """Train a KNN classifier, evaluate it on the hold-out split and save reports.

    Prints the accuracy and the classification report, writes every
    prediction to ``result/knn_result.csv``, displays each misclassified
    sample as an image, and writes the misclassified rows to
    ``result/knn_result_diff.csv``.

    Args:
        x_train: Training features.
        x_test: Hold-out features to predict.
        y_train: Training labels.
        y_test: Hold-out labels.
        x_test_images: Samples used only for displaying misclassified rows,
            aligned row-for-row with ``x_test``; each row is reshaped to
            28x28. When omitted, the module-level ``x_test_original`` set by
            the ``__main__`` block is used, which is what this function
            previously relied on implicitly.
    """
    if x_test_images is None:
        # Backward-compatible fallback to the global the old code read directly.
        x_test_images = x_test_original
    print("start run knn.")

    # Train: the classifier is built with its default settings
    # (the original note says this means k=5 neighbours).
    knnClf = KNeighborsClassifier()
    knnClf.fit(x_train, ravel(y_train))

    # Predict and report.
    y_predict = knnClf.predict(x_test)
    print("score on the testdata:", knnClf.score(x_test, y_test))
    print(classification_report(y_test, y_predict))

    # Confidence of each prediction: the largest class probability per row.
    probablity = knnClf.predict_proba(x_test)
    list_pro = probablity.max(axis=1)

    # Output: 1-based id, true label, predicted label and confidence.
    image_ids = np.arange(1, x_test.shape[0] + 1)
    result = pd.DataFrame(
        np.column_stack((
            image_ids.reshape(-1, 1),
            np.array(y_test).reshape(-1, 1),
            np.array(y_predict).reshape(-1, 1),
            np.array(list_pro).reshape(-1, 1),
        )),
        columns=['ImageId', 'test_label', 'predict_lable', 'probablity'],
    )
    result.to_csv('result/knn_result.csv', index=False, header=True, encoding='gbk')

    # Error analysis: keep the rows whose prediction differs from the label.
    mismatch = result['test_label'] != result['predict_lable']
    print(mismatch.tolist())
    diff = result[mismatch]
    diff_x = np.asarray(x_test_images)[mismatch.to_numpy()]

    # Look at each error. Iterate over the misclassified rows only; looping
    # over the full-length mask would index past the end of ``diff``.
    for i in range(len(diff)):
        wrong = diff.iloc[i]
        print("test label is :", wrong['test_label'], 'predict label is :', wrong['predict_lable'])
        image_show(diff_x[i].reshape(28, 28))

    diff.to_csv('result/knn_result_diff.csv', index=False, header=True, encoding='gbk')
def svmClassify(train_x, train_y, test_x):
    """Train an RBF-kernel SVM, predict ``test_x`` and save the predictions.

    Prints the time spent fitting and the time spent predicting, writes the
    raw predictions to ``sklearn_svm_Result.csv`` and an ``ImageId``/``Label``
    table to ``sklearn_knn_Result2.csv``.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Features to predict.
    """
    t = time.time()
    svc = svm.SVC(kernel='rbf', C=10)
    svc.fit(train_x, train_y)
    h = time.time()
    print('time used:%f' % (h - t))
    test_y = svc.predict(test_x)
    k = time.time()
    print('time used:%f' % (k - h))
    savetxt('sklearn_svm_Result.csv', test_y, delimiter=',')

    # 1-based ids sized from the predictions themselves; the previous
    # hard-coded range(1, 28001) only worked for exactly 28000 test rows.
    image_ids = np.arange(1, len(test_y) + 1)
    result = pd.DataFrame(
        np.column_stack((
            image_ids.reshape(-1, 1),
            np.array(test_y).reshape(-1, 1),
        )),
        columns=['ImageId', 'Label'],
    )
    # NOTE(review): the file name says "knn" although this is the SVM path;
    # left unchanged in case something downstream reads this path.
    result.to_csv("sklearn_knn_Result2.csv", index=False, header=True, encoding='gbk')
if __name__ == "__main__":
    print("start.")
    # Raw data.
    train_x_original, train_y_original, result_x_original = opencsv()
    # Hold-out split, done before PCA so the untransformed pixel rows
    # (x_test_original) stay available; knnClassify reads that name to
    # display misclassified samples.
    x_train_original, x_test_original, y_train, y_test = data_pro(train_x_original, train_y_original)
    # Dimensionality reduction: fit PCA on the training split only, then
    # apply the same projection to the hold-out and prediction data.
    pca = PCA(n_components=0.8, whiten=True)
    train_x_pca = pca.fit_transform(x_train_original)
    x_test_pca = pca.transform(x_test_original)
    result_x_pca = pca.transform(result_x_original)
    # KNN
    knnClassify(train_x_pca, x_test_pca, y_train, y_test)
    # SVM alternative (disabled):
    # svmClassify(train_x_pca, y_train, result_x_pca)
    print("end.")

本文发布于:2024-09-25 22:20:21,感谢您对本站的认可!

本文链接:https://www.17tex.com/xueshu/326281.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:数据   测试   错误   模型   处理   训练
留言与评论(共有 0 条评论)
   
验证码:
Copyright ©2019-2024 Comsenz Inc.Powered by © 易纺专利技术学习网 豫ICP备2022007602号 豫公网安备41160202000603 站长QQ:729038198 关于我们 投诉建议