Python sklearn 数据分析中常用方法
⼀、数据处理
随机划分训练集和测试集:
文本特征提取:
文本相关的特征抽取
文本特征向量化,其实就是将所有文本中出现的单词组成一个词典,这个词典可以作为一个向量;对每个样例统计词典中各单词出现的次数,从而每个样例都会形成一个向量。如上图。但只统计词频是不够的,因为常用语言中有些单词出现频率特别高,却没有什么实际意义,如“the、to”,因此我们需要降低这类单词的权重。TF-IDF 的思想是:一个词语在一篇文章中出现次数越多,同时在所有文档中出现次数越少,就越能够代表该文章。
from sklearn.model_selection import train_test_split
X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)  # 只包含特征集,不包含预测目标
y_all = data_train['Survived']  # 只包含预测目标
num_test = 0.20  # 测试集占据比例;如果是整数,则表示测试样本的数量
# 注意返回值:(X_train, y_train) 是训练集的特征和 label || (X_test, y_test) 是测试集的特征和 label
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
# random_state 参数表示随机种子:如果不填(None),每次划分的结果都不同;填入固定整数(包括 0)时,每次划分的结果相同。
1
2
3
4
5
6
7
8
9
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)  # sss 对象用于划分数据集
X = train[0::, 1::]  # X 为特征集
y = train[0::, 0]  # y 为 Label 集
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
1
2
3
4
5
6
7
8
9
10
sklearn.feature_extraction.text.CountVectorizer:将文本转换为每个词出现次数的向量
sklearn.feature_extraction.text.TfidfVectorizer:将文本转换为 tfidf 值的向量
sklearn.feature_extraction.text.HashingVectorizer:文本的特征哈希
1
2
3
# CountVectorizer
ar1 = '今天 今天 天气 不错 我们 愉快 玩耍'
ar2 = '今天 锻炼 舒服 天气 一般'
ar3 = '天气 糟糕'
text = [ar1, ar2, ar3]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
ct = CountVectorizer()
print(ct.fit_transform(text).todense())
print(ct.vocabulary_)
1
2
3
4
5
6
7
8
9
如果词库很大,生成的词向量维度会过大,此时可以使用 hash 方法对其进行降维;但 hash 之后的词向量就无法再解释每一维的意义。
⼆、模型选择
逻辑回归:
SVC支持向量机:
# TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(ct.fit_transform(text))
print(tfidf.todense())
1
2
3
4
5
6
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(text)
print(re.todense())
1
2
3
4
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer2 = HashingVectorizer(n_features=6, norm=None)
print(vectorizer2.fit_transform(text).todense())
1
2
3
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
1
2
3
4
5
6
7
8
9
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
1
2
3
4
5
# 查看特征系数
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)
1
2
3
4
5
6
K近邻学习KNN:
朴素贝叶斯分类器:
感知机:
随机梯度下降法:
决策树:
随机森林:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
1
2
3
4# # Linear SVC # linear_svc = LinearSVC()# linear_svc.fit(X_train, Y_train)# Y_pred = linear_svc.predict(X_test)# acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)# acc_linear_svc
1
2
3
4
5
6# knn = KNeighborsClassifier(n_neighbors = 3)# knn.fit(X_train, Y_train)# Y_pred = knn.predict(X_test)# acc_knn = round(knn.score(X_train, Y_train) * 100, 2)# acc_knn
1
2
3
4
5# gaussian = GaussianNB()# gaussian.fit(X_train, Y_train)# Y_pred = gaussian.predict(X_test)# acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)# acc_gaussian
1
2
3
4
5# perceptron = Perceptron()# perceptron.fit(X_train, Y_train)# Y_pred = perceptron.predict(X_test)# acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)# acc_perceptron
1
2
3
4
5
# sgd = SGDClassifier()
# sgd.fit(X_train, Y_train)
# Y_pred = sgd.predict(X_test)
# acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
# acc_sgd
1
2
3
4
5# # Decision Tree # decision_tree = DecisionTreeClassifier()# decision_tree.fit(X_train, Y_train)# Y_pred = decision_tree.predict(X_test)# acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)# acc_decision_tree
1
2
3
4
5
6
遍历模型⽅法:# # Random Forest # random_forest = RandomForestClassifier(n_estimators=100)# random_forest.fit(X_train, Y_train)# Y_pred = random_forest.predict(X_test)# random_forest.score(X_train, Y_train)# acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)# acc_random_forest 1
2
3
4
5
6
7
# 基于准确率搜索最佳参数的随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Choose the type of classifier.
clf = RandomForestClassifier()
# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }
# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)
# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_
# Fit the best algorithm to the data.
clf.fit(X_train, y_train)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]
log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)  # sss 对象用于划分数据集
X = train[0::, 1::]  # X 为特征集
y = train[0::, 0]  # y 为 Label 集
acc_dict = {}
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc
for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / 10.0  # 计算平均准确率
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    log = log.append(log_entry)
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")  # 画条形图分析