A Gini-index-based decision-tree feature selection algorithm (CART) and its Python implementation
The Gini index
Like information gain and the gain ratio, the Gini index is another impurity measure; it is the criterion used by CART decision trees. For a classification problem with K classes, where p_k is the probability that a sample belongs to class k, it is defined as:

    Gini(p) = \sum_{k=1}^{K} p_k (1 - p_k) = 1 - \sum_{k=1}^{K} p_k^2

For a binary classification problem, if the probability that a sample belongs to the positive class is p, the Gini index is:

    Gini(p) = 2p(1 - p)

For a given sample set D, the Gini index is defined as:

    Gini(D) = 1 - \sum_{k=1}^{K} (|C_k| / |D|)^2

where C_k is the subset of samples in D belonging to class k. If a feature A splits D into two subsets D_1 and D_2 according to whether A takes a certain value, then the Gini index of D under feature A is defined as:

    Gini(D, A) = (|D_1| / |D|) Gini(D_1) + (|D_2| / |D|) Gini(D_2)
The Gini index Gini(D) reflects the impurity (uncertainty) of the set D, much like entropy does. Gini(D, A) reflects the uncertainty of D after it has been partitioned by feature A. When a decision tree splits a node, it therefore selects the feature that minimizes the Gini index; note that this is the opposite direction from information gain, where the feature with the maximum value is chosen.
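As a quick worked example (the numbers are invented for illustration): suppose D contains 10 samples, 6 positive and 4 negative, and feature A splits D into D_1 (4 positive, 0 negative) and D_2 (2 positive, 4 negative). Then:

    Gini(D)   = 1 - (0.6^2 + 0.4^2) = 0.48
    Gini(D_1) = 1 - (1^2 + 0^2) = 0
    Gini(D_2) = 1 - ((1/3)^2 + (2/3)^2) = 4/9 ≈ 0.444
    Gini(D, A) = (4/10) · 0 + (6/10) · (4/9) ≈ 0.267

Splitting on A lowers the impurity from 0.48 to about 0.267; among all candidate features, CART picks the one with the smallest Gini(D, A).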
The figure below (omitted here) plots, for binary classification, the Gini index Gini(p), half the entropy (1/2)H(p), and the classification error rate, with the probability p on the x-axis and the loss on the y-axis. The Gini curve and the half-entropy curve lie very close to each other, and both approximate the classification error rate well.
Python implementation of the algorithm
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 17:36:57 2019
@author: Auser
"""
from numpy import *
import numpy as np
import pandas as pd
from math import log
import operator
# Compute the Gini index of a data set
def calcGini(dataSet_train):
    labelCounts = {}
    # count the number of samples in each class
    for featVec in dataSet_train:
        currentLabel = featVec[-1]  # the class label is the last column
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    Gini = 1.0
    # Gini = 1 - sum(p_k^2)
    for key in labelCounts:
        prob = float(labelCounts[key]) / len(dataSet_train)
        Gini -= prob * prob
    return Gini
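A minimal sanity check for calcGini, using made-up rows where the last column is the class label:

toy = [[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no'], [0, 'no']]
print(calcGini(toy))  # -> 0.48, i.e. 1 - (0.4**2 + 0.6**2)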
def split_data(dataSet_train, feature_index, value):
    '''
    Partition the data set on a discrete feature and drop the used feature column.
    feature_index: column index of the splitting feature, e.g. "age"
    value: the feature value defining this branch, e.g. "youth"
    '''
    data_split = []  # the partitioned subset
    for feature in dataSet_train:  # iterate over the training samples
        if feature[feature_index] == value:  # the sample falls into this branch
            reFeature = list(feature[:feature_index])  # columns before the used feature
            reFeature.extend(feature[feature_index + 1:])  # columns after it (the used feature is removed)
            data_split.append(reFeature)
    return data_split
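For instance, with hypothetical rows (last column is the label), splitting on column 0 at value 'youth' keeps the matching rows and drops that column:

rows = [['youth', 'no', 'no'], ['youth', 'yes', 'yes'], ['senior', 'no', 'no']]
print(split_data(rows, 0, 'youth'))  # -> [['no', 'no'], ['yes', 'yes']]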
'''
def split_countinue_data(dataSet_train, feature_index, value, direction):
    # Partition on a continuous feature at the threshold `value`:
    # direction 0 keeps samples with feature > value, direction 1 keeps feature <= value.
    data_split = []
    for feature in dataSet_train:
        if direction == 0:
            if feature[feature_index] > value:
                reFeature = list(feature[:feature_index])
                reFeature.extend(feature[feature_index + 1:])
                data_split.append(reFeature)
        else:
            if feature[feature_index] <= value:
                reFeature = list(feature[:feature_index])
                reFeature.extend(feature[feature_index + 1:])
                data_split.append(reFeature)
    return data_split
'''
# Choose the best feature to split on (discrete features only)
def choose_best_to_split(dataSet_train, featname):
    min_Gini_index = 10000  # initialise the minimum Gini index
    best_feature_index = -1
    feature = len(dataSet_train[0]) - 1  # number of features (the last column is the label)
    for i in range(feature):  # iterate over the features
        feature_list = [extend[i] for extend in dataSet_train]
        unique = set(feature_list)  # the distinct values of feature i
        Gini_index = 0  # weighted Gini index of this feature's partition (reset per feature)
        for value in unique:
            sub_data = split_data(dataSet_train, i, value)
            prob = len(sub_data) / float(len(dataSet_train))
            Gini_index += prob * calcGini(sub_data)
        if Gini_index < min_Gini_index:
            min_Gini_index = Gini_index
            best_feature_index = i
    return best_feature_index
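A small sketch of how the selection behaves, on invented data: 'temp' separates the labels perfectly (weighted Gini 0), so its column index is returned.

toy = [['sunny', 'hot', 'no'], ['sunny', 'cool', 'yes'],
       ['rain', 'hot', 'no'], ['rain', 'cool', 'yes']]
print(choose_best_to_split(toy, ['outlook', 'temp']))  # -> 1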
'''
# Variant of choose_best_to_split that also handles continuous features.
def choose_best_to_split(dataSet_train, featname):
    feature = len(dataSet_train[0]) - 1  # number of features
    min_Gini_index = 10000  # initialise the minimum Gini index
    best_feature_index = -1
    best_split_dict = {}
    for i in range(feature):  # iterate over the features
        feature_list = [extend[i] for extend in dataSet_train]
        # Continuous feature: candidate split points are the midpoints
        # between consecutive sorted values.
        if isinstance(feature_list[0], int) or isinstance(feature_list[0], float):
            sort_feature_list = sorted(feature_list)  # sort the feature column
            split_point_list = []
            for j in range(len(sort_feature_list) - 1):
                split_point_list.append((sort_feature_list[j] + sort_feature_list[j + 1]) / 2.0)
            for j in range(len(split_point_list)):  # iterate over the candidate split points
                # left subset: feature value > split point
                sub_data_0 = split_countinue_data(dataSet_train, i, split_point_list[j], 0)
                # right subset: feature value <= split point
                sub_data_1 = split_countinue_data(dataSet_train, i, split_point_list[j], 1)
                prob_0 = len(sub_data_0) / float(len(dataSet_train))
                prob_1 = len(sub_data_1) / float(len(dataSet_train))
                new_Gini_index = prob_0 * calcGini(sub_data_0) + prob_1 * calcGini(sub_data_1)
                if min_Gini_index > new_Gini_index:
                    min_Gini_index = new_Gini_index
                    best_feature_index = i
                    best_split_dict[featname[i]] = split_point_list[j]
        # Discrete feature
        else:
            unique = set(feature_list)
            new_gini_index = 0  # weighted Gini index of this feature's partition
            for value in unique:  # iterate over the feature's values
                sub_data = split_data(dataSet_train, i, value)
                prob = len(sub_data) / float(len(dataSet_train))
                new_gini_index += prob * calcGini(sub_data)
            if min_Gini_index > new_gini_index:
                min_Gini_index = new_gini_index
                best_feature_index = i
    # If the best feature is continuous, binarise it at the recorded split point:
    # values <= best_split_value become 1, the others become 0.
    if isinstance(dataSet_train[0][best_feature_index], int) or isinstance(dataSet_train[0][best_feature_index], float):
        best_split_value = best_split_dict[featname[best_feature_index]]
        featname[best_feature_index] = featname[best_feature_index] + '<=' + str(best_split_value)
        for i in range(np.shape(dataSet_train)[0]):  # iterate over the samples
            if dataSet_train[i][best_feature_index] <= best_split_value:  # value below the split point
                dataSet_train[i][best_feature_index] = 1
            else:
                dataSet_train[i][best_feature_index] = 0
    return best_feature_index
'''
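The midpoint candidates used by the commented continuous variant can be checked on their own (toy numbers, not from the post):

vals = sorted([2.0, 5.0, 3.0])  # -> [2.0, 3.0, 5.0]
points = [(vals[j] + vals[j + 1]) / 2.0 for j in range(len(vals) - 1)]
print(points)  # -> [2.5, 4.0]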
def most_occur_label(classList):
    # return the most frequent class label in classList
    label_count = {}  # key: class label, value: number of samples with that label
    for label in classList:  # iterate over the class list
        if label not in label_count.keys():  # first time we see this label
            label_count[label] = 0
        label_count[label] += 1  # count every occurrence
    # sort the dictionary entries by count, descending
    sorted_label_count = sorted(label_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_label_count[0][0]  # the label with the most samples
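For example:

print(most_occur_label(['yes', 'no', 'yes']))  # -> 'yes'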
def build_decesion_tree(dataSet_train, featnames):  # build the decision tree recursively
    '''
    The tree is a nested dict: each key stores a node's feature name,
    and the branches and leaf nodes are stored as its values.
    '''
    featname = featnames[:]  # copy the feature names so the caller's list is not modified
    classlist = [featvec[-1] for featvec in dataSet_train]  # class labels at this node
    if classlist.count(classlist[0]) == len(classlist):  # all samples belong to one class
        return classlist[0]
    if len(dataSet_train[0]) == 1:  # no features left to split on
        return most_occur_label(classlist)  # majority vote: the most frequent class becomes the leaf
    # choose the best feature to split on
    bestFeat = choose_best_to_split(dataSet_train, featname)  # index of the chosen feature
    bestFeatname = featname[bestFeat]
    del(featname[bestFeat])  # delete it so feature indices stay aligned with the reduced data
    DecisionTree = {bestFeatname: {}}
    # create the branches: first collect all values of the chosen feature
    allvalue = [vec[bestFeat] for vec in dataSet_train]  # the chosen feature's value in every sample
    specvalue = sorted(list(set(allvalue)))  # fix a deterministic branch order
    '''
    if isinstance(dataSet_train[0][bestFeat], str):  # when the feature values are strings
        currentfeature = featname.index(featname[bestFeat])
        feat_value = [example[currentfeature] for example in dataSet_train]
        unique = set(feat_value)
        del(featname[bestFeat])
        # build one subtree per value of bestFeat, then fill remaining values with the majority label
        for value in specvalue:
            copyfeatname = featname[:]
            DecisionTree[bestFeatname][value] = build_decesion_tree(split_data(dataSet_train, bestFeat, value), copyfeatname)
        for value in unique:
            DecisionTree[bestFeatname][value] = most_occur_label(classlist)
        return DecisionTree
    '''
    for v in specvalue:  # every observed value of the chosen feature
        copyfeatname = featname[:]  # copy the remaining feature names for the subtree
        # recurse: split_data() partitions the data set for this branch
        DecisionTree[bestFeatname][v] = build_decesion_tree(split_data(dataSet_train, bestFeat, v), copyfeatname)
    return DecisionTree
def classify(Tree, featnames, X):  # classify a test sample X with the trained tree
    global classLabel
    root = list(Tree.keys())[0]  # feature name stored at the root of this (sub)tree
    firstDict = Tree[root]  # the branches hanging off the root
    featindex = featnames.index(root)  # column index of the root feature
    for key in firstDict.keys():  # follow the branch whose value matches the sample
        if X[featindex] == key:
            if type(firstDict[key]) == type({}):  # the branch leads to a subtree
                classLabel = classify(firstDict[key], featnames, X)  # recurse into it
            else:
                classLabel = firstDict[key]  # the branch is a leaf: take its label
    return classLabel
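Putting the pieces together on a small invented data set (the feature names and values below are hypothetical, not from the original post):

dataSet = [
    ['youth',  'no',  'no'],
    ['youth',  'yes', 'yes'],
    ['middle', 'no',  'yes'],
    ['senior', 'no',  'no'],
    ['senior', 'yes', 'yes'],
]
featnames = ['age', 'has_job']
tree = build_decesion_tree(dataSet, featnames)
print(tree)
# -> {'has_job': {'no': {'age': {'middle': 'yes', 'senior': 'no', 'youth': 'no'}}, 'yes': 'yes'}}
print(classify(tree, featnames, ['youth', 'yes']))  # -> 'yes'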
'''
import re  # classify with a tree whose continuous features were binarised ('name<=value' node names)
def classify(Tree, featnames, X):
    classLabel = ''  # predicted class label
    firstStr = list(Tree.keys())[0]
    if '<=' in firstStr:
        # split the node name back into feature name and threshold
        featvalue = float(re.compile("(<=.+)").search(firstStr).group()[2:])
        featKey = re.compile("(.+<=)").search(firstStr).group()[:-2]
        secondDict = Tree[firstStr]
        featIndex = featnames.index(featKey)
        if X[featIndex] <= featvalue:
            judge = 1
        else:
            judge = 0
        for key in secondDict.keys():
            if judge == int(key):
                if isinstance(secondDict[key], dict):
                    classLabel = classify(secondDict[key], featnames, X)
                else:
                    classLabel = secondDict[key]
    else:
        secondDict = Tree[firstStr]
        featIndex = featnames.index(firstStr)
        for key in secondDict.keys():
            if X[featIndex] == key:
                if isinstance(secondDict[key], dict):
                    classLabel = classify(secondDict[key], featnames, X)
                else:
                    classLabel = secondDict[key]
    return classLabel
'''
# Count the number of leaf nodes
def getNumLeafs(Tree):
    numLeafs = 0
    firstStr = list(Tree.keys())[0]
    secondDict = Tree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # an internal node: recurse
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1  # a leaf
    return numLeafs
# Compute the depth of the tree
def getTreeDepth(Tree):
    maxDepth = 0
    firstStr = list(Tree.keys())[0]
    secondDict = Tree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # an internal node: recurse
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
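Applied to the hypothetical tree built in the example above:

print(getNumLeafs(tree))   # -> 4
print(getTreeDepth(tree))  # -> 2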
