python基于ID3思想的决策树
这是一个为判断海洋生物是否属于鱼类而构建的、基于ID3思想的决策树,供大家参考,具体内容如下:
#coding=utf-8
import operator
import time
from collections import Counter
from math import log
def createDataSet():
    """Build the small sample data set used to exercise the ID3 code.

    Returns:
        A ``(dataSet, labels)`` tuple. Every row of ``dataSet`` holds two
        0/1 feature values followed by the class label in the last
        position; ``labels`` names the two feature columns in order.
        New list objects are created on every call.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
        [0, 0, 'maybe'],
    ]
    labels = ['nosurfaceing', 'flippers']
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in a data set.

    Args:
        dataSet: Sequence of rows; the last element of each row is taken
            as that row's class label.

    Returns:
        The entropy as a float. An empty data set, or one in which every
        row carries the same label, yields 0.0.
    """
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    Args:
        dataSet: Sequence of rows (lists or tuples).
        axis: Index of the feature column to test. The slice arithmetic
            (``row[:axis]`` joined with ``row[axis + 1:]``) assumes a
            non-negative index.
        value: Feature value a row must have in column ``axis`` to be kept.

    Returns:
        A new list of new rows, each being the matching input row without
        column ``axis``. The input rows are not modified.
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split gives the largest information gain.

    For every feature column the data set is partitioned by the distinct
    values in that column, and the size-weighted entropy of the partitions
    is subtracted from the entropy of the whole set.

    Args:
        dataSet: Non-empty list of rows; the last element of each row is
            the class label, all preceding elements are feature values.

    Returns:
        The index of the feature with the highest information gain, or -1
        when no feature has a strictly positive gain (which includes the
        case of a data set with no feature columns).

    Raises:
        IndexError: If ``dataSet`` is empty.
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    numEntries = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / numEntries
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        # Strict comparison: on ties the earliest feature wins.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``.

    The tree is built recursively by consuming one feature per level, so
    the features can run out before the classes are fully separated; the
    class of such a node is then decided by majority vote.

    Args:
        classList: Non-empty sequence of hashable class labels.

    Returns:
        The most frequent label. When several labels share the highest
        count, the one that appears first in ``classList`` is returned.

    Raises:
        ValueError: If ``classList`` is empty.
    """
    classCount = Counter(classList)
    # Compare by vote count; max() over the bare dict would compare the
    # label values themselves and ignore how often each one occurs.
    return max(classCount, key=classCount.get)
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        dataSet: Non-empty list of rows; the last element of each row is
            the class label, all preceding elements are feature values.
        labels: Feature names, where ``labels[i]`` names column ``i`` of
            ``dataSet``. The sequence is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{featureLabel: {featureValue: subtree, ...}}``.

    Raises:
        IndexError: If ``dataSet`` is empty.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # Every row has the same class: stop splitting.
        return classList[0]
    if len(dataSet[0]) == 1:
        # All features have been used up: decide by majority vote.
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No remaining feature separates the classes. Indexing with -1
        # would split on the class-label column, so vote instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build a new label list rather than deleting from the caller's one.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    uniqueVals = {example[bestFeat] for example in dataSet}
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def main():
    """Build the tree for the sample data set and print it with the elapsed time."""
    data, label = createDataSet()
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # standard replacement for measuring short intervals.
    t1 = time.perf_counter()
    myTree = createTree(data, label)
    t2 = time.perf_counter()
    print(myTree)
    print('execute for', t2 - t1)
# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
最后我们测试一下这个脚本即可。如果想把生成的决策树用图像画出来,只需要在脚本里面再定义一个plottree函数即可。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。