Python基于ID3思想的决策树
这是一个为判断海洋生物是否属于鱼类而构建的、基于ID3思想的决策树,供大家参考,具体内容如下:
# coding=utf-8
"""ID3-style decision tree built on a small "is it a fish?" sample data set.

Rows of a data set are lists whose last element is the class label and whose
other elements are discrete feature values.
"""
import operator
import time
from collections import Counter
from math import log


def createDataSet():
    """Return the sample data set and the names of its feature columns.

    Returns:
        A ``(dataSet, labels)`` tuple. Each row of ``dataSet`` is
        ``[feature_0, feature_1, class_label]`` and ``labels[i]`` is the name
        of feature column ``i``.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
        [0, 0, 'maybe'],
    ]
    labels = ['nosurfaceing', 'flippers']
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in ``dataSet``.

    The class label is taken from the last element of every row. An empty
    data set yields ``0.0``.
    """
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, minus that column.

    The input rows are not modified; every returned row is a new list.
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    Returns:
        The column index of the best feature, or ``-1`` when no feature
        yields a strictly positive information gain.
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        # Entropy after the split: weighted average over the partitions.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / total
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label in ``classList``.

    Used when a node cannot be split any further but its rows still carry
    different class labels. Ties are resolved in favour of the label that
    appears first in ``classList``.

    Raises:
        ValueError: if ``classList`` is empty.
    """
    classCount = Counter(classList)
    # Compare by count; the original compared the labels themselves.
    return max(classCount, key=classCount.get)


def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: non-empty list of rows; the last element of each row is the
            class label.
        labels: names of the feature columns, in column order. This list is
            not modified.

    Returns:
        A class label for a leaf, otherwise
        ``{feature_name: {feature_value: subtree, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # Every row has the same class: stop splitting.
        return classList[0]
    if len(dataSet[0]) == 1:
        # All features are used up: fall back to a majority vote.
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature separates the classes any further. Indexing with -1
        # would split on the class-label column, so vote instead.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    # Build a new list instead of deleting from the caller's ``labels``.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def main():
    """Build the tree for the sample data and print it with the elapsed time."""
    data, label = createDataSet()
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t1 = time.perf_counter()
    myTree = createTree(data, label)
    t2 = time.perf_counter()
    print(myTree)
    print('executefor', t2 - t1)


if __name__ == '__main__':
    main()
最后我们测试一下这个脚本即可。如果想把生成的决策树用图像画出来,只需要在脚本里面再定义一个plottree函数即可。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。