数据集

训练集

RID,age,income,student,credit_rating,class_buys_computer
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes
6,senior,low,yes,excellent,no
7,middle_aged,low,yes,excellent,yes
8,youth,medium,no,fair,no
9,youth,low,yes,fair,yes
10,senior,medium,yes,fair,yes
11,youth,medium,yes,excellent,yes
12,middle_aged,medium,no,excellent,yes
13,middle_aged,high,yes,fair,yes
14,senior,medium,no,excellent,no
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16

测试集

RID,age,income,student,credit_rating,class_buys_computer
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes
6,senior,low,yes,excellent,no
7,middle_aged,low,yes,excellent,yes
8,youth,medium,no,fair,no
9,youth,low,yes,fair,yes
10,senior,medium,yes,fair,yes
11,youth,medium,yes,excellent,yes
12,middle_aged,medium,no,excellent,yes
13,youth,medium,no,excellent,yes
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

 代码

# coding=utf-8
# NOTE: an encoding declaration is only honoured on the first or second line
# of the actual script file.
"""Train a decision tree on AllElectronics.csv and predict test_set.csv.

Both CSV files share the layout
``RID, age, income, student, credit_rating, class_buys_computer``:
the first column is a row id (ignored) and the last column is the class label.
"""
from sklearn.feature_extraction import DictVectorizer
import csv
import os
from sklearn import preprocessing
from sklearn import tree
# NOTE(review): StringIO is not used by this script, and newer scikit-learn
# releases no longer ship sklearn.externals.six -- confirm before upgrading.
from sklearn.externals.six import StringIO


def _read_samples(reader, headers):
    """Split the remaining CSV rows into feature dicts and class labels.

    Args:
        reader: a ``csv.reader`` already positioned past the header row.
        headers: the header row; ``headers[i]`` names column ``i``.

    Returns:
        ``(features, labels)`` where ``features`` holds one
        ``{column name: value}`` dict per row (first and last columns
        excluded) and ``labels`` holds each row's last column.
    """
    features = []
    labels = []
    for row in reader:
        # The last column is the class label (yes / no).
        labels.append(row[-1])
        # Start at column 1 so the RID column never becomes a feature.
        features.append(
            {headers[i]: value for i, value in enumerate(row[1:-1], 1)}
        )
    return features, labels


# --- Data loading ---
# 'rb' is the mode the Python 2 csv module expects; under Python 3 the file
# must instead be opened in text mode with newline=''.
with open(r'./AllElectronics.csv', 'rb') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    # The first row carries the column names.
    headers = next(reader)
    print("headers : ", headers)
    featureList, labelList = _read_samples(reader, headers)
# print labelList
# print featureList

# --- Pre-processing ---
# scikit-learn only accepts numeric input, so every categorical value is
# one-hot encoded, e.g. for the age column:
#   age:    youth  middle_aged  senior
#   vector:   1        0          0
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX : " + str(dummyX))
print("feature mapping : " + str(vec.get_feature_names()))

# Binarize the yes/no class labels.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
# print("dummyY : " + str(dummyY))

# --- Training ---
# criterion='entropy' makes the tree choose splits by information entropy.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))

# Dump the fitted tree in Graphviz dot format.
with open('DTreeData.dot', 'w') as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

# Render the dot file to PDF with the Graphviz `dot` tool.
# NOTE(review): the absolute paths are specific to the author's machine and
# only match the file written above when the script runs from that directory.
# A raw string keeps the backslashes literal.
os.system(r"dot -Tpdf D:\Data\MyCode\codepython\ML_Base_Demo\DecisionTree\DTreeData.dot -o D:\Data\MyCode\codepython\ML_Base_Demo\DecisionTree\DTree.pdf")

# --- Prediction ---
# Example of predicting a single hand-edited row:
# oneRow = dummyX[1,:]
# print ("one row : " + str(oneRow))
#
# newRow = oneRow
# newRow[0] = 1
# newRow[2] = 0
# print("new row x : " + str(newRow))
#
# predictedY = clf.predict(newRow)
#
# print ("predict result : " + str(predictedY))
with open(r'test_set.csv', 'rb') as testSet:
    reader = csv.reader(testSet)
    next(reader)  # skip the header row; the training headers are reused
    # Test labels are kept apart from the training labels in labelList.
    testList, testLabelList = _read_samples(reader, headers)
# print testList

# Encode the test rows with the vectorizer fitted on the training data so the
# columns line up with the features the tree was trained on.
testX = vec.transform(testList).toarray()
print("testX : " + str(testX))
predictSet = clf.predict(testX)
print(predictSet)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103

运行结果

image [机器学习]机器学习笔记整理06-决策树应用 AI教程 第1张

('headers : ', ['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer']) 

dummyX : [[ 0.  0.  1.  0.  1.  1.  0.  0.  1.  0.] 

 [ 0.  0.  1.  1.  0.  1.  0.  0.  1.  0.] 

 [ 1.  0.  0.  0.  1.  1.  0.  0.  1.  0.] 

 [ 0.  1.  0.  0.  1.  0.  0.  1.  1.  0.] 

 [ 0.  1.  0.  0.  1.  0.  1.  0.  0.  1.] 

 [ 0.  1.  0.  1.  0.  0.  1.  0.  0.  1.] 

 [ 1.  0.  0.  1.  0.  0.  1.  0.  0.  1.] 

 [ 0.  0.  1.  0.  1.  0.  0.  1.  1.  0.] 

 [ 0.  0.  1.  0.  1.  0.  1.  0.  0.  1.] 

 [ 0.  1.  0.  0.  1.  0.  0.  1.  0.  1.] 

 [ 0.  0.  1.  1.  0.  0.  0.  1.  0.  1.] 

 [ 1.  0.  0.  1.  0.  0.  0.  1.  1.  0.] 

 [ 1.  0.  0.  0.  1.  1.  0.  0.  0.  1.] 

 [ 0.  1.  0.  1.  0.  0.  0.  1.  1.  0.]]

feature mapping : ['age=middle_aged', 'age=senior', 'age=youth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes'] 

clf: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None, 

            max_features=None, max_leaf_nodes=None, 

            min_impurity_split=1e-07, min_samples_leaf=1, 

            min_samples_split=2, min_weight_fraction_leaf=0.0, 

            presort=False, random_state=None, splitter='best') 

Error: Could not open "D:\Data\MyCode\codepython\ML_Base_Demo\DecisionTree\DTree.pdf" for writing : Permission denied 

testX : [[ 0.  0.  1.  0.  1.  1.  0.  0.  1.  0.] 

 [ 0.  0.  1.  1.  0.  1.  0.  0.  1.  0.] 

 [ 1.  0.  0.  0.  1.  1.  0.  0.  1.  0.] 

 [ 0.  1.  0.  0.  1.  0.  0.  1.  1.  0.] 

 [ 0.  1.  0.  0.  1.  0.  1.  0.  0.  1.] 

 [ 0.  1.  0.  1.  0.  0.  1.  0.  0.  1.] 

 [ 1.  0.  0.  1.  0.  0.  1.  0.  0.  1.] 

 [ 0.  0.  1.  0.  1.  0.  0.  1.  1.  0.] 

 [ 0.  0.  1.  0.  1.  0.  1.  0.  0.  1.] 

 [ 0.  1.  0.  0.  1.  0.  0.  1.  0.  1.] 

 [ 0.  0.  1.  1.  0.  0.  0.  1.  0.  1.] 

 [ 1.  0.  0.  1.  0.  0.  0.  1.  1.  0.] 

 [ 0.  0.  1.  1.  0.  0.  0.  1.  1.  0.]]

[0 0 1 1 1 0 1 0 1 1 1 1 0]
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
 [机器学习]机器学习笔记整理06-决策树应用 AI教程 第2张