
Python Source Code for the ID3 and C4.5 Decision Tree Algorithms

First, I recommend Li Hang's Statistical Learning Methods (《统计学习方法》); this implementation follows the algorithms as presented in that book. The code uses Python 3 (3.3, the latest at the time of writing) and is not compatible with 2.x, so keep that in mind when running it.
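The version note matters mainly because of division: the probability and entropy computations below rely on / performing true division on integer counts, which Python 3 guarantees and Python 2 does not. A minimal illustration (my addition, not from the original post):

# Python 3: / is true division, // is floor division
print(9 / 15)   # 0.6
print(9 // 15)  # 0
# Under Python 2, 9 / 15 evaluates to 0, which would silently zero out
# every probability computed from the integer counts in the code below.

The full source: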

'''
Created on 2012-12-18
@author: viso
'''

class Node:
    '''Represents a decision tree node.'''
    def __init__(self, parent=None, dataset=None):
        self.dataset = dataset  # training instances that fall on this node
        self.result = None      # class label (set only on leaf nodes)
        self.attr = None        # ID of the attribute this node splits on
        self.childs = {}        # subtrees, as key-value pairs: (value of attr, subtree)
        self.parent = parent    # parent node



def entropy(props):
    '''Shannon entropy (base 2) of a probability distribution.'''
    if not isinstance(props, (tuple, list)):
        return None

    from math import log
    log2 = lambda x: log(x) / log(2)  # an anonymous function
    e = 0.0
    for p in props:
        e -= p * log2(p)
    return e
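# Worked check (my addition, not in the original post): the loan dataset
# below has 9 positive and 6 negative instances, so its class entropy is
#     entropy((9/15, 6/15))  ->  about 0.971
# which matches the worked example in the book.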


def info_gain(D, A, T=-1, return_ratio=False):
    '''Information gain g(D, A) of feature A on training set D.

    g(D, A) = entropy(D) - entropy(D|A)
    Each tuple in D is assumed to carry its class label as the last element.
    T is the index of the target attribute; -1 means the last element.'''
    if not isinstance(D, (set, list)):
        return None
    if not type(A) is int:
        return None
    C = {}    # count of each class
    DA = {}   # count of each value of feature A
    CDA = {}  # count of each (class, value of A) combination
    for t in D:
        C[t[T]] = C.get(t[T], 0) + 1
        DA[t[A]] = DA.get(t[A], 0) + 1
        CDA[(t[T], t[A])] = CDA.get((t[T], t[A]), 0) + 1

    PC = map(lambda x: x / len(D), C.values())  # class probabilities
    entropy_D = entropy(tuple(PC))  # map returns an iterator, so coerce to a tuple

    PCDA = {}  # conditional class probabilities given each value of A
    for key, value in CDA.items():
        a = key[1]  # the value of feature A
        pca = value / DA[a]
        PCDA.setdefault(a, []).append(pca)

    condition_entropy = 0.0
    for a, v in DA.items():
        p = v / len(D)
        e = entropy(PCDA[a])
        condition_entropy += e * p

    if return_ratio:
        # standard C4.5 gain ratio: divide the gain by the split
        # information H_A(D), the entropy of the distribution of A's values
        split_info = entropy(tuple(v / len(D) for v in DA.values()))
        if split_info == 0.0:  # A takes a single value: no split, no gain
            return 0.0
        return (entropy_D - condition_entropy) / split_info
    else:
        return entropy_D - condition_entropy
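# Worked check (my addition): on the loan dataset defined below, the
# information gains of features 0..3 work out to roughly
#     0.083, 0.324, 0.420, 0.363
# so feature 2 (owns house) is chosen at the root, as in the book.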

def get_result(D, T=-1):
    '''Return the most frequent value of target feature T in dataset D.'''
    if not isinstance(D, (set, list)):
        return None
    if not type(T) is int:
        return None
    count = {}
    for t in D:
        count[t[T]] = count.get(t[T], 0) + 1
    max_count = 0
    result = None  # stays None if D is empty
    for key, value in count.items():
        if value > max_count:
            max_count = value
            result = key
    return result
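# Example (my addition): on the full loan dataset below, get_result
# returns "是", the majority class (9 of the 15 rows).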


def divide_set(D, A):
    '''Split dataset D into subsets keyed by the value of feature A.'''
    if not isinstance(D, (set, list)):
        return None
    if not type(A) is int:
        return None
    subset = {}
    for t in D:
        subset.setdefault(t[A], []).append(t)
    return subset
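# Example (my addition): dividing the loan dataset below on feature 2
# (owns house) yields {"是": [... 6 tuples ...], "否": [... 9 tuples ...]}.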


def build_tree(D, A, threshold=0.0001, T=-1, Tree=None, algo="ID3"):
    '''Build a decision tree from dataset D and feature ID set A.

    T is the index of the target attribute within each tuple.
    Both the ID3 and C4.5 algorithms are supported.'''
    if Tree is not None and not isinstance(Tree, Node):
        return None
    if not isinstance(D, (set, list)):
        return None
    if not type(A) is set:
        return None

    if Tree is None:
        Tree = Node(None, D)
    subset = divide_set(D, T)
    if len(subset) <= 1:
        # all instances share one class label: make this node a leaf
        for key in subset.keys():
            Tree.result = key
        del subset
        return Tree
    if len(A) <= 0:
        # no features left to split on: label with the majority class
        Tree.result = get_result(D)
        return Tree
    use_gain_ratio = algo != "ID3"
    max_gain = 0.0
    attr_id = None
    for a in A:
        gain = info_gain(D, a, return_ratio=use_gain_ratio)
        if gain > max_gain:
            max_gain = gain
            attr_id = a  # feature with the largest information gain (ratio)
    if max_gain < threshold:
        # gain too small: pre-prune and make a majority-class leaf
        Tree.result = get_result(D)
        return Tree
    Tree.attr = attr_id
    subD = divide_set(D, attr_id)
    del D[:]  # drop intermediate data to free memory
    Tree.dataset = None
    # exclude the used feature; a fresh copy, so sibling branches
    # each keep their own feature set
    sub_attrs = A - {attr_id}
    for key in subD.keys():
        tree = Node(Tree, subD.get(key))
        Tree.childs[key] = tree
        build_tree(subD.get(key), sub_attrs, threshold, T, tree, algo)
    return Tree
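# Usage note (my addition): pass algo="C4.5" to select features by gain
# ratio instead of raw information gain, e.g.
#     build_tree(list_of_tuples, set(range(4)), algo="C4.5")
# Beware that build_tree empties its dataset argument in place (del D[:]),
# so hand it a copy if you still need the data afterwards.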


def print_branch(branch, target):
    '''Print one root-to-leaf branch as attr=value pairs joined by ∧.'''
    odd = 0
    for e in branch:
        print(e, end=('=' if odd == 0 else '∧'))
        odd = 1 - odd
    print("target =", target)


def print_tree(Tree, stack=None):
    if stack is None:  # avoid the mutable-default-argument pitfall
        stack = []
    if Tree is None:
        return
    if Tree.result is not None:
        print_branch(stack, Tree.result)
        return
    stack.append(Tree.attr)
    for key, value in Tree.childs.items():
        stack.append(key)
        print_tree(value, stack)
        stack.pop()
    stack.pop()

def classify(Tree, instance):
    if Tree is None:
        return None
    if Tree.result is not None:
        return Tree.result
    # .get returns None for feature values never seen during training
    return classify(Tree.childs.get(instance[Tree.attr]), instance)

# Loan-application dataset from the book:
# (age, has job, owns house, credit rating, class = loan approved)
# 青年/中年/老年 = youth/middle-aged/elderly; 是/否 = yes/no;
# 一般/好/非常好 = fair/good/excellent
dataset = [
     ("青年", "否", "否", "一般", "否")
    ,("青年", "否", "否", "好", "否")
    ,("青年", "是", "否", "好", "是")
    ,("青年", "是", "是", "一般", "是")
    ,("青年", "否", "否", "一般", "否")
    ,("中年", "否", "否", "一般", "否")
    ,("中年", "否", "否", "好", "否")
    ,("中年", "是", "是", "好", "是")
    ,("中年", "否", "是", "非常好", "是")
    ,("中年", "否", "是", "非常好", "是")
    ,("老年", "否", "是", "非常好", "是")
    ,("老年", "否", "是", "好", "是")
    ,("老年", "是", "否", "好", "是")
    ,("老年", "是", "否", "非常好", "是")
    ,("老年", "否", "否", "一般", "否")
]


# feature IDs 0..3; the last column is the class label
T = build_tree(dataset, set(range(0, len(dataset[0]) - 1)))
print_tree(T)
print(classify(T, ("老年", "否", "否", "一般")))

The output is as follows:

2=否∧1=否∧target = 否
2=否∧1=是∧target = 是
2=是∧target = 是
否

Reading the branches: the root splits on feature 2 (owns house); applicants who own a house are approved, and the rest are split on feature 1 (has job). The trailing 否 is the prediction printed by the classify call for the test instance.