1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
| ''' Created on 2012-12-18 @author: viso '''
class Node: '''Represents a decision tree node. ''' def __init__(self, parent = None, dataset = None): self.dataset = dataset self.result = None self.attr = None self.childs = {} self.parent = parent
def entropy(props): if (not isinstance(props, (tuple, list))): return None from math import log log2 = lambda x:log(x)/log(2) e = 0.0 for p in props: e -= p * log2(p) return e
def info_gain(D, A, T = -1, return_ratio = False): '''特征A对训练数据集D的信息增益 g(D,A) g(D,A)=entropy(D) - entropy(D|A) 假设数据集D的每个元组的最后一个特征为类标签 T为目标属性的ID,-1表示元组的最后一个元素为目标''' if (not isinstance(D, (set, list))): return None if (not type(A) is int): return None C = {} DA = {} CDA = {} for t in D: C[t[T]] = C.get(t[T], 0) + 1 DA[t[A]] = DA.get(t[A], 0) + 1 CDA[(t[T], t[A])] = CDA.get((t[T], t[A]), 0) + 1 PC = map(lambda x : x / len(D), C.values()) entropy_D = entropy(tuple(PC))
PCDA = {} for key, value in CDA.items(): a = key[1] pca = value / DA[a] PCDA.setdefault(a, []).append(pca) condition_entropy = 0.0 for a, v in DA.items(): p = v / len(D) e = entropy(PCDA[a]) condition_entropy += e * p if (return_ratio): return (entropy_D - condition_entropy) / entropy_D else: return entropy_D - condition_entropy def get_result(D, T = -1): '''获取数据集D中实例数最大的目标特征T的值''' if (not isinstance(D, (set, list))): return None if (not type(T) is int): return None count = {} for t in D: count[t[T]] = count.get(t[T], 0) + 1 max_count = 0 for key, value in count.items(): if (value > max_count): max_count = value result = key return result
def devide_set(D, A): '''根据特征A的值把数据集D分裂为多个子集''' if (not isinstance(D, (set, list))): return None if (not type(A) is int): return None subset = {} for t in D: subset.setdefault(t[A], []).append(t) return subset
def build_tree(D, A, threshold = 0.0001, T = -1, Tree = None, algo = "ID3"): '''根据数据集D和特征集A构建决策树. T为目标属性在元组中的索引 . 目前支持ID3和C4.5两种算法''' if (Tree != None and not isinstance(Tree, Node)): return None if (not isinstance(D, (set, list))): return None if (not type(A) is set): return None if (None == Tree): Tree = Node(None, D) subset = devide_set(D, T) if (len(subset) <= 1): for key in subset.keys(): Tree.result = key del(subset) return Tree if (len(A) <= 0): Tree.result = get_result(D) return Tree use_gain_ratio = False if algo == "ID3" else True max_gain = 0.0 for a in A: gain = info_gain(D, a, return_ratio = use_gain_ratio) if (gain > max_gain): max_gain = gain attr_id = a if (max_gain < threshold): Tree.result = get_result(D) return Tree Tree.attr = attr_id subD = devide_set(D, attr_id) del(D[:]) Tree.dataset = None A.discard(attr_id) for key in subD.keys(): tree = Node(Tree, subD.get(key)) Tree.childs[key] = tree build_tree(subD.get(key), A, threshold, T, tree) return Tree
def print_brance(brance, target): odd = 0 for e in brance: print(e, end = ('=' if odd == 0 else '∧')) odd = 1 - odd print("target =", target)
def print_tree(Tree, stack = []): if (None == Tree): return if (None != Tree.result): print_brance(stack, Tree.result) return stack.append(Tree.attr) for key, value in Tree.childs.items(): stack.append(key) print_tree(value, stack) stack.pop() stack.pop() def classify(Tree, instance): if (None == Tree): return None if (None != Tree.result): return Tree.result return classify(Tree.childs[instance[Tree.attr]], instance) dataset = [ ("青年", "否", "否", "一般", "否") ,("青年", "否", "否", "好", "否") ,("青年", "是", "否", "好", "是") ,("青年", "是", "是", "一般", "是") ,("青年", "否", "否", "一般", "否") ,("中年", "否", "否", "一般", "否") ,("中年", "否", "否", "好", "否") ,("中年", "是", "是", "好", "是") ,("中年", "否", "是", "非常好", "是") ,("中年", "否", "是", "非常好", "是") ,("老年", "否", "是", "非常好", "是") ,("老年", "否", "是", "好", "是") ,("老年", "是", "否", "好", "是") ,("老年", "是", "否", "非常好", "是") ,("老年", "否", "否", "一般", "否") ]
T = build_tree(dataset, set(range(0, len(dataset[0]) - 1))) print_tree(T) print(classify(T, ("老年", "否", "否", "一般")))
|