-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdecisiontree.py
140 lines (100 loc) · 4.32 KB
/
decisiontree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from __future__ import division
import random
import numpy as np
from scipy.stats import mode
from utilities import information_gain, entropy
class DecisionTreeClassifier(object):
    """ A decision tree classifier.

    A decision tree is a structure in which each node represents a binary
    conditional decision on a specific feature, each branch represents the
    outcome of the decision, and each leaf node represents a final
    classification.
    """

    def __init__(self, max_features=lambda x: x, max_depth=10,
                 min_samples_split=2):
        """
        Args:
            max_features: A function that controls the number of features to
                randomly consider at each split. The argument will be the number
                of features in the data.
            max_depth: The maximum number of levels the tree can grow downwards
                before forcefully becoming a leaf.
            min_samples_split: The minimum number of samples needed at a node to
                justify a new node split.
        """
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """ Builds the tree by choosing decision rules for each node based on
        the data.

        Args:
            X: 2-D array-like of samples, shape (n_samples, n_features).
            y: 1-D array-like of class labels, length n_samples.
        """
        n_features = X.shape[1]
        n_sub_features = int(self.max_features(n_features))
        # range (not xrange) so this runs on both Python 2 and Python 3;
        # random.sample picks the feature subset without replacement.
        feature_indices = random.sample(range(n_features), n_sub_features)
        self.trunk = self.build_tree(X, y, feature_indices, 0)

    def predict(self, X):
        """ Predict the class of each sample in X.

        Returns a 1-D numpy array of predicted labels, one per row of X.
        """
        num_samples = X.shape[0]
        y = np.empty(num_samples)
        for j in range(num_samples):
            # Walk from the root; internal nodes are Node instances, a leaf
            # is a bare class label.
            node = self.trunk
            while isinstance(node, Node):
                if X[j][node.feature_index] <= node.threshold:
                    node = node.branch_true
                else:
                    node = node.branch_false
            y[j] = node
        return y

    def build_tree(self, X, y, feature_indices, depth):
        """ Recursively builds a decision tree.

        Returns a Node for an internal split, or a bare class label (the mode
        of y) for a leaf.
        """
        # Use value comparisons, not `is`: identity tests on numbers are
        # implementation-dependent, and `entropy(y) is 0` was always False
        # for a float entropy, so pure nodes never stopped early.
        if (depth >= self.max_depth or len(y) < self.min_samples_split
                or entropy(y) == 0):
            # NOTE(review): mode(y)[0][0] relies on the pre-1.11 scipy
            # behavior where mode returns arrays — verify scipy version.
            return mode(y)[0][0]
        feature_index, threshold = find_split(X, y, feature_indices)
        X_true, y_true, X_false, y_false = split(X, y, feature_index, threshold)
        # A degenerate split (one side empty) cannot improve the tree; stop.
        if y_true.shape[0] == 0 or y_false.shape[0] == 0:
            return mode(y)[0][0]
        branch_true = self.build_tree(X_true, y_true, feature_indices, depth + 1)
        branch_false = self.build_tree(X_false, y_false, feature_indices, depth + 1)
        return Node(feature_index, threshold, branch_true, branch_false)
def find_split(X, y, feature_indices):
    """ Returns the best split rule for a tree node.

    For each candidate feature, evaluates every midpoint between consecutive
    distinct sorted feature values as a threshold, and keeps the
    (feature_index, threshold) pair with the highest information gain.

    Args:
        X: 2-D numpy array of samples, shape (n_samples, n_features).
        y: 1-D array-like of class labels.
        feature_indices: iterable of column indices to consider.

    Returns:
        (best_feature_index, best_threshold); (0, 0) if no split has
        positive gain.
    """
    best_gain = 0
    best_feature_index = 0
    best_threshold = 0
    for feature_index in feature_indices:
        # Candidate thresholds: midpoints between consecutive distinct values.
        values = sorted(set(X[:, feature_index]))
        for j in range(len(values) - 1):
            threshold = (values[j] + values[j + 1]) / 2
            X_true, y_true, X_false, y_false = split(X, y, feature_index, threshold)
            gain = information_gain(y, y_true, y_false)
            if gain > best_gain:
                best_gain = gain
                best_feature_index = feature_index
                best_threshold = threshold
    return best_feature_index, best_threshold
class Node(object):
    """ An internal decision-tree node encoding the binary test
    ``x[feature_index] <= threshold``.

    ``branch_true`` and ``branch_false`` hold the subtree taken when the
    test passes or fails; each is either another Node or a leaf class label.
    """

    def __init__(self, feature_index, threshold, branch_true, branch_false):
        # The two subtrees (or leaf labels) reached from this decision.
        self.branch_true = branch_true
        self.branch_false = branch_false
        # The decision rule itself: which feature, and the cut point.
        self.feature_index = feature_index
        self.threshold = threshold
def split(X, y, feature_index, threshold):
    """ Splits X and y based on the binary condition xi <= threshold.

    Args:
        X: 2-D array-like of samples, shape (n_samples, n_features).
        y: 1-D array-like of labels, length n_samples.
        feature_index: column of X to test.
        threshold: cut point for the test ``X[:, feature_index] <= threshold``.

    Returns:
        (X_true, y_true, X_false, y_false) numpy arrays: the rows/labels
        where the condition holds, then the rows/labels where it fails.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Vectorized boolean mask replaces the per-row Python loop: one C-level
    # pass instead of O(n) Python-level appends.
    mask = X[:, feature_index] <= threshold
    return X[mask], y[mask], X[~mask], y[~mask]