randomforest.py
from __future__ import division

import numpy as np
from scipy.stats import mode

from utilities import shuffle_in_unison
from decisiontree import DecisionTreeClassifier


class RandomForestClassifier(object):
    """ A random forest classifier.

    A random forest is a collection of decision trees that vote on a
    classification decision. Each tree is trained with a subset of the data
    and features.
    """

    def __init__(self, n_estimators=32, max_features=np.sqrt, max_depth=10,
                 min_samples_split=2, bootstrap=0.9):
        """
        Args:
            n_estimators: The number of decision trees in the forest.
            max_features: Controls the number of features to randomly consider
                at each split.
            max_depth: The maximum number of levels that a tree can grow
                downwards before it is forced to become a leaf.
            min_samples_split: The minimum number of samples needed at a node
                to justify a new node split.
            bootstrap: The fraction of randomly chosen data to fit each tree on.
        """
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.forest = []

    def fit(self, X, y):
        """ Creates a forest of decision trees using a random subset of data
        and features. """
        self.forest = []
        n_samples = len(y)
        # Each tree is fit on a bootstrap-sized slice of the freshly shuffled data.
        n_sub_samples = int(round(n_samples * self.bootstrap))

        for i in range(self.n_estimators):
            shuffle_in_unison(X, y)
            X_subset = X[:n_sub_samples]
            y_subset = y[:n_sub_samples]

            tree = DecisionTreeClassifier(self.max_features, self.max_depth,
                                          self.min_samples_split)
            tree.fit(X_subset, y_subset)
            self.forest.append(tree)

    def predict(self, X):
        """ Predict the class of each sample in X by majority vote of the trees. """
        n_samples = X.shape[0]
        n_trees = len(self.forest)
        predictions = np.empty([n_trees, n_samples])
        for i in range(n_trees):
            predictions[i] = self.forest[i].predict(X)

        # The forest's prediction for each sample is the most common class
        # among the per-tree predictions.
        return mode(predictions)[0][0]

    def score(self, X, y):
        """ Return the accuracy of the prediction of X compared to y. """
        y_predict = self.predict(X)
        n_samples = len(y)
        correct = 0
        for i in range(n_samples):
            if y_predict[i] == y[i]:
                correct = correct + 1
        accuracy = correct / n_samples
        return accuracy
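

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal example of how the
# RandomForestClassifier above might be exercised on a small synthetic dataset.
# The two-blob data below is invented purely for illustration, and the example
# assumes the companion decisiontree and utilities modules from this repository
# are importable and behave as their names suggest.
if __name__ == '__main__':
    rng = np.random.RandomState(0)

    # Hypothetical toy data: two Gaussian blobs in four dimensions, one per class.
    X_demo = np.vstack([rng.randn(50, 4) - 1.0, rng.randn(50, 4) + 1.0])
    y_demo = np.hstack([np.zeros(50), np.ones(50)])

    forest = RandomForestClassifier(n_estimators=16, max_depth=8, bootstrap=0.8)
    forest.fit(X_demo, y_demo)

    print(forest.predict(X_demo[:5]))   # predicted class for the first five samples
    print(forest.score(X_demo, y_demo)) # fraction of samples classified correctly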