-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcategory_similarities.py
114 lines (102 loc) · 4.72 KB
/
category_similarities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np
import time
class CategoricalSimilarity():
    """Pairwise similarity measures for categorical data.

    Every public method returns an ``(n, n)`` array whose entry ``[i, j]`` is
    the similarity between row ``i`` and row ``j`` of the data set. Each
    score is a sum of ``d`` per-attribute contributions, every one of which
    is divided by ``d``.

    Attributes:
        data: the observations as a 2-D array (rows are observations,
            columns are categorical attributes).
        n: number of rows in ``data``.
        d: number of columns in ``data``.
    """

    def __init__(self, data):
        """Store the data set and cache its dimensions.

        Args:
            data: 2-D array-like of shape ``(n, d)``.
        """
        # np.asarray is a no-op for ndarrays and lets plain nested lists work.
        self.data = np.asarray(data)
        self.n = self.data.shape[0]
        self.d = self.data.shape[1]

    def _value_frequencies(self):
        """Return an ``(n, d)`` array holding, for every cell, how many
        times that cell's value occurs in its column."""
        freq = np.empty(self.data.shape, dtype=float)
        for k in range(self.d):
            _, inverse, counts = np.unique(
                self.data[:, k], return_inverse=True, return_counts=True)
            freq[:, k] = counts[inverse.ravel()]
        return freq

    def _pair_probabilities(self):
        """Return an ``(n, d)`` array of ``f * (f - 1) / (n * (n - 1))``,
        where ``f`` is the column frequency of each cell's value."""
        freq = self._value_frequencies()
        return 1. * freq * (freq - 1) / self.n / (self.n - 1)

    def overlap(self):
        """Fraction of attributes on which two rows agree.

        Returns:
            ``(n, n)`` array of scores.
        """
        score = np.zeros((self.n, self.n))
        for i, row in enumerate(self.data):
            score[i] = 1. * np.sum(self.data == row, axis=1) / self.d
        return score

    def eskin(self):
        """Eskin measure.

        A matching attribute contributes ``1 / d``. A mismatching attribute
        contributes ``n_k**2 / (n_k**2 + 2) / d``, where ``n_k`` is the
        number of distinct values in column ``k``.

        Returns:
            ``(n, n)`` array of scores.
        """
        score = np.zeros((self.n, self.n))
        n_k = [len(np.unique(self.data[:, k])) for k in range(self.d)]
        # The mismatch weight depends only on the column, so compute it once.
        mismatch = np.array(
            [1.0 * m ** 2 / (m ** 2 + 2) / self.d for m in n_k], dtype=float)
        for i, row in enumerate(self.data):
            matches = self.data == row
            score[i] = np.sum(np.where(matches, 1.0 / self.d, mismatch), axis=1)
        return score

    def iof(self):
        """Inverse occurrence frequency measure.

        A matching attribute contributes ``1 / d``. A mismatching attribute
        contributes ``1 / (1 + log(f_x) * log(f_y)) / d``, where ``f_x`` and
        ``f_y`` are the column frequencies of the two differing values.

        Returns:
            ``(n, n)`` array of scores.
        """
        score = np.zeros((self.n, self.n))
        log_freq = np.log(self._value_frequencies())
        for i, row in enumerate(self.data):
            matches = self.data == row
            mismatch = 1. / (1 + log_freq[i] * log_freq) / self.d
            score[i] = np.sum(np.where(matches, 1. / self.d, mismatch), axis=1)
        return score

    def of(self):
        """Occurrence frequency measure.

        A matching attribute contributes ``1 / d``. A mismatching attribute
        contributes ``1 / (1 + log(n / f_x) * log(n / f_y)) / d``, where
        ``f_x`` and ``f_y`` are the column frequencies of the two differing
        values.

        Returns:
            ``(n, n)`` array of scores.
        """
        score = np.zeros((self.n, self.n))
        log_inv_freq = np.log(1. * self.n / self._value_frequencies())
        for i, row in enumerate(self.data):
            matches = self.data == row
            mismatch = 1. / (1 + log_inv_freq[i] * log_inv_freq) / self.d
            score[i] = np.sum(np.where(matches, 1. / self.d, mismatch), axis=1)
        return score

    # TODO: the Lin measure is not implemented; its per-attribute weights
    # still need to be normalised before it can be added here.

    def goodall3(self):
        """Goodall3 measure.

        A matching attribute contributes ``(1 - p2) / d`` with
        ``p2 = f * (f - 1) / (n * (n - 1))`` for the shared value's column
        frequency ``f``; a mismatching attribute contributes 0.

        Returns:
            ``(n, n)`` array of scores.
        """
        score = np.zeros((self.n, self.n))
        p2 = self._pair_probabilities()
        for i, row in enumerate(self.data):
            matches = self.data == row
            # On a match both rows share the value, so row i's p2 applies.
            score[i] = np.sum(
                np.where(matches, 1. * (1 - p2[i]) / self.d, 0.0), axis=1)
        return score

    def goodall4(self):
        """Goodall4 measure.

        A matching attribute contributes ``p2 / d`` with
        ``p2 = f * (f - 1) / (n * (n - 1))`` for the shared value's column
        frequency ``f``; a mismatching attribute contributes 0.

        Returns:
            ``(n, n)`` array of scores.
        """
        score = np.zeros((self.n, self.n))
        p2 = self._pair_probabilities()
        for i, row in enumerate(self.data):
            matches = self.data == row
            # On a match both rows share the value, so row i's p2 applies.
            score[i] = np.sum(np.where(matches, 1. * p2[i] / self.d, 0.0), axis=1)
        return score
if __name__ == '__main__':
    # Small demo: two observations described by three categorical attributes.
    data = np.array([[1, 1, 3], [1, 1, 2]])
    similarity = CategoricalSimilarity(data)
    overlap_scores = similarity.overlap()
    print(overlap_scores)
    # The remaining measures can be printed the same way:
    # eskin(), iof(), of(), goodall3(), goodall4().