Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve and add Spearman #227

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ script:
branches:
only:
- master
- spearman
3 changes: 2 additions & 1 deletion surprise/prediction_algorithms/algo_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ def compute_similarities(self, verbose=False):
construction_func = {'cosine': sims.cosine,
'msd': sims.msd,
'pearson': sims.pearson,
'pearson_baseline': sims.pearson_baseline}
'pearson_baseline': sims.pearson_baseline,
'spearman': sims.spearman}

if self.sim_options['user_based']:
n_x, yr = self.trainset.n_users, self.trainset.ir
Expand Down
112 changes: 112 additions & 0 deletions surprise/similarities.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Available similarity measures:
msd
pearson
pearson_baseline
spearman
"""

from __future__ import (absolute_import, division, print_function,
Expand All @@ -24,6 +25,8 @@ import numpy as np
from six.moves import range
from six import iteritems

from scipy.stats import rankdata


def cosine(n_x, yr, min_support):
"""Compute the cosine similarity between all pairs of users (or items).
Expand Down Expand Up @@ -359,3 +362,112 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
sim[xj, xi] = sim[xi, xj]

return sim


def spearman(n_x, yr, min_support):
"""Compute the Spearman correlation coefficient between all pairs of users
(or items).

Only **common** users (or items) are taken into account. The Spearman
correlation coefficient can be seen as a non parametric Pearson's
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by non-parametric?

I'd like to add something like "The Spearman correlation coefficient is equivalent to the Pearson correlation coefficient where the ratings are replaced by their rankings."

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay I've improved on that.

Similarity, and is defined as:

.. math ::
\\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \\in I_{uv}}
(rank(r_{ui}) - \\overline{rank(u)}) \\cdot (rank(r_{vi}) - \\overline{rank(v)})} {\\sqrt{\\sum\\limits_{i
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please avoid lines longer than 79 characters

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This as well.

\\in I_{uv}} (rank(r_{ui}) - \\overline{rank(u)})^2} \\cdot \\sqrt{\\sum\\limits_{i \\in
I_{uv}} (rank(r_{vi}) - \\overline{rank(v)})^2} }

or

.. math ::
\\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \\in U_{ij}}
(rank(r_{ui}) - \\overline{rank(i)}) \\cdot (rank(r_{uj}) - \\overline{rank(j)})} {\\sqrt{\\sum\\limits_{u
\\in U_{ij}} (rank(r_{ui}) - \\overline{rank(i)})^2} \\cdot \\sqrt{\\sum\\limits_{u \\in
U_{ij}} (rank(r_{uj}) - \\overline{rank(j)})^2} }

depending on the ``user_based`` field of ``sim_options`` (see
:ref:`similarity_measures_configuration`).


Note: if there are no common users or items, similarity will be 0 (and not
-1).

For details on the Spearman coefficient, see chapter 4, page 126 of: `Recommender Systems Handbook
<http://www.cs.ubbcluj.ro/~gabis/DocDiplome/SistemeDeRecomandare/Recommender_systems_handbook.pdf>`__.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't add the link; I doubt it's very legal ;)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the change better?


"""

# number of common ys
cdef np.ndarray[np.int_t, ndim=2] freq
# sum (rank_xy * rank_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] prods
# sum (rank_xy ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqi
# sum (rank_x'y ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqj
# sum (rank_xy) for common ys
cdef np.ndarray[np.double_t, ndim=2] si
# sum (rank_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] sj
# the similarity matrix
cdef np.ndarray[np.double_t, ndim=2] sim

cdef np.ndarray[np.double_t, ndim=1] ranks
cdef np.ndarray[np.double_t, ndim=2] matrix

cdef int xi, xj
cdef double ri, rj
cdef int min_sprt = min_support

freq = np.zeros((n_x, n_x), np.int)
prods = np.zeros((n_x, n_x), np.double)
sqi = np.zeros((n_x, n_x), np.double)
sqj = np.zeros((n_x, n_x), np.double)
si = np.zeros((n_x, n_x), np.double)
sj = np.zeros((n_x, n_x), np.double)
sim = np.zeros((n_x, n_x), np.double)
ranks = np.zeros(n_x, np.double)
matrix = np.zeros((len(yr), n_x), np.double)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to be huge (n_users * n_items).

Passing xr as well would avoid the need to create the matrix, right? If that's the case then we should do it.


# turn yr into a matrix
for y, y_ratings in iteritems(yr):
for x_i, r_i in y_ratings:
matrix[y, x_i] = r_i
# turn the yr matrix into a matrix which contains the ranks of the elements in yr
for x_i in range(n_x):
matrix[:,x_i] = rankdata(matrix[:,x_i])

for y, y_ratings in iteritems(yr):
for xi, ri in y_ratings:
# use the ranking matrix to get the elements row by row
ranks[xi] = matrix[y, xi]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there might be a problem here:

ranks[xi] contains the ranks for all the ys, right?

But when we compare 2 xs, we only want to do that on the basis of their common ys. In the subsequent code you will compare them on the basis of all the ys.

Say we have 5 items and 2 users

ratings:
user 1: 1, 2, X, 4, 5
user 2: X, X, 1, 5, 2

The ranks are:

ranks:
user 1: 1, 2, X, 4, 5
user 2: X, X, 1, 3, 2

But on the common items the ratings are

ratings:
user 1: X, X, X, 4, 5
user 2: X, X, X, 5, 2

and the ranks are then

ranks:
user 1: X, X, X, 1, 2
user 2: X, X, X, 2, 1

So your code will consider the ranks

ranks:
user 1: 4, 5
user 2: 3, 2

while it should actually be considering

ranks:
user 1: 1, 2
user 2: 2, 1

Maybe this has no impact because the relative order of each rank will stay the same, and it has no effect on pearson? I don't know what would happen if there are ties though...

for xi, _ in y_ratings:
for xj, _ in y_ratings:
prods[xi, xj] += ranks[xi] * ranks[xj]
freq[xi, xj] += 1
sqi[xi, xj] += ranks[xi]**2
sqj[xi, xj] += ranks[xj]**2
si[xi, xj] += ranks[xi]
sj[xi, xj] += ranks[xj]

for xi in range(n_x):
sim[xi, xi] = 1
for xj in range(xi + 1, n_x):

if freq[xi, xj] < min_sprt:
sim[xi, xj] = 0
else:
n = freq[xi, xj]
num = n * prods[xi, xj] - si[xi, xj]*sj[xi, xj]
denum_l = n*sqi[xi, xj] - si[xi, xj]**2
denum_r = n*sqj[xi, xj] - sj[xi, xj]**2
denum = np.sqrt(denum_l * denum_r)
if denum == 0:
sim[xi, xj] = 0
else:
sim[xi, xj] = num / denum

sim[xj, xi] = sim[xi, xj]

return sim
7 changes: 6 additions & 1 deletion tests/test_sim_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ def test_name_field(u1_ml100k, pkf):
algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
rmse_pearson_bsl = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

sim_options = {'name': 'spearman'}
bsl_options = {'n_epochs': 1}
algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
rmse_spearman = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

for rmse_a, rmse_b in combinations((rmse_cosine, rmse_msd, rmse_pearson,
rmse_pearson_bsl), 2):
rmse_pearson_bsl, rmse_spearman), 2):
assert (rmse_a != rmse_b)

with pytest.raises(NameError):
Expand Down
72 changes: 65 additions & 7 deletions tests/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@

n_x = 8
yr_global = {
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
1: [(0, 4), (1, 4), (2, 4), ], # noqa
2: [ (2, 5), (3, 2), (4, 3) ], # noqa
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
}


Expand All @@ -33,7 +33,7 @@ def test_cosine_sim():

sim = sims.cosine(n_x, yr, min_support=1)

# check symetry and bounds (as ratings are > 0, cosine sim must be >= 0)
# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol thanks for correcting the typos

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Always leave the place cleaner than you found it. ^^

for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand Down Expand Up @@ -81,7 +81,7 @@ def test_msd_sim():

sim = sims.msd(n_x, yr, min_support=1)

# check symetry and bounds. MSD sim must be in [0, 1]
# check symmetry and bounds. MSD sim must be in [0, 1]
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand Down Expand Up @@ -120,7 +120,7 @@ def test_pearson_sim():
random.shuffle(ratings)

sim = sims.pearson(n_x, yr, min_support=1)
# check symetry and bounds. -1 <= pearson coeff <= 1
# check symmetry and bounds. -1 <= pearson coeff <= 1
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand Down Expand Up @@ -182,7 +182,7 @@ def test_pearson_baseline_sim():
x_biases = np.random.normal(0, 1, n_x) # fake
y_biases = np.random.normal(0, 1, 5) # fake (there are 5 ys)
sim = sims.pearson_baseline(n_x, yr, 1, global_mean, x_biases, y_biases)
# check symetry and bounds. -1 <= pearson coeff <= 1
# check symmetry and bounds. -1 <= pearson coeff <= 1
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand All @@ -205,3 +205,61 @@ def test_pearson_baseline_sim():
for j in range(i + 1, n_x):
if i != 1 and j != 2:
assert sim[i, j] == 0


def test_spearman_sim():
    """Tests for the Spearman similarity.

    Checks, on the module-level ``yr_global`` fixture, that the matrix
    returned by ``sims.spearman`` is symmetric with a unit diagonal, that
    every coefficient lies in [-1, 1], that a few hand-computed values are
    reproduced, and that ``min_support`` zeroes out pairs without enough
    common ratings.
    """

    # Copy the rating lists too (not only the dict): the in-place shuffle
    # below must not reorder the lists that other tests read via yr_global.
    yr = {y: list(ratings) for y, ratings in yr_global.items()}

    # shuffle every rating list, to ensure the order in which ratings are
    # processed does not matter (it's important because it used to be error
    # prone when we were using itertools.combinations)
    for ratings in yr.values():
        random.shuffle(ratings)

    sim = sims.spearman(n_x, yr, min_support=1)
    # check symmetry and bounds. -1 <= spearman coeff <= 1
    for xi in range(n_x):
        assert sim[xi, xi] == 1
        for xj in range(n_x):
            assert sim[xi, xj] == sim[xj, xi]
            assert -1 <= sim[xi, xj] <= 1

    # on common items, users 0, 1 and 2 have the same ratings
    assert sim[0, 1] == 1
    assert sim[0, 2] == 1

    # for vectors with constant ratings, spearman sim is necessarily zero (as
    # ranks are centered)
    assert sim[3, 4] == 0
    assert sim[2, 3] == 0
    assert sim[2, 4] == 0

    # pairs of users (0, 3) and (0, 4) have no common items
    assert sim[0, 3] == 0
    assert sim[0, 4] == 0

    # ratings have same rankings
    assert sim[5, 6] == 1

    # check for float point support and computation correctness: on their
    # common items, users 6 and 7 have ranks (1, 3, 2) and (3, 1, 2)
    # (written below in a different but consistent item order)
    mean6 = (1 + 2 + 3) / 3
    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
    mean7 = (1 + 2 + 3) / 3
    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
    num = sum([((3 - mean6) * (1 - mean7)),
               ((1 - mean6) * (3 - mean7)),
               ((2 - mean6) * (2 - mean7))
               ])
    assert sim[6, 7] == num / (var6 * var7) ** 0.5

    # ensure min_support is taken into account. Only users 1 and 2 have at
    # least 4 common ratings, so every other pair must have a zero
    # similarity.
    sim = sims.spearman(n_x, yr, min_support=4)
    for i in range(n_x):
        for j in range(i + 1, n_x):
            # skip only the pair (1, 2); `i != 1 and j != 2` would also skip
            # the whole row 1 and column 2
            if (i, j) != (1, 2):
                assert sim[i, j] == 0