forked from dchaplinsky/unshred-tag
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics.py
executable file
·97 lines (74 loc) · 3.1 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!env python
from __future__ import division
from itertools import combinations
from fn.iters import grouper
from flask import Blueprint, g, redirect, render_template
from flask.ext import login
from models import Shreds, ShredsDistances, User
BULK_INSERT_SIZE = 100000
SHREDS_CAP = 10000
TAGS_REPEATS = 2
mod = Blueprint('metrics', __name__, template_folder='templates/metrics',
url_prefix='/metrics')
@mod.route('/shred/pairs', defaults={'start': 0, 'end': 100})
@mod.route('/shred/pairs/<int:end>', defaults={'start': 0})
@mod.route('/shred/pairs/<int:start>/<int:end>')
@login.login_required
def dist_pairs(start, end):
if not User.objects(pk=g.user.id).first()['admin']:
redirect('/')
dists = ShredsDistances.objects().order_by('distance')[start:end]
return render_template('distance_pairs.html', dists=dists)
def jaccard_distance(tags_a, tags_b):
"""
0 <= J <= 1
0 - tags_a & tags_a sets are equal
1 - tags_a have nothing in common with tags_b
See http://en.wikipedia.org/wiki/Jaccard_index for details
"""
return 1 - len(tags_a.intersection(tags_b)) / len(tags_a.union(tags_b))
def jaccard_distances_iterator(shreds_tags):
for shred_a, shred_b in combinations(shreds_tags.items(), 2):
shred_a_id, tags_a = shred_a
shred_b_id, tags_b = shred_b
yield shred_a_id, shred_b_id, jaccard_distance(set(tags_a), set(tags_b))
def fetch_normalized_shreds_tags(repeats):
"""
Returns dictionary where keys are shreds ids and values are sets of
filtered normalized tags.
"""
shreds = Shreds.objects().only('id', 'tags.tags')[:SHREDS_CAP]
shreds_tags = {}
for s in shreds:
tags = s.get_repeated_tags(repeats)
if tags:
shreds_tags[s.id] = tags
return shreds_tags
def churn_jaccard(drop=False, repeats=TAGS_REPEATS):
"""
We'll be working with pre-created list of ShredsDistances documents - s_distances
This way we don't spend CPU cycles & mallocs to create millions of
objects to be thrown away.
It would be even more efficient to not create these documents at all
and insert raw data into mongo, but I haven't dug deep enough.
"""
if drop:
ShredsDistances.objects(distance_type='jaccard').delete()
shreds_tags = fetch_normalized_shreds_tags(repeats=repeats)
s_distances = [ShredsDistances() for _ in xrange(BULK_INSERT_SIZE)]
for distances in grouper(BULK_INSERT_SIZE, jaccard_distances_iterator(shreds_tags)):
for i, distance in enumerate(distances):
if distance:
# assign data to pre-created ShredsDistances document
tag_a, tag_b, dist = distance
s_d = s_distances[i]
s_d.shreds_pair = [tag_a, tag_b]
s_d.distance = dist
s_d.distance_type = 'jaccard'
else:
# Cut the tail of pre-created documents from last bulk set
s_distances[i] = None
ShredsDistances.objects.insert(filter(None, s_distances), load_bulk=False)
if __name__ == '__main__':
from app import db
churn_jaccard()