-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtf_universal_sent_emb.py
67 lines (39 loc) · 1.86 KB
/
tf_universal_sent_emb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 19:46:30 2018
@author: gurunath.lv
"""
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
def universal_sent_embeddings(text_series, module_path=None):
    """
    Embed every text in ``text_series`` with a TensorFlow Hub module.

    Takes a long time to process, but (per the original author) produces
    good-quality sentence embeddings using universal sentence embeddings.

    Parameters
    ----------
    text_series : object exposing ``tolist()`` (e.g. a pandas Series)
        Texts to embed; converted to a plain list before being fed to the
        module.
    module_path : str, optional
        Path or handle handed to ``hub.Module``. When omitted, the
        original author's locally cached module directory is used; that
        directory is machine-specific, so callers on other machines
        should pass their own location.

    Returns
    -------
    The value produced by ``session.run`` on the embedding op --
    presumably a 2-D array with one row per input text (TODO confirm).
    """
    # Imported inside the function, so importing this module does not
    # itself require TensorFlow.
    import tensorflow as tf
    import tensorflow_hub as hub

    if module_path is None:
        # Backward-compatible default: the path the function always used.
        module_path = r'C:\Users\gurunath.lv\AppData\Local\Temp\tfhub_modules\c6f5954ffa065cdb2f2e604e740e8838bf21a2d3'

    sentences = text_series.tolist()
    embed = hub.Module(module_path)
    # Only errors are logged from here on.
    tf.logging.set_verbosity(tf.logging.ERROR)
    with tf.Session() as session:
        # Variables and lookup tables are initialised before the module runs.
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embeddings = session.run(embed(sentences))
    return embeddings
def transform_using_tfidf(text_series):
    """
    Fit a TF-IDF vectorizer (English stop words removed) on the texts.

    Parameters
    ----------
    text_series : object exposing ``tolist()`` (e.g. a pandas Series)
        Documents to fit on and transform.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)``: the dense TF-IDF array for
        ``text_series`` and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    documents = text_series.tolist()
    sparse_matrix = vectorizer.fit_transform(documents)
    # Densified because callers receive a plain array, not a sparse matrix.
    return sparse_matrix.toarray(), vectorizer
def similarity_measure(inp_sent, array, tfidf, top_n):
    """
    Rank the rows of ``array`` by cosine similarity to ``inp_sent``.

    Parameters
    ----------
    inp_sent : str
        Query text; vectorised with ``tfidf.transform``.
    array : array-like
        Document vectors to compare against -- presumably the dense
        matrix produced by the same vectorizer (verify against caller).
    tfidf : object
        Fitted vectorizer exposing ``transform``.
    top_n : int
        Number of best matches to keep. Values ``<= 0`` yield no matches;
        values larger than the number of rows yield every row.

    Returns
    -------
    numpy.ndarray
        Integer array with one row for the single query, holding row
        indices into ``array`` ordered from most to least similar.
    """
    if top_n <= 0:
        # ``[:, -0:]`` selects *every* column, so without this guard a
        # request for zero matches would return all of them (and a
        # negative count would return an arbitrary tail).
        return np.empty((1, 0), dtype=np.intp)

    query_vec = tfidf.transform([inp_sent]).toarray()
    scores = cosine_similarity(query_vec, array)
    # argsort is ascending: keep the last ``top_n`` columns, then flip so
    # the best match comes first.
    ascending = np.argsort(scores, axis=1)
    return np.flip(ascending[:, -top_n:], axis=1)
def get_similar_records(inp_sent, total_text, top_n=10):
    """
    Return the entries of ``total_text`` most similar to ``inp_sent``.

    A TF-IDF model is fitted on ``total_text`` on every call, the query
    is scored against it, and the best ``top_n`` entries are selected by
    position.

    Parameters
    ----------
    inp_sent : str
        Query text.
    total_text : pandas Series (uses ``.tolist()`` and ``.iloc``)
        Corpus to search.
    top_n : int, default 10
        Number of matches requested.

    Returns
    -------
    The ``total_text.iloc[...]`` selection for the ranked positions.
    """
    doc_matrix, vectorizer = transform_using_tfidf(total_text)
    ranked = similarity_measure(inp_sent, doc_matrix, vectorizer, top_n)
    # Flatten the ranked index array into positions for ``iloc``.
    positions = ranked.ravel()
    return total_text.iloc[positions]
if __name__ == '__main__':
    # Ad-hoc run: load the user-story CSV and look up the records most
    # similar to the summary labelled 2.
    csv_path = r'D:\Testing_frameworks\Testcase-Vmops\Insight\data\interim\filtered_user_story_by_priority.csv'
    user_story = pd.read_csv(csv_path, encoding='ISO-8859-1')
    summaries = user_story['Summary']
    # NOTE(review): the returned matches are neither stored nor printed.
    get_similar_records(summaries[2], summaries)