-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEx2.py
98 lines (84 loc) · 3.63 KB
/
Ex2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gensim
import re
import nltk
import os
from scipy.stats import spearmanr
def prepare_file(filename):
    """Read a one-token-per-line corpus file and group tokens into sentences.

    Every line is stripped of surrounding whitespace before it is examined:

    * a line equal to ``</s>`` closes the current sentence;
    * a line equal to ``<s>`` or starting with ``<text`` is skipped;
    * any other line (including an empty one) is appended to the current
      sentence as a token.

    Args:
        filename: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of sentences, each of which is a list of token strings.
        Tokens that appear after the last ``</s>`` marker are not returned.
    """
    sentences = []
    cur_sent = []
    # The context manager guarantees the handle is closed, and iterating the
    # file object streams it instead of loading every line up front.
    with open(filename, 'r', encoding="utf8") as corpus:
        for raw_line in corpus:
            line = raw_line.strip()
            if line == '</s>':
                sentences.append(cur_sent)
                cur_sent = []
            elif line == '<s>' or line.startswith('<text'):
                continue
            else:
                cur_sent.append(line)
    return sentences
def similars(model, pos_a, pos_b, neg_a, neg_real_value):
    """Print an analogy query and the model's answer to it.

    First prints a line describing the query and the expected answer, then
    prints the result of ``most_similar(positive=[pos_a, pos_b],
    negative=[neg_a])``.

    Args:
        model: Object exposing ``most_similar`` either directly or through a
            ``wv`` attribute (as trained gensim ``Word2Vec`` models do).
        pos_a: First word passed in the ``positive`` list.
        pos_b: Second word passed in the ``positive`` list.
        neg_a: Word passed in the ``negative`` list.
        neg_real_value: Expected answer; only used in the printed header.

    Returns:
        None. If the lookup raises ``KeyError`` (e.g. a word is missing from
        the vocabulary) an "error in words" line is printed instead of the
        neighbours, so one missing word does not abort the whole run.
    """
    print("{} and {} minus {} should be {}".format(pos_a, pos_b, neg_a, neg_real_value))
    # Calling most_similar on the model itself is deprecated in gensim; use
    # the keyed vectors in ``wv`` when present, otherwise the object as given.
    vectors = getattr(model, "wv", model)
    try:
        print(vectors.most_similar(positive=[pos_a, pos_b], negative=[neg_a]))
    except KeyError as err:
        print("error in words {} {} {}: {}".format(pos_a, pos_b, neg_a, err))
def _read_simlex_rows(simlex):
    """Parse the tab-separated SimLex file into (word1, word2, pos, score) tuples."""
    rows = []
    with open(simlex, 'r') as handle:
        for line in handle:
            fields = re.split(r'\t+', line.rstrip('\r\n'))
            if len(fields) < 4:
                # Blank or truncated line: nothing to evaluate.
                continue
            try:
                # Gold scores must be numeric; left as strings they would be
                # ranked lexicographically by spearmanr.
                score = float(fields[3])
            except ValueError:
                # Non-numeric score column, e.g. a header row.
                continue
            rows.append((fields[0], fields[1], fields[2], score))
    return rows


def create_file_and_spearman(sentences, window, size, simlex):
    """Train Word2Vec, dump pair similarities to a file and print Spearman scores.

    A Word2Vec model is trained on ``sentences`` with ``min_count=5`` and the
    given ``window`` and ``size``. For every word pair read from ``simlex``
    the model similarity is written to
    ``size<size>_window<window>.txt`` in the current working directory, one
    ``word1 word2 POS similarity`` line per pair. Spearman correlations
    between model similarities and SimLex scores are then printed for the
    POS tags ``A``, ``N``, ``V`` and for all pairs, followed by a fixed set
    of analogy queries printed through ``similars``.

    Args:
        sentences: Iterable of token lists used to train the model.
        window: Context window passed to ``Word2Vec``.
        size: Vector dimensionality passed to ``Word2Vec``.
        simlex: Path of a tab-separated file whose first four columns are
            word1, word2, POS tag and gold similarity. Lines with fewer than
            four columns or a non-numeric fourth column are skipped.

    Returns:
        The trained ``Word2Vec`` model.

    Notes:
        Pairs for which the similarity lookup raises ``KeyError`` are
        reported with an "error in words" line and scored as 0; they still
        take part in the correlations.
    """
    # Read the evaluation pairs first so a missing or unreadable SimLex file
    # fails before the expensive training step.
    rows = _read_simlex_rows(simlex)

    # NOTE(review): ``size`` is the pre-4.0 gensim keyword; gensim >= 4
    # renamed it to ``vector_size`` -- confirm the installed version.
    word_to_vec = gensim.models.Word2Vec(sentences, min_count=5, window=window, size=size)
    vectors = word_to_vec.wv

    pos_tags = ('A', 'N', 'V')
    model_scores = {tag: [] for tag in pos_tags}
    gold_scores = {tag: [] for tag in pos_tags}
    all_model_scores = []
    all_gold_scores = []

    out_path = os.path.join(os.getcwd(), 'size' + str(size) + '_window' + str(window) + '.txt')
    with open(out_path, 'w') as out_file:
        for word1, word2, pos, simlex_val in rows:
            try:
                val = vectors.similarity(word1, word2)
            except KeyError:
                print("error in words " + word1 + " " + word2)
                val = 0
            out_file.write(word1 + " " + word2 + " " + pos + " " + str(val) + '\n')
            all_model_scores.append(val)
            all_gold_scores.append(simlex_val)
            if pos in model_scores:
                model_scores[pos].append(val)
                gold_scores[pos].append(simlex_val)

    prefix = "Spearman for window " + str(window) + " and size " + str(size)
    for tag in pos_tags:
        print(prefix + " " + tag + ": " + str(spearmanr(model_scores[tag], gold_scores[tag])))
    print(prefix + " ALL: " + str(spearmanr(all_model_scores, all_gold_scores)))

    print("\nSimilarities:")
    similars(word_to_vec, 'big', 'tall', 'small', 'short')
    similars(word_to_vec, 'man', 'flower', 'fetus', 'bud')
    similars(word_to_vec, 'lake', 'meadow', 'pool', 'garden')
    similars(word_to_vec, 'word', 'brick', 'book', 'building')
    similars(word_to_vec, 'doctor', 'veterinarian', 'human', 'animal')
    return word_to_vec
# Script entry: evaluate Word2Vec on the corpus for several (window, size)
# settings. Input files are looked up in the current working directory.
cwd = os.getcwd()
# os.path.join picks the platform's separator instead of a hard-coded '\\'.
path_corp = nltk.data.find(os.path.join(cwd, 'corpus_ex2_3.txt'))
path_simlex = nltk.data.find(os.path.join(cwd, 'SimLex-999_2.txt'))
sentences = prepare_file(path_corp)
# (window, size) pairs, in the order they are trained and reported.
for window, size in ((10, 100), (10, 1000), (2, 1000), (2, 100)):
    create_file_and_spearman(sentences, window, size, path_simlex)
    print('==================================================')