analyze_multiple_layers.py
import codecs

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import CCA
from sklearn.metrics import normalized_mutual_info_score

import load_conll


class CorrelationMatrix(object):
    """
    Co-occurrence matrix between two annotation layers: rows are the labels of
    layer A, columns the labels of layer B, and each cell counts the sentences
    on which both labels occur.
    """

    def __init__(self, labels_a=None, labels_b=None, occurences_a=None, occurences_b=None):
        # Rows are labels from layer A, columns are labels from layer B.
        self.matrix = np.zeros(shape=(len(labels_a), len(labels_b)))
        self.labels_a = labels_a
        self.labels_b = labels_b
        if len(occurences_a) != len(occurences_b):
            raise ValueError("The two annotation layers do not cover the same number of sentences.")
        for i in range(len(occurences_a)):
            indices_occurence_a = [labels_a.index(label) for label in occurences_a[i]]
            indices_occurence_b = [labels_b.index(label) for label in occurences_b[i]]
            # Count every pair of labels that co-occurs on this sentence.
            for index_a in indices_occurence_a:
                for index_b in indices_occurence_b:
                    self.matrix[index_a][index_b] += 1

    def to_string(self, name):
        # Dump the matrix as a tab-separated table with layer-B labels as the header row.
        with codecs.open("results/correlation_matrix_" + name + ".txt", "w", "utf8") as f:
            f.write("\t")
            for label in self.labels_b:
                label = label.strip()
                f.write(label + "\t")
            f.write("\n")
            for row_label, row in zip(self.labels_a, self.matrix):
                f.write('%s\t[%s]' % (row_label, '\t'.join('%03s' % i for i in row)))
                f.write("\n")


def port_annotations_to_sentence_level(annotations):
    """Collapse token-level annotations to the set of distinct labels per sentence."""
    sentence_annotations = []
    for sentence in annotations:
        sentence_annotations.append(list(set(sentence)))
    return sentence_annotations
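
# For example, ["O", "O", "INSIDE_DATA"] collapses to ["O", "INSIDE_DATA"]
# (the order of labels within a sentence is not guaranteed).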


def canonical_correlation_analysis(occurences_a, occurences_b):
    # One-hot encode both label sequences the same way before fitting the CCA.
    occurences_a = pd.get_dummies(pd.Series(occurences_a, dtype="category"))
    occurences_b = pd.get_dummies(pd.Series(occurences_b, dtype="category"))
    cca = CCA(n_components=1)
    cca.fit(occurences_a, occurences_b)
    return cca.score(occurences_a, occurences_b)
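
# A minimal sketch of the intended call, with hypothetical flat label lists
# (one label per sentence); the score is the R^2 of the first canonical pair:
#
#   a = ["CLAIM", "DATA", "CLAIM", "DATA"]
#   b = ["BACKGROUND", "OWN", "BACKGROUND", "OWN"]
#   print(canonical_correlation_analysis(a, b))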


def mutual_information(occurences_a, occurences_b):
    score = normalized_mutual_info_score(occurences_a, occurences_b)
    return score


def information_theoretic_measures(y_arg, y_rhet, y_citation, y_aspect, y_summary):
    y_arg = port_annotations_to_sentence_level(y_arg)
    y_rhet = port_annotations_to_sentence_level(y_rhet)
    y_aspect = port_annotations_to_sentence_level(y_aspect)
    y_summary = port_annotations_to_sentence_level(y_summary)
    y_citation = port_annotations_to_sentence_level(y_citation)
    # normalized_mutual_info_score expects one hashable label per sentence, so the
    # distinct labels of each sentence are joined into a single combination string.
    y_arg = ["|".join(sorted(s)) for s in y_arg]
    y_rhet = ["|".join(sorted(s)) for s in y_rhet]
    y_aspect = ["|".join(sorted(s)) for s in y_aspect]
    y_summary = ["|".join(sorted(s)) for s in y_summary]
    y_citation = ["|".join(sorted(s)) for s in y_citation]
    with codecs.open("results/mutual_information.txt", "w", "utf8") as f:
        f.write("Argumentation & Discourse: ")
        f.write(str(normalized_mutual_info_score(y_arg, y_rhet)) + "\n")
        f.write("Argumentation & Aspect: ")
        f.write(str(normalized_mutual_info_score(y_arg, y_aspect)) + "\n")
        f.write("Argumentation & Summary: ")
        f.write(str(normalized_mutual_info_score(y_arg, y_summary)) + "\n")
        f.write("Argumentation & Citation Context: ")
        f.write(str(normalized_mutual_info_score(y_arg, y_citation)) + "\n")
        f.write("Discourse & Aspect: ")
        f.write(str(normalized_mutual_info_score(y_rhet, y_aspect)) + "\n")
        f.write("Discourse & Summary: ")
        f.write(str(normalized_mutual_info_score(y_rhet, y_summary)) + "\n")
        f.write("Discourse & Citation Context: ")
        f.write(str(normalized_mutual_info_score(y_rhet, y_citation)) + "\n")
        f.write("Aspect & Summary: ")
        f.write(str(normalized_mutual_info_score(y_aspect, y_summary)) + "\n")
        f.write("Aspect & Citation Context: ")
        f.write(str(normalized_mutual_info_score(y_aspect, y_citation)) + "\n")
        f.write("Summary & Citation Context: ")
        f.write(str(normalized_mutual_info_score(y_summary, y_citation)) + "\n")
        f.write("Discourse & Argumentation (sanity check): ")
        f.write(str(normalized_mutual_info_score(y_rhet, y_arg)) + "\n")


def compute_correlation_matrices(y_arg, y_rhet, y_citation, y_aspect, y_summary):
    y_arg = port_annotations_to_sentence_level(y_arg)
    y_rhet = port_annotations_to_sentence_level(y_rhet)
    y_aspect = port_annotations_to_sentence_level(y_aspect)
    y_summary = port_annotations_to_sentence_level(y_summary)
    y_citation = port_annotations_to_sentence_level(y_citation)

    def distinct_labels(y):
        return list(set(item for sublist in y for item in sublist))

    labels_arg = distinct_labels(y_arg)
    cm_arg_rhet = CorrelationMatrix(labels_a=labels_arg, labels_b=distinct_labels(y_rhet),
                                    occurences_a=y_arg, occurences_b=y_rhet)
    cm_arg_rhet.to_string("arg_rhet")
    cm_arg_cit = CorrelationMatrix(labels_a=labels_arg, labels_b=distinct_labels(y_citation),
                                   occurences_a=y_arg, occurences_b=y_citation)
    cm_arg_cit.to_string("arg_cit")
    cm_arg_summary = CorrelationMatrix(labels_a=labels_arg, labels_b=distinct_labels(y_summary),
                                       occurences_a=y_arg, occurences_b=y_summary)
    cm_arg_summary.to_string("arg_summary")
    cm_arg_aspect = CorrelationMatrix(labels_a=labels_arg, labels_b=distinct_labels(y_aspect),
                                      occurences_a=y_arg, occurences_b=y_aspect)
    cm_arg_aspect.to_string("arg_aspect")
    print("Process finished")


def plot_sentence_lengths(x):
    # Histogram of sentence lengths (in tokens).
    lengths = [len(sentence) for sentence in x]
    plt.hist(lengths)
    plt.show()


def remove_bio_from_arg(y):
    """Strip the "Token_Label." prefix and collapse BEGIN_* tags into INSIDE_* tags."""
    for i, sentence in enumerate(y):
        for j, token in enumerate(sentence):
            token = token.split("Token_Label.")[1]
            y[i][j] = token
            if token == "BEGIN_BACKGROUND_CLAIM":
                y[i][j] = "INSIDE_BACKGROUND_CLAIM"
            elif token == "BEGIN_OWN_CLAIM":
                y[i][j] = "INSIDE_OWN_CLAIM"
            elif token == "BEGIN_DATA":
                y[i][j] = "INSIDE_DATA"
    return y
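
# For example (tags mirroring the corpus format):
#
#   tags = [["Token_Label.BEGIN_DATA", "Token_Label.INSIDE_DATA"]]
#   remove_bio_from_arg(tags)  # -> [["INSIDE_DATA", "INSIDE_DATA"]]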


def remove_bio_from_cit(y):
    # The citation labels keep their trailing newline from the raw files, so the
    # comparison includes it.
    for i, sentence in enumerate(y):
        for j, token in enumerate(sentence):
            if token == "BEGIN_CIT_CONTEXT\n":
                y[i][j] = "INSIDE_CIT_CONTEXT\n"
    return y


def main():
    x, y_arg, y_rhet, y_aspect, y_summary, y_citation = load_conll.load_data_multiple(
        "./annotations_conll_final_without_abstracts")
    y_arg = remove_bio_from_arg(y_arg)
    y_citation = remove_bio_from_cit(y_citation)
    # plot_sentence_lengths(x)
    long_sentences = [sentence for sentence in x if len(sentence) > 100]
    print("Number of sentences with more than 100 tokens: " + str(len(long_sentences)))
    print("Share of such sentences: " + str(len(long_sentences) / len(x)))
    # compute_correlation_matrices(y_arg=y_arg, y_rhet=y_rhet, y_citation=y_citation, y_aspect=y_aspect, y_summary=y_summary)
    # print(canonical_correlation_analysis(y_arg, y_rhet))
    # information_theoretic_measures(y_arg=y_arg, y_rhet=y_rhet, y_citation=y_citation, y_aspect=y_aspect, y_summary=y_summary)


if __name__ == "__main__":
    main()