evaluate.py
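"""Evaluate lay summarisation predictions on the eLife and PLOS validation sets.

Computes relevance (ROUGE-1/2/Lsum, BERTScore), readability (FKGL, CLI, DCRS,
LENS), and factuality (AlignScore, SummaC) metrics, then writes per-dataset
scores and their averages to text files in the current directory.
"""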
import os
import sys
import json

import textstat
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score
import nltk
from alignscore import AlignScore
from lens.lens_score import LENS
import torch
from summac.model_summac import SummaCConv

# Sentence tokeniser used by several of the metrics below
nltk.download('punkt')

def calc_rouge(preds, refs):
    # Get ROUGE F1 scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeLsum'],
                                      use_stemmer=True, split_summaries=True)
    scores = [scorer.score(p, refs[i]) for i, p in enumerate(preds)]
    return (np.mean([s['rouge1'].fmeasure for s in scores]),
            np.mean([s['rouge2'].fmeasure for s in scores]),
            np.mean([s['rougeLsum'].fmeasure for s in scores]))

def calc_bertscore(preds, refs):
    # Get BERTScore F1 scores
    P, R, F1 = score(preds, refs, lang="en", verbose=True, device='cuda:0')
    return np.mean(F1.tolist())

def calc_readability(preds):
    # Get readability scores (FKGL, CLI, DCRS)
    fkgl_scores = []
    cli_scores = []
    dcrs_scores = []
    for pred in preds:
        fkgl_scores.append(textstat.flesch_kincaid_grade(pred))
        cli_scores.append(textstat.coleman_liau_index(pred))
        dcrs_scores.append(textstat.dale_chall_readability_score(pred))
    return np.mean(fkgl_scores), np.mean(cli_scores), np.mean(dcrs_scores)

def calc_lens(preds, refs, docs):
    # Get LENS scores, using the first line of each article (the abstract)
    # as the complex input
    model_path = "./models/LENS/LENS/checkpoints/epoch=5-step=6102.ckpt"
    metric = LENS(model_path, rescale=True)
    abstracts = [d.split("\n")[0] for d in docs]
    refs = [[x] for x in refs]
    scores = metric.score(abstracts, preds, refs, batch_size=8, gpus=1)
    return np.mean(scores)

def calc_alignscore(preds, docs):
    # Get AlignScore factuality scores against the source articles
    alignscorer = AlignScore(model='roberta-base', batch_size=16, device='cuda:0',
                             ckpt_path='./models/AlignScore/AlignScore-base.ckpt',
                             evaluation_mode='nli_sp')
    return np.mean(alignscorer.score(contexts=docs, claims=preds))

def calc_summac(preds, docs):
    # Get SummaC consistency scores against the source articles
    model_conv = SummaCConv(models=["vitc"], bins='percentile', granularity="sentence",
                            nli_labels="e", device="cuda", start_file="default", agg="mean")
    return np.mean(model_conv.score(docs, preds)['scores'])

def read_file_lines(path):
    # Read a file into a list of lines; parse each line as JSON for .jsonl files
    with open(path, 'r') as f:
        lines = f.readlines()

    if path.endswith('.jsonl'):
        lines = [json.loads(line) for line in lines]

    return lines

def evaluate(pred_path, gold_path):
    # Load data from files
    refs_dicts = read_file_lines(gold_path)
    preds = read_file_lines(pred_path)
    refs = [d['lay_summary'] for d in refs_dicts]
    docs = [d['article'] for d in refs_dicts]

    score_dict = {}

    # Relevance scores
    rouge1_score, rouge2_score, rougel_score = calc_rouge(preds, refs)
    score_dict['ROUGE1'] = rouge1_score
    score_dict['ROUGE2'] = rouge2_score
    score_dict['ROUGEL'] = rougel_score
    score_dict['BERTScore'] = calc_bertscore(preds, refs)

    # Readability scores
    fkgl_score, cli_score, dcrs_score = calc_readability(preds)
    score_dict['FKGL'] = fkgl_score
    score_dict['DCRS'] = dcrs_score
    score_dict['CLI'] = cli_score
    score_dict['LENS'] = calc_lens(preds, refs, docs)

    # Factuality scores
    score_dict['AlignScore'] = calc_alignscore(preds, docs)
    score_dict['SummaC'] = calc_summac(preds, docs)

    return score_dict

def write_scores(score_dict, output_filepath):
    # Write scores to file
    with open(output_filepath, 'w') as f:
        for key, value in score_dict.items():
            f.write(f"{key}: {value}\n")

submit_dir = sys.argv[1]  # directory with txt files ("elife.txt" and "plos.txt") with predictions
truth_dir = sys.argv[2]   # directory with jsonl files containing references and articles
output_dir = "./"

# Calculate + write eLife scores
elife_scores = evaluate(
    os.path.join(submit_dir, 'elife.txt'),
    os.path.join(truth_dir, 'eLife_val.jsonl'),
)
write_scores(elife_scores, os.path.join(output_dir, 'elife_scores.txt'))

# Free cached GPU memory before scoring the second dataset
torch.cuda.empty_cache()

# Calculate + write PLOS scores
plos_scores = evaluate(
    os.path.join(submit_dir, 'plos.txt'),
    os.path.join(truth_dir, 'PLOS_val.jsonl'),
)
write_scores(plos_scores, os.path.join(output_dir, 'plos_scores.txt'))

# Calculate + write overall scores (mean of the eLife and PLOS value for each metric)
avg_scores = {key: np.mean([elife_scores[key], plos_scores[key]]) for key in elife_scores.keys()}
write_scores(avg_scores, os.path.join(output_dir, 'scores.txt'))
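
# Example invocation (illustrative paths; any directories containing the
# expected files will work):
#   python evaluate.py ./submission ./truth
# where ./submission contains elife.txt and plos.txt (one predicted lay summary
# per line) and ./truth contains eLife_val.jsonl and PLOS_val.jsonl.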