Fix dropped entity bug for Binary Relations
serenalotreck committed Nov 9, 2022
1 parent 9714f0d commit 6c687c6
Showing 2 changed files with 51 additions and 12 deletions.
33 changes: 26 additions & 7 deletions scripts/new-dataset/annotated_doc.py
@@ -16,7 +16,8 @@ class AnnotatedDocError(Exception):

class AnnotatedDoc:
def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
doc_key, dataset, coref, nlp, total_original_ents):
doc_key, dataset, coref, nlp, total_original_ents,
total_original_rels):
"""
Provides dual functionality for class construction. If this function is
used, be sure that the ents, bin_rels, events, and equiv_rels are
@@ -33,8 +34,9 @@ def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
self.coref = coref # True if EquivRels should be treated as corefs
self.nlp = nlp
self.dropped_ents = 0
self.dropped_rels = 0
self.total_original_ents = total_original_ents

self.total_original_rels = total_original_rels

@classmethod
def parse_ann(cls, txt, ann, nlp, dataset, coref):
@@ -88,6 +90,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
events = []
equiv_rels = []
total_original_ents = 0
total_original_rels = 0
for line in split_lines:

# The first character of the first element in the annotation
@@ -99,6 +102,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

elif line[0][0] == 'R':
bin_rels.append(BinRel(line))
total_original_rels += 1

elif line[0][0] == 'E':
events.append(Event(line))
@@ -108,7 +112,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

annotated_doc = AnnotatedDoc(text, sents, ents, bin_rels, events,
equiv_rels, doc_key, dataset, coref, nlp,
total_original_ents)
total_original_ents, total_original_rels)
annotated_doc.set_annotation_objects()

return annotated_doc
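As context for the counting above: in brat standoff format, the first character of an annotation's ID encodes its type (T for text-bound entities, R for binary relations, E for events), which is what the loop keys on. A minimal sketch of the same counting over a made-up .ann string (the annotations below are invented, not taken from any dataset):

ann = ('T1\tProtein 0 6\tkinase\n'
       'T2\tChemical 14 17\tATP\n'
       'R1\tBinds Arg1:T1 Arg2:T2\n')

total_original_ents = 0
total_original_rels = 0
for line in ann.strip().split('\n'):
    fields = line.split('\t')       # fields[0] is the annotation ID, e.g. 'T1'
    if fields[0][0] == 'T':         # text-bound (entity) annotation
        total_original_ents += 1
    elif fields[0][0] == 'R':       # binary relation annotation
        total_original_rels += 1

print(total_original_ents, total_original_rels)  # 2 1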
@@ -142,7 +146,10 @@ def format_dygiepp(self):

# Format data
ner = Ent.format_ner_dygiepp(self.ents, sent_idx_tups)
bin_rels = BinRel.format_bin_rels_dygiepp(self.bin_rels, sent_idx_tups)
bin_rels, self.dropped_rels = BinRel.format_bin_rels_dygiepp(self.bin_rels,
sent_idx_tups)
print(f'Completed relation formatting for {self.doc_key}. {self.dropped_rels} of '
f'{self.total_original_rels} relations were dropped due to tokenization mismatches.')
if len(self.equiv_rels
) > 0 and self.coref: # Some datasets don't have coreferences
corefs = EquivRel.format_corefs_dygiepp(self.equiv_rels)
@@ -231,8 +238,8 @@ def char_to_token(self):
# Set the list of entities that had token matches as ents for doc
self.ents = ent_list_tokens

print(f'Completed doc {self.doc_key}. {self.dropped_ents} of '
f'{self.total_original_ents} entities '
print(f'Completed character to token conversion for doc {self.doc_key}. '
f'{self.dropped_ents} of {self.total_original_ents} entities '
'were dropped due to tokenization mismatches.')


@@ -340,14 +347,26 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
returns:
bin_rels, list of list: dygiepp formatted relations
dropped_rels, int: number of relations that were dropped due to
entity token mismatches
"""
bin_rels = []
dropped_rels = 0
# Go through each sentence to get the relations in that sentence
for sent_start, sent_end in sent_idx_tups:

# Check first entity to see if relation is in this sentence
sent_rels = []
for rel in rel_list:
# Check to make sure both entities actually have token starts
if rel.arg1.tok_start is None or rel.arg2.tok_start is None:
warnings.warn('One or both argument entities for relation '
f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
f'(ID: {rel.ID}) were dropped due to tokenization '
'mismatches. This relation will also be dropped '
'as a result.')
dropped_rels += 1
continue
rel_start = rel.arg1.tok_start
if sent_start <= rel_start < sent_end:
sent_rels.append([
@@ -357,7 +376,7 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):

bin_rels.append(sent_rels)

return bin_rels
return bin_rels, dropped_rels


class Event:
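To make the new dropping rule concrete outside the class: a standalone sketch of the same check, using types.SimpleNamespace stand-ins for Ent and BinRel (the entity names and IDs are invented). A relation is counted as dropped whenever either argument entity failed the character-to-token conversion and so has no tok_start:

from types import SimpleNamespace
import warnings

def drop_unalignable_rels(rel_list):
    """Keep only relations whose argument entities both aligned to tokens."""
    kept, dropped = [], 0
    for rel in rel_list:
        if rel.arg1.tok_start is None or rel.arg2.tok_start is None:
            warnings.warn(f'Relation {rel.ID} dropped: an argument entity '
                          'was lost during tokenization.')
            dropped += 1
            continue
        kept.append(rel)
    return kept, dropped

# One relation whose entities both aligned, one whose second entity did not
ok = SimpleNamespace(ID='R1', label='Binds',
                     arg1=SimpleNamespace(text='kinase', tok_start=0),
                     arg2=SimpleNamespace(text='ATP', tok_start=3))
broken = SimpleNamespace(ID='R2', label='Binds',
                         arg1=SimpleNamespace(text='kinase', tok_start=0),
                         arg2=SimpleNamespace(text='ATP-gamma-S', tok_start=None))

kept, dropped = drop_unalignable_rels([ok, broken])
print(len(kept), dropped)  # 1 1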
30 changes: 25 additions & 5 deletions scripts/new-dataset/brat_to_input.py
@@ -32,6 +32,8 @@ def format_annotated_document(fname_pair, dataset_name, nlp, coref):
returns:
res, json dict: formatted data
dropped_totals, dict: numbers of original and dropped entities and
relations for the document
"""
# Make annotated doc object
annotated_doc = AnnotatedDoc.parse_ann(fname_pair[0], fname_pair[1], nlp,
@@ -43,7 +45,13 @@ def format_annotated_document(fname_pair, dataset_name, nlp, coref):
# Do the dygiepp conversion
res = annotated_doc.format_dygiepp()

return res
# Get the numbers of dropped entities and relations for this document
dropped_totals = {'dropped_ents': annotated_doc.dropped_ents,
'total_original_ents': annotated_doc.total_original_ents,
'dropped_rels': annotated_doc.dropped_rels,
'total_original_rels': annotated_doc.total_original_rels}

return res, dropped_totals


def get_paired_files(all_files):
Expand Down Expand Up @@ -101,10 +109,22 @@ def format_labeled_dataset(data_directory, output_file, dataset_name,
paired_files = get_paired_files(all_files)

# Format doc file pairs
res = [
format_annotated_document(fname_pair, dataset_name, nlp, coref)
for fname_pair in paired_files
]
overall_dropped_totals = {'dropped_ents':0, 'total_original_ents':0,
'dropped_rels':0, 'total_original_rels':0}
res = []
for fname_pair in paired_files:
r, dropped_totals = format_annotated_document(fname_pair, dataset_name, nlp, coref)
res.append(r)
overall_dropped_totals = {k: v + dropped_totals[k] for k, v in
overall_dropped_totals.items()}

print('\n\nCompleted conversion for entire dataset! '
f'{overall_dropped_totals["dropped_ents"]} of '
f'{overall_dropped_totals["total_original_ents"]} original entities '
'were dropped due to tokenization mismatches. As a result, '
f'{overall_dropped_totals["dropped_rels"]} of '
f'{overall_dropped_totals["total_original_rels"]} original relations '
'were dropped.')

# Write out doc dictionaries
with open(output_file, "w") as f:
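The per-document totals returned by format_annotated_document are folded into one running counter with a dict comprehension. A minimal sketch of just that accumulation pattern, with made-up counts standing in for real documents:

# Per-document drop counts as format_annotated_document would return them
per_doc_totals = [
    {'dropped_ents': 1, 'total_original_ents': 10,
     'dropped_rels': 0, 'total_original_rels': 4},
    {'dropped_ents': 0, 'total_original_ents': 7,
     'dropped_rels': 1, 'total_original_rels': 3},
]

overall = {'dropped_ents': 0, 'total_original_ents': 0,
           'dropped_rels': 0, 'total_original_rels': 0}
for dropped_totals in per_doc_totals:
    # Same fold as format_labeled_dataset: add this doc's counts to the totals
    overall = {k: v + dropped_totals[k] for k, v in overall.items()}

print(overall)
# {'dropped_ents': 1, 'total_original_ents': 17,
#  'dropped_rels': 1, 'total_original_rels': 7}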
