Fix dropped entity bug for Binary Relations
serenalotreck committed Nov 9, 2022
1 parent 9714f0d commit 6c687c6
Showing 2 changed files with 51 additions and 12 deletions.
33 changes: 26 additions & 7 deletions scripts/new-dataset/annotated_doc.py
@@ -16,7 +16,8 @@ class AnnotatedDocError(Exception):

class AnnotatedDoc:
def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
doc_key, dataset, coref, nlp, total_original_ents):
doc_key, dataset, coref, nlp, total_original_ents,
total_original_rels):
"""
Provides dual functionality for class construction. If this function is
used, be sure that the ents, bin_rels, events, and equiv_rels are
@@ -33,8 +34,9 @@ def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
self.coref = coref # True if EquivRels should be treated as corefs
self.nlp = nlp
self.dropped_ents = 0
self.dropped_rels = 0
self.total_original_ents = total_original_ents

self.total_original_rels = total_original_rels

@classmethod
def parse_ann(cls, txt, ann, nlp, dataset, coref):
@@ -88,6 +90,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
events = []
equiv_rels = []
total_original_ents = 0
total_original_rels = 0
for line in split_lines:

# The first character of the first element in the annotation
@@ -99,6 +102,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

elif line[0][0] == 'R':
bin_rels.append(BinRel(line))
total_original_rels += 1

elif line[0][0] == 'E':
events.append(Event(line))
@@ -108,7 +112,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

annotated_doc = AnnotatedDoc(text, sents, ents, bin_rels, events,
equiv_rels, doc_key, dataset, coref, nlp,
total_original_ents)
total_original_ents, total_original_rels)
annotated_doc.set_annotation_objects()

return annotated_doc
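As context for the counting above: in brat standoff format, the first character of an annotation's ID encodes its type (T for text-bound entities, R for binary relations, E for events), which is what the loop keys on. A minimal sketch of the same counting over a made-up .ann string (the annotations below are invented, not taken from any dataset):

ann = ('T1\tProtein 0 6\tkinase\n'
       'T2\tChemical 14 17\tATP\n'
       'R1\tBinds Arg1:T1 Arg2:T2\n')

total_original_ents = 0
total_original_rels = 0
for line in ann.strip().split('\n'):
    fields = line.split('\t')       # fields[0] is the annotation ID, e.g. 'T1'
    if fields[0][0] == 'T':         # text-bound (entity) annotation
        total_original_ents += 1
    elif fields[0][0] == 'R':       # binary relation annotation
        total_original_rels += 1

print(total_original_ents, total_original_rels)  # 2 1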
@@ -142,7 +146,10 @@ def format_dygiepp(self):

# Format data
ner = Ent.format_ner_dygiepp(self.ents, sent_idx_tups)
bin_rels = BinRel.format_bin_rels_dygiepp(self.bin_rels, sent_idx_tups)
bin_rels, self.dropped_rels = BinRel.format_bin_rels_dygiepp(self.bin_rels,
sent_idx_tups)
print(f'Completed relation formatting for {self.doc_key}. {self.dropped_rels} of '
f'{self.total_original_rels} relations were dropped due to tokenization mismatches.')
if len(self.equiv_rels
) > 0 and self.coref: # Some datasets don't have coreferences
corefs = EquivRel.format_corefs_dygiepp(self.equiv_rels)
@@ -231,8 +238,8 @@ def char_to_token(self):
# Set the list of entities that had token matches as ents for doc
self.ents = ent_list_tokens

print(f'Completed doc {self.doc_key}. {self.dropped_ents} of '
f'{self.total_original_ents} entities '
print(f'Completed character to token conversion for doc {self.doc_key}. '
f'{self.dropped_ents} of {self.total_original_ents} entities '
'were dropped due to tokenization mismatches.')


@@ -340,14 +347,26 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
returns:
bin_rels, list of list: dygiepp formatted relations
dropped_rels, int: number of relations that were dropped due to
entity token mismatches
"""
bin_rels = []
dropped_rels = 0
# Go through each sentence to get the relations in that sentence
for sent_start, sent_end in sent_idx_tups:

# Check first entity to see if relation is in this sentence
sent_rels = []
for rel in rel_list:
# Check to make sure both entities actually have token starts
if rel.arg1.tok_start is None or rel.arg2.tok_start is None:
warnings.warn('One or both argument entities for relation '
f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
f'(ID: {rel.ID}) were dropped due to tokenization '
'mismatches. This relation will also be dropped '
'as a result.')
dropped_rels += 1
continue
rel_start = rel.arg1.tok_start
if sent_start <= rel_start < sent_end:
sent_rels.append([
@@ -357,7 +376,7 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):

bin_rels.append(sent_rels)

return bin_rels
return bin_rels, dropped_rels


class Event:
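To make the new dropping rule concrete outside the class: a standalone sketch of the same check, using types.SimpleNamespace stand-ins for Ent and BinRel (the entity names and IDs are invented). A relation is counted as dropped whenever either argument entity failed the character-to-token conversion and so has no tok_start:

from types import SimpleNamespace
import warnings

def drop_unalignable_rels(rel_list):
    """Keep only relations whose argument entities both aligned to tokens."""
    kept, dropped = [], 0
    for rel in rel_list:
        if rel.arg1.tok_start is None or rel.arg2.tok_start is None:
            warnings.warn(f'Relation {rel.ID} dropped: an argument entity '
                          'was lost during tokenization.')
            dropped += 1
            continue
        kept.append(rel)
    return kept, dropped

# One relation whose entities both aligned, one whose second entity did not
ok = SimpleNamespace(ID='R1', label='Binds',
                     arg1=SimpleNamespace(text='kinase', tok_start=0),
                     arg2=SimpleNamespace(text='ATP', tok_start=3))
broken = SimpleNamespace(ID='R2', label='Binds',
                         arg1=SimpleNamespace(text='kinase', tok_start=0),
                         arg2=SimpleNamespace(text='ATP-gamma-S', tok_start=None))

kept, dropped = drop_unalignable_rels([ok, broken])
print(len(kept), dropped)  # 1 1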
30 changes: 25 additions & 5 deletions scripts/new-dataset/brat_to_input.py
@@ -32,6 +32,8 @@ def format_annotated_document(fname_pair, dataset_name, nlp, coref):
returns:
res, json dict: formatted data
dropped_totals, dict: numbers of original and dropped entities and
relations for the document
"""
# Make annotated doc object
annotated_doc = AnnotatedDoc.parse_ann(fname_pair[0], fname_pair[1], nlp,
@@ -43,7 +45,13 @@ def format_annotated_document(fname_pair, dataset_name, nlp, coref):
# Do the dygiepp conversion
res = annotated_doc.format_dygiepp()

return res
# Get the numbers of dropped entities and relations for this document
dropped_totals = {'dropped_ents': annotated_doc.dropped_ents,
'total_original_ents': annotated_doc.total_original_ents,
'dropped_rels': annotated_doc.dropped_rels,
'total_original_rels': annotated_doc.total_original_rels}

return res, dropped_totals


def get_paired_files(all_files):
Expand Down Expand Up @@ -101,10 +109,22 @@ def format_labeled_dataset(data_directory, output_file, dataset_name,
paired_files = get_paired_files(all_files)

# Format doc file pairs
res = [
format_annotated_document(fname_pair, dataset_name, nlp, coref)
for fname_pair in paired_files
]
overall_dropped_totals = {'dropped_ents':0, 'total_original_ents':0,
'dropped_rels':0, 'total_original_rels':0}
res = []
for fname_pair in paired_files:
r, dropped_totals = format_annotated_document(fname_pair, dataset_name, nlp, coref)
res.append(r)
overall_dropped_totals = {k: v + dropped_totals[k] for k, v in
overall_dropped_totals.items()}

print('\n\nCompleted conversion for entire dataset! '
f'{overall_dropped_totals["dropped_ents"]} of '
f'{overall_dropped_totals["total_original_ents"]} original entities '
'were dropped due to tokenization mismatches. As a result, '
f'{overall_dropped_totals["dropped_rels"]} of '
f'{overall_dropped_totals["total_original_rels"]} original relations '
'were dropped.')

# Write out doc dictionaries
with open(output_file, "w") as f:
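The per-document totals returned by format_annotated_document are folded into one running counter with a dict comprehension. A minimal sketch of just that accumulation pattern, with made-up counts standing in for real documents:

# Per-document drop counts as format_annotated_document would return them
per_doc_totals = [
    {'dropped_ents': 1, 'total_original_ents': 10,
     'dropped_rels': 0, 'total_original_rels': 4},
    {'dropped_ents': 0, 'total_original_ents': 7,
     'dropped_rels': 1, 'total_original_rels': 3},
]

overall = {'dropped_ents': 0, 'total_original_ents': 0,
           'dropped_rels': 0, 'total_original_rels': 0}
for dropped_totals in per_doc_totals:
    # Same fold as format_labeled_dataset: add this doc's counts to the totals
    overall = {k: v + dropped_totals[k] for k, v in overall.items()}

print(overall)
# {'dropped_ents': 1, 'total_original_ents': 17,
#  'dropped_rels': 1, 'total_original_rels': 7}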
