Skip to content

Commit

Permalink
Fix bug in relations, entity and event counters
Browse files Browse the repository at this point in the history
  • Loading branch information
serenalotreck committed Jun 27, 2023
1 parent 1f4865a commit 34d82ca
Show file tree
Hide file tree
Showing 2 changed files with 200 additions and 15 deletions.
167 changes: 167 additions & 0 deletions dygie/tests/data/annotated_doc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,173 @@ def test_format_dygiepp(self):
res = annotated_doc.format_dygiepp()
self.assertEqual(res, self.dygiepp_dict)

class TestDropCounters(unittest.TestCase):
    """
    Tests the functionality of the entity and relation counters in the
    AnnotatedDoc class.

    Uses two fixtures: a short toy document ("simple") with a deliberately
    disjoint entity (T1, "0 7;13 23") that should be dropped, and a longer
    biomedical abstract ("complex") exercising the counters on realistic
    ChemProt-style annotations.
    """
    def setUp(self):
        """Write the simple and complex .txt/.ann fixture files to a temp dir."""

        # Set up temp dir and test docs
        self.tmpdir = "tmp"
        os.makedirs(self.tmpdir, exist_ok=True)

        simple_txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
                "She was elected in 2017.")

        self.simple_txt = f'{self.tmpdir}/mysimplefile.txt'
        with open(self.simple_txt, 'w') as f:
            f.write(simple_txt)

        # T1 has a discontinuous span ("0 7;13 23") and should be dropped,
        # taking relation R1 (which references T1) down with it.
        simple_ann = ("T1\tCity 0 7;13 23\tSeattle\n"
                "T2\tPerson 25 37\tJenny Durkan\n"
                "T3\tCity 41 51\tthe city's\n"
                "T4\tPerson 59 62\tShe\n"
                "T5\tPersonnel.Election 67 74\telected\n"
                "T6\tYear 78 82\t2017\n"
                "T7\tCity 13 23\trainy city\n"
                "R1\tIs-A Arg1:T1 Arg2:T7\n"
                "R2\tMayor-Of Arg1:T2 Arg2:T3\n"
                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
                "*\tEQUIV T1 T3\n"
                "*\tEQUIV T2 T4\n")

        self.simple_ann = f'{self.tmpdir}/mysimplefile.ann'
        with open(self.simple_ann, 'w') as f:
            f.write(simple_ann)

        complex_txt = ("Global target profile of the kinase inhibitor bosutinib "
                "in primary chronic myeloid leukemia cells.\n"
                "The detailed molecular mechanism of action of second-generation "
                "BCR-ABL tyrosine kinase inhibitors, including perturbed targets and "
                "pathways, should contribute to rationalized therapy in chronic "
                "myeloid leukemia (CML) or in other affected diseases. Here, we "
                "characterized the target profile of the dual SRC/ABL inhibitor "
                "bosutinib employing a two-tiered approach using chemical proteomics "
                "to identify natural binders in whole cell lysates of primary CML "
                "and K562 cells in parallel to in vitro kinase assays against a large "
                "recombinant kinase panel. The combined strategy resulted in a global "
                "survey of bosutinib targets comprised of over 45 novel tyrosine "
                "and serine/threonine kinases. We have found clear differences in the "
                "target patterns of bosutinib in primary CML cells versus the K562 "
                "cell line. A comparison of bosutinib with dasatinib across the "
                "whole kinase panel revealed overlapping, but distinct, inhibition "
                "profiles. Common among those were the SRC, ABL and TEC family "
                "kinases. Bosutinib did not inhibit KIT or platelet-derived growth "
                "factor receptor, but prominently targeted the apoptosis-linked "
                "STE20 kinases. Although in vivo bosutinib is inactive against ABL "
                "T315I, we found this clinically important mutant to be "
                "enzymatically inhibited in the mid-nanomolar range. Finally, "
                "bosutinib is the first kinase inhibitor shown to target CAMK2G, "
                "recently implicated in myeloid leukemia cell proliferation.")

        self.complex_txt = f'{self.tmpdir}/mycomplexfile.txt'
        with open(self.complex_txt, 'w') as f:
            f.write(complex_txt)

        complex_ann = ("T10\tCHEMICAL 932 941\tdasatinib\n"
                "T11\tCHEMICAL 1090 1099\tBosutinib\n"
                "T12\tCHEMICAL 46 55\tbosutinib\n"
                "T13\tGENE-Y 1116 1119\tKIT\n"
                "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n"
                "T15\tGENE-N 1210 1223\tSTE20 kinases\n"
                "T16\tGENE-Y 1272 1275\tABL\n"
                "T17\tGENE-N 1276 1281\tT315I\n"
                "T18\tGENE-N 1415 1421\tkinase\n"
                "T19\tGENE-Y 1448 1454\tCAMK2G\n"
                "T1\tCHEMICAL 1242 1251\tbosutinib\n"
                "T20\tGENE-Y 402 405\tSRC\n"
                "T21\tGENE-Y 406 409\tABL\n"
                "T22\tGENE-N 592 598\tkinase\n"
                "T23\tGENE-N 634 640\tkinase\n"
                "T24\tGENE-Y 163 166\tBCR\n"
                "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n"
                "T26\tGENE-Y 167 170\tABL\n"
                "T27\tGENE-N 171 186\ttyrosine kinase\n"
                "T28\tGENE-N 959 965\tkinase\n"
                "T29\tGENE-Y 1057 1060\tSRC\n"
                "T2\tCHEMICAL 1392 1401\tbosutinib\n"
                "T30\tGENE-Y 1062 1065\tABL\n"
                "T31\tGENE-Y 1070 1073\tTEC\n"
                "T32\tGENE-N 1081 1088\tkinases\n"
                "T33\tGENE-N 29 35\tkinase\n"
                "T3\tCHEMICAL 420 429\tbosutinib\n"
                "T4\tCHEMICAL 701 710\tbosutinib\n"
                "T5\tCHEMICAL 746 754\ttyrosine\n"
                "T6\tCHEMICAL 759 765\tserine\n"
                "T7\tCHEMICAL 766 775\tthreonine\n"
                "T8\tCHEMICAL 843 852\tbosutinib\n"
                "T9\tCHEMICAL 917 926\tbosutinib\n"
                "R0\tCPR:10 Arg1:T11 Arg2:T13\n"
                "R1\tCPR:10 Arg1:T11 Arg2:T14\n"
                "R2\tCPR:10 Arg1:T1 Arg2:T16\n"
                "R3\tCPR:10 Arg1:T1 Arg2:T17\n"
                "R4\tCPR:2 Arg1:T11 Arg2:T15\n"
                "R5\tCPR:4 Arg1:T10 Arg2:T28\n"
                "R6\tCPR:4 Arg1:T12 Arg2:T33\n"
                "R7\tCPR:4 Arg1:T2 Arg2:T18\n"
                "R8\tCPR:4 Arg1:T2 Arg2:T19\n"
                "R9\tCPR:4 Arg1:T3 Arg2:T20\n"
                "R10\tCPR:4 Arg1:T3 Arg2:T21\n"
                "R11\tCPR:4 Arg1:T9 Arg2:T28\n")

        self.complex_ann = f'{self.tmpdir}/mycomplexfile.ann'
        with open(self.complex_ann, 'w') as f:
            f.write(complex_ann)

        # Define other attributes
        self.nlp = spacy.load("en_core_web_sm")
        self.scinlp = spacy.load("en_core_sci_sm")
        self.dataset = 'scierc'

    def tearDown(self):
        """Remove the temp dir and all fixture files written by setUp."""
        import shutil  # local import: only needed for cleanup
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_entity_counters_simple(self):
        """Simple doc: 7 original entities, 1 dropped (disjoint T1)."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.simple_txt,
                                                  self.simple_ann,
                                                  self.nlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        # Counters are populated as a side effect of formatting; the
        # returned dygiepp dict itself is not under test here.
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_ents, 7)
        self.assertEqual(annotated_doc.dropped_ents, 1)

    def test_relation_counters_simple(self):
        """Simple doc: 2 original relations, 1 dropped (R1 references T1)."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.simple_txt,
                                                  self.simple_ann,
                                                  self.nlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_rels, 2)
        self.assertEqual(annotated_doc.dropped_rels, 1)

    def test_entity_counters_complex(self):
        """Complex doc: 33 original entities, 6 dropped (token mismatches)."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.complex_txt,
                                                  self.complex_ann,
                                                  self.scinlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_ents, 33)
        self.assertEqual(annotated_doc.dropped_ents, 6)

    def test_relation_counters_complex(self):
        """Complex doc: 12 original relations, 2 dropped with their entities."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.complex_txt,
                                                  self.complex_ann,
                                                  self.scinlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_rels, 12)
        self.assertEqual(annotated_doc.dropped_rels, 2)


# Run the full test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
48 changes: 33 additions & 15 deletions scripts/new-dataset/annotated_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class AnnotatedDocError(Exception):

class AnnotatedDoc:
def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
doc_key, dataset, coref, nlp, total_original_ents,
doc_key, dataset, coref, nlp, dropped_ents, total_original_ents,
total_original_rels, total_original_equiv_rels,
total_original_events):
"""
Expand All @@ -34,7 +34,7 @@ def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
self.dataset = dataset
self.coref = coref # True if EquivRels should be treated as corefs
self.nlp = nlp
self.dropped_ents = 0
self.dropped_ents = dropped_ents
self.dropped_rels = 0
self.dropped_equiv_rels = 0
self.dropped_events = 0
Expand Down Expand Up @@ -74,11 +74,13 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

# Drop discontinuous ents by looking for semicolons before second \t
lines_continuous = []
discont_dropped = 0
for line in lines:
if line[0] == 'T':
second_tab = line.rfind('\t')
if ';' in line[:second_tab]:
idx = line[:line.index("\t")]
discont_dropped += 1
warnings.warn(f'Entity "{line[second_tab:]}" (ID: '
f'{idx}) is disjoint, and will be dropped.')
else:
Expand All @@ -94,7 +96,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
bin_rels = []
events = []
equiv_rels = []
total_original_ents = 0
total_original_ents = discont_dropped
total_original_rels = 0
total_original_equiv_rels = 0
total_original_events = 0
Expand All @@ -121,8 +123,8 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

annotated_doc = AnnotatedDoc(text, sents, ents, bin_rels, events,
equiv_rels, doc_key, dataset, coref, nlp,
total_original_ents, total_original_rels,
total_original_equiv_rels,
discont_dropped, total_original_ents,
total_original_rels, total_original_equiv_rels,
total_original_events)
annotated_doc.set_annotation_objects()

Expand Down Expand Up @@ -386,7 +388,10 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
entity token mismatches
"""
bin_rels = []
dropped_rels = 0
dropped_rels_list = []
dropped_rel_warnings = []
num_its = 0

# Go through each sentence to get the relations in that sentence
for sent_start, sent_end in sent_idx_tups:

Expand All @@ -395,22 +400,22 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
for rel in rel_list:
# Check to see if either entity as dropped because disjoint
if rel.arg1 is None or rel.arg2 is None:
warnings.warn(
dropped_rel_warnings.append(
'One or more of the argument entities for '
f'relation {rel.ID} was dropped because it was '
'disjoint. This relation will also be dropped as '
'a result.')
dropped_rels += 1
dropped_rels_list.append(rel)
continue
# Check to make sure both entities actually have token starts
if rel.arg1.tok_start == None or rel.arg2.tok_start == None:
warnings.warn(
dropped_rel_warnings.append(
'Either the start or end token for relation '
f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
f'(ID: {rel.ID}) was dropped due to tokenization '
'mismatches. This relation will also be dropped '
'as a result.')
dropped_rels += 1
dropped_rels_list.append(rel)
continue
rel_start = rel.arg1.tok_start
if sent_start <= rel_start < sent_end:
Expand All @@ -421,6 +426,12 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):

bin_rels.append(sent_rels)

dropped_rels = len(list(set(dropped_rels_list)))

unique_warnings = list(set(dropped_rel_warnings))
for wa in unique_warnings:
warnings.warn(wa)

return bin_rels, dropped_rels


Expand Down Expand Up @@ -491,7 +502,8 @@ def format_events_dygiepp(event_list, sent_idx_tups):
mismatches
"""
events = []
dropped_events = 0
dropped_events_list = []
dropped_event_warnings = []
# Go through each sentence to get the relations in that sentence
for sent_start, sent_end in sent_idx_tups:

Expand All @@ -503,24 +515,24 @@ def format_events_dygiepp(event_list, sent_idx_tups):
# have token starts
# First, check the trigger
if event.trigger.tok_start == None or event.trigger.tok_end == None:
warnings.warn(
dropped_event_warnings.append(
f'The trigger for event ID: {event.ID} '
f'(trigger: {event.trigger.text} was dropped due '
'to tokenization mismatches. This event will be '
'dropped as a result.')
dropped_events += 1
dropped_events_list.append(event)
continue
# Then check all the arguments in the event
any_missing_arg = False
for arg_obj in event.args:
if arg_obj.tok_start == None or arg_obj.tok_end == None:
any_missing_arg = True
if any_missing_arg:
warnings.warn(
dropped_event_warnings.append(
f'One or more arguments for event ID: '
f'{event.ID} were dropped due to tokenization mismatches. '
'This event will be dropped as a result.')
dropped_events += 1
dropped_events_list.append(event)
continue

# Check if event is in sentence
Expand Down Expand Up @@ -558,6 +570,12 @@ def format_events_dygiepp(event_list, sent_idx_tups):
sent_events.append(formatted_event)

events.append(sent_events)

dropped_events = len(list(set(dropped_events_list)))

unique_warnings = list(set(dropped_event_warnings))
for wa in unique_warnings:
warnings.warn(wa)

return events, dropped_events

Expand Down

0 comments on commit 34d82ca

Please sign in to comment.