class TestDropCounters(unittest.TestCase):
    """
    Tests the functionality of the entity and relation counters in the
    AnnotatedDoc class.

    Each test parses a txt/ann fixture pair, runs the full dygiepp
    formatting pipeline, and checks the resulting
    total_original_ents/rels and dropped_ents/rels counters.
    """
    def setUp(self):

        # Set up temp dir and test docs
        self.tmpdir = "tmp"
        os.makedirs(self.tmpdir, exist_ok=True)

        simple_txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
                      "She was elected in 2017.")

        self.simple_txt = f'{self.tmpdir}/mysimplefile.txt'
        with open(self.simple_txt, 'w') as f:
            f.write(simple_txt)

        # T1 is disjoint (semicolon-separated spans) so it should be
        # dropped, taking relation R1 (which references it) with it.
        simple_ann = ("T1\tCity 0 7;13 23\tSeattle\n"
                      "T2\tPerson 25 37\tJenny Durkan\n"
                      "T3\tCity 41 51\tthe city's\n"
                      "T4\tPerson 59 62\tShe\n"
                      "T5\tPersonnel.Election 67 74\telected\n"
                      "T6\tYear 78 82\t2017\n"
                      "T7\tCity 13 23\trainy city\n"
                      "R1\tIs-A Arg1:T1 Arg2:T7\n"
                      "R2\tMayor-Of Arg1:T2 Arg2:T3\n"
                      "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
                      "*\tEQUIV T1 T3\n"
                      "*\tEQUIV T2 T4\n")

        self.simple_ann = f'{self.tmpdir}/mysimplefile.ann'
        with open(self.simple_ann, 'w') as f:
            f.write(simple_ann)

        complex_txt = ("Global target profile of the kinase inhibitor bosutinib "
                       "in primary chronic myeloid leukemia cells.\n"
                       "The detailed molecular mechanism of action of second-generation "
                       "BCR-ABL tyrosine kinase inhibitors, including perturbed targets and "
                       "pathways, should contribute to rationalized therapy in chronic "
                       "myeloid leukemia (CML) or in other affected diseases. "
                       "Here, we "
                       "characterized the target profile of the dual SRC/ABL inhibitor "
                       "bosutinib employing a two-tiered approach using chemical proteomics "
                       "to identify natural binders in whole cell lysates of primary CML "
                       "and K562 cells in parallel to in vitro kinase assays against a large "
                       "recombinant kinase panel. The combined strategy resulted in a global "
                       "survey of bosutinib targets comprised of over 45 novel tyrosine "
                       "and serine/threonine kinases. We have found clear differences in the "
                       "target patterns of bosutinib in primary CML cells versus the K562 "
                       "cell line. A comparison of bosutinib with dasatinib across the "
                       "whole kinase panel revealed overlapping, but distinct, inhibition "
                       "profiles. Common among those were the SRC, ABL and TEC family "
                       "kinases. Bosutinib did not inhibit KIT or platelet-derived growth "
                       "factor receptor, but prominently targeted the apoptosis-linked "
                       "STE20 kinases. Although in vivo bosutinib is inactive against ABL "
                       "T315I, we found this clinically important mutant to be "
                       "enzymatically inhibited in the mid-nanomolar range. "
                       "Finally, "
                       "bosutinib is the first kinase inhibitor shown to target CAMK2G, "
                       "recently implicated in myeloid leukemia cell proliferation.")

        self.complex_txt = f'{self.tmpdir}/mycomplexfile.txt'
        with open(self.complex_txt, 'w') as f:
            f.write(complex_txt)

        complex_ann = ("T10\tCHEMICAL 932 941\tdasatinib\n"
                       "T11\tCHEMICAL 1090 1099\tBosutinib\n"
                       "T12\tCHEMICAL 46 55\tbosutinib\n"
                       "T13\tGENE-Y 1116 1119\tKIT\n"
                       "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n"
                       "T15\tGENE-N 1210 1223\tSTE20 kinases\n"
                       "T16\tGENE-Y 1272 1275\tABL\n"
                       "T17\tGENE-N 1276 1281\tT315I\n"
                       "T18\tGENE-N 1415 1421\tkinase\n"
                       "T19\tGENE-Y 1448 1454\tCAMK2G\n"
                       "T1\tCHEMICAL 1242 1251\tbosutinib\n"
                       "T20\tGENE-Y 402 405\tSRC\n"
                       "T21\tGENE-Y 406 409\tABL\n"
                       "T22\tGENE-N 592 598\tkinase\n"
                       "T23\tGENE-N 634 640\tkinase\n"
                       "T24\tGENE-Y 163 166\tBCR\n"
                       "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n"
                       "T26\tGENE-Y 167 170\tABL\n"
                       "T27\tGENE-N 171 186\ttyrosine kinase\n"
                       "T28\tGENE-N 959 965\tkinase\n"
                       "T29\tGENE-Y 1057 1060\tSRC\n"
                       "T2\tCHEMICAL 1392 1401\tbosutinib\n"
                       "T30\tGENE-Y 1062 1065\tABL\n"
                       "T31\tGENE-Y 1070 1073\tTEC\n"
                       "T32\tGENE-N 1081 1088\tkinases\n"
                       "T33\tGENE-N 29 35\tkinase\n"
                       "T3\tCHEMICAL 420 429\tbosutinib\n"
                       "T4\tCHEMICAL 701 710\tbosutinib\n"
                       "T5\tCHEMICAL 746 754\ttyrosine\n"
                       "T6\tCHEMICAL 759 765\tserine\n"
                       "T7\tCHEMICAL 766 775\tthreonine\n"
                       "T8\tCHEMICAL 843 852\tbosutinib\n"
                       "T9\tCHEMICAL 917 926\tbosutinib\n"
                       "R0\tCPR:10 Arg1:T11 Arg2:T13\n"
                       "R1\tCPR:10 Arg1:T11 Arg2:T14\n"
                       "R2\tCPR:10 Arg1:T1 Arg2:T16\n"
                       "R3\tCPR:10 Arg1:T1 Arg2:T17\n"
                       "R4\tCPR:2 Arg1:T11 Arg2:T15\n"
                       "R5\tCPR:4 Arg1:T10 Arg2:T28\n"
                       "R6\tCPR:4 Arg1:T12 Arg2:T33\n"
                       "R7\tCPR:4 Arg1:T2 Arg2:T18\n"
                       "R8\tCPR:4 Arg1:T2 Arg2:T19\n"
                       "R9\tCPR:4 Arg1:T3 Arg2:T20\n"
                       "R10\tCPR:4 Arg1:T3 Arg2:T21\n"
                       "R11\tCPR:4 Arg1:T9 Arg2:T28\n")

        self.complex_ann = f'{self.tmpdir}/mycomplexfile.ann'
        with open(self.complex_ann, 'w') as f:
            f.write(complex_ann)

        # Define other attributes
        self.nlp = spacy.load("en_core_web_sm")
        self.scinlp = spacy.load("en_core_sci_sm")
        self.dataset = 'scierc'

    def tearDown(self):

        # Remove the temp dir and the fixture files in it so one run's
        # fixtures cannot leak into the next (setUp recreates them).
        import shutil
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def _parse_and_format(self, txt, ann, nlp):
        """
        Run the full parse -> char_to_token -> format_dygiepp pipeline on
        a txt/ann fixture pair and return the AnnotatedDoc. The counters
        under test are only fully populated after format_dygiepp runs.
        """
        annotated_doc = ad.AnnotatedDoc.parse_ann(txt,
                                                  ann,
                                                  nlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        # Called for its side effect on the drop counters; the formatted
        # dict itself is not needed here.
        annotated_doc.format_dygiepp()
        return annotated_doc

    def test_entity_counters_simple(self):

        annotated_doc = self._parse_and_format(self.simple_txt,
                                               self.simple_ann, self.nlp)
        self.assertEqual(annotated_doc.total_original_ents, 7)
        self.assertEqual(annotated_doc.dropped_ents, 1)

    def test_relation_counters_simple(self):

        annotated_doc = self._parse_and_format(self.simple_txt,
                                               self.simple_ann, self.nlp)
        self.assertEqual(annotated_doc.total_original_rels, 2)
        self.assertEqual(annotated_doc.dropped_rels, 1)

    def test_entity_counters_complex(self):

        annotated_doc = self._parse_and_format(self.complex_txt,
                                               self.complex_ann, self.scinlp)
        self.assertEqual(annotated_doc.total_original_ents, 33)
        self.assertEqual(annotated_doc.dropped_ents, 6)

    def test_relation_counters_complex(self):

        annotated_doc = self._parse_and_format(self.complex_txt,
                                               self.complex_ann, self.scinlp)
        self.assertEqual(annotated_doc.total_original_rels, 12)
        self.assertEqual(annotated_doc.dropped_rels, 2)


if __name__ == "__main__":
    unittest.main()
doc_key, dataset, coref, nlp, total_original_ents, + doc_key, dataset, coref, nlp, dropped_ents, total_original_ents, total_original_rels, total_original_equiv_rels, total_original_events): """ @@ -34,7 +34,7 @@ def __init__(self, text, sents, ents, bin_rels, events, equiv_rels, self.dataset = dataset self.coref = coref # True if EquivRels should be treated as corefs self.nlp = nlp - self.dropped_ents = 0 + self.dropped_ents = dropped_ents self.dropped_rels = 0 self.dropped_equiv_rels = 0 self.dropped_events = 0 @@ -74,11 +74,13 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref): # Drop discontinuous ents by looking for semicolons before second \t lines_continuous = [] + discont_dropped = 0 for line in lines: if line[0] == 'T': second_tab = line.rfind('\t') if ';' in line[:second_tab]: idx = line[:line.index("\t")] + discont_dropped += 1 warnings.warn(f'Entity "{line[second_tab:]}" (ID: ' f'{idx}) is disjoint, and will be dropped.') else: @@ -94,7 +96,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref): bin_rels = [] events = [] equiv_rels = [] - total_original_ents = 0 + total_original_ents = discont_dropped total_original_rels = 0 total_original_equiv_rels = 0 total_original_events = 0 @@ -121,8 +123,8 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref): annotated_doc = AnnotatedDoc(text, sents, ents, bin_rels, events, equiv_rels, doc_key, dataset, coref, nlp, - total_original_ents, total_original_rels, - total_original_equiv_rels, + discont_dropped, total_original_ents, + total_original_rels, total_original_equiv_rels, total_original_events) annotated_doc.set_annotation_objects() @@ -386,7 +388,10 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups): entity token mismatches """ bin_rels = [] - dropped_rels = 0 + dropped_rels_list = [] + dropped_rel_warnings = [] + num_its = 0 + # Go through each sentence to get the relations in that sentence for sent_start, sent_end in sent_idx_tups: @@ -395,22 +400,22 @@ def format_bin_rels_dygiepp(rel_list, 
sent_idx_tups): for rel in rel_list: # Check to see if either entity as dropped because disjoint if rel.arg1 is None or rel.arg2 is None: - warnings.warn( + dropped_rel_warnings.append( 'One or more of the argument entities for ' f'relation {rel.ID} was dropped because it was ' 'disjoint. This relation will also be dropped as ' 'a result.') - dropped_rels += 1 + dropped_rels_list.append(rel) continue # Check to make sure both entities actually have token starts if rel.arg1.tok_start == None or rel.arg2.tok_start == None: - warnings.warn( + dropped_rel_warnings.append( 'Either the start or end token for relation ' f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} ' f'(ID: {rel.ID}) was dropped due to tokenization ' 'mismatches. This relation will also be dropped ' 'as a result.') - dropped_rels += 1 + dropped_rels_list.append(rel) continue rel_start = rel.arg1.tok_start if sent_start <= rel_start < sent_end: @@ -421,6 +426,12 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups): bin_rels.append(sent_rels) + dropped_rels = len(list(set(dropped_rels_list))) + + unique_warnings = list(set(dropped_rel_warnings)) + for wa in unique_warnings: + warnings.warn(wa) + return bin_rels, dropped_rels @@ -491,7 +502,8 @@ def format_events_dygiepp(event_list, sent_idx_tups): mismatches """ events = [] - dropped_events = 0 + dropped_events_list = [] + dropped_event_warnings = [] # Go through each sentence to get the relations in that sentence for sent_start, sent_end in sent_idx_tups: @@ -503,12 +515,12 @@ def format_events_dygiepp(event_list, sent_idx_tups): # have token starts # First, check the trigger if event.trigger.tok_start == None or event.trigger.tok_end == None: - warnings.warn( + dropped_event_warnings.append( f'The trigger for event ID: {event.ID} ' f'(trigger: {event.trigger.text} was dropped due ' 'to tokenization mismatches. 
This event will be ' 'dropped as a result.') - dropped_events += 1 + dropped_events_list.append(event) continue # Then check all the arguments in the event any_missing_arg = False @@ -516,11 +528,11 @@ def format_events_dygiepp(event_list, sent_idx_tups): if arg_obj.tok_start == None or arg_obj.tok_end == None: any_missing_arg = True if any_missing_arg: - warnings.warn( + dropped_event_warnings.append( f'One or more arguments for event ID: ' f'{event.ID} were dropped due to tokenization mismatches. ' 'This event will be dropped as a result.') - dropped_events += 1 + dropped_events_list.append(event) continue # Check if event is in sentence @@ -558,6 +570,12 @@ def format_events_dygiepp(event_list, sent_idx_tups): sent_events.append(formatted_event) events.append(sent_events) + + dropped_events = len(list(set(dropped_events_list))) + + unique_warnings = list(set(dropped_event_warnings)) + for wa in unique_warnings: + warnings.warn(wa) return events, dropped_events