Skip to content

Commit

Permalink
Fix bug in relations, entity and event counters
Browse files Browse the repository at this point in the history
  • Loading branch information
serenalotreck committed Jun 27, 2023
1 parent 1f4865a commit 34d82ca
Show file tree
Hide file tree
Showing 2 changed files with 200 additions and 15 deletions.
167 changes: 167 additions & 0 deletions dygie/tests/data/annotated_doc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,173 @@ def test_format_dygiepp(self):
res = annotated_doc.format_dygiepp()
self.assertEqual(res, self.dygiepp_dict)

class TestDropCounters(unittest.TestCase):
    """
    Tests the functionality of the entity and relation counters in the
    AnnotatedDoc class.

    Uses two fixtures: a short toy document ("simple") with a deliberately
    disjoint entity (T1, "0 7;13 23") that should be dropped, and a longer
    biomedical abstract ("complex") exercising the counters on realistic
    ChemProt-style annotations.
    """
    def setUp(self):
        """Write the simple and complex .txt/.ann fixture files to a temp dir."""

        # Set up temp dir and test docs
        self.tmpdir = "tmp"
        os.makedirs(self.tmpdir, exist_ok=True)

        simple_txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
                "She was elected in 2017.")

        self.simple_txt = f'{self.tmpdir}/mysimplefile.txt'
        with open(self.simple_txt, 'w') as f:
            f.write(simple_txt)

        # T1 has a discontinuous span ("0 7;13 23") and should be dropped,
        # taking relation R1 (which references T1) down with it.
        simple_ann = ("T1\tCity 0 7;13 23\tSeattle\n"
                "T2\tPerson 25 37\tJenny Durkan\n"
                "T3\tCity 41 51\tthe city's\n"
                "T4\tPerson 59 62\tShe\n"
                "T5\tPersonnel.Election 67 74\telected\n"
                "T6\tYear 78 82\t2017\n"
                "T7\tCity 13 23\trainy city\n"
                "R1\tIs-A Arg1:T1 Arg2:T7\n"
                "R2\tMayor-Of Arg1:T2 Arg2:T3\n"
                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
                "*\tEQUIV T1 T3\n"
                "*\tEQUIV T2 T4\n")

        self.simple_ann = f'{self.tmpdir}/mysimplefile.ann'
        with open(self.simple_ann, 'w') as f:
            f.write(simple_ann)

        complex_txt = ("Global target profile of the kinase inhibitor bosutinib "
                "in primary chronic myeloid leukemia cells.\n"
                "The detailed molecular mechanism of action of second-generation "
                "BCR-ABL tyrosine kinase inhibitors, including perturbed targets and "
                "pathways, should contribute to rationalized therapy in chronic "
                "myeloid leukemia (CML) or in other affected diseases. Here, we "
                "characterized the target profile of the dual SRC/ABL inhibitor "
                "bosutinib employing a two-tiered approach using chemical proteomics "
                "to identify natural binders in whole cell lysates of primary CML "
                "and K562 cells in parallel to in vitro kinase assays against a large "
                "recombinant kinase panel. The combined strategy resulted in a global "
                "survey of bosutinib targets comprised of over 45 novel tyrosine "
                "and serine/threonine kinases. We have found clear differences in the "
                "target patterns of bosutinib in primary CML cells versus the K562 "
                "cell line. A comparison of bosutinib with dasatinib across the "
                "whole kinase panel revealed overlapping, but distinct, inhibition "
                "profiles. Common among those were the SRC, ABL and TEC family "
                "kinases. Bosutinib did not inhibit KIT or platelet-derived growth "
                "factor receptor, but prominently targeted the apoptosis-linked "
                "STE20 kinases. Although in vivo bosutinib is inactive against ABL "
                "T315I, we found this clinically important mutant to be "
                "enzymatically inhibited in the mid-nanomolar range. Finally, "
                "bosutinib is the first kinase inhibitor shown to target CAMK2G, "
                "recently implicated in myeloid leukemia cell proliferation.")

        self.complex_txt = f'{self.tmpdir}/mycomplexfile.txt'
        with open(self.complex_txt, 'w') as f:
            f.write(complex_txt)

        complex_ann = ("T10\tCHEMICAL 932 941\tdasatinib\n"
                "T11\tCHEMICAL 1090 1099\tBosutinib\n"
                "T12\tCHEMICAL 46 55\tbosutinib\n"
                "T13\tGENE-Y 1116 1119\tKIT\n"
                "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n"
                "T15\tGENE-N 1210 1223\tSTE20 kinases\n"
                "T16\tGENE-Y 1272 1275\tABL\n"
                "T17\tGENE-N 1276 1281\tT315I\n"
                "T18\tGENE-N 1415 1421\tkinase\n"
                "T19\tGENE-Y 1448 1454\tCAMK2G\n"
                "T1\tCHEMICAL 1242 1251\tbosutinib\n"
                "T20\tGENE-Y 402 405\tSRC\n"
                "T21\tGENE-Y 406 409\tABL\n"
                "T22\tGENE-N 592 598\tkinase\n"
                "T23\tGENE-N 634 640\tkinase\n"
                "T24\tGENE-Y 163 166\tBCR\n"
                "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n"
                "T26\tGENE-Y 167 170\tABL\n"
                "T27\tGENE-N 171 186\ttyrosine kinase\n"
                "T28\tGENE-N 959 965\tkinase\n"
                "T29\tGENE-Y 1057 1060\tSRC\n"
                "T2\tCHEMICAL 1392 1401\tbosutinib\n"
                "T30\tGENE-Y 1062 1065\tABL\n"
                "T31\tGENE-Y 1070 1073\tTEC\n"
                "T32\tGENE-N 1081 1088\tkinases\n"
                "T33\tGENE-N 29 35\tkinase\n"
                "T3\tCHEMICAL 420 429\tbosutinib\n"
                "T4\tCHEMICAL 701 710\tbosutinib\n"
                "T5\tCHEMICAL 746 754\ttyrosine\n"
                "T6\tCHEMICAL 759 765\tserine\n"
                "T7\tCHEMICAL 766 775\tthreonine\n"
                "T8\tCHEMICAL 843 852\tbosutinib\n"
                "T9\tCHEMICAL 917 926\tbosutinib\n"
                "R0\tCPR:10 Arg1:T11 Arg2:T13\n"
                "R1\tCPR:10 Arg1:T11 Arg2:T14\n"
                "R2\tCPR:10 Arg1:T1 Arg2:T16\n"
                "R3\tCPR:10 Arg1:T1 Arg2:T17\n"
                "R4\tCPR:2 Arg1:T11 Arg2:T15\n"
                "R5\tCPR:4 Arg1:T10 Arg2:T28\n"
                "R6\tCPR:4 Arg1:T12 Arg2:T33\n"
                "R7\tCPR:4 Arg1:T2 Arg2:T18\n"
                "R8\tCPR:4 Arg1:T2 Arg2:T19\n"
                "R9\tCPR:4 Arg1:T3 Arg2:T20\n"
                "R10\tCPR:4 Arg1:T3 Arg2:T21\n"
                "R11\tCPR:4 Arg1:T9 Arg2:T28\n")

        self.complex_ann = f'{self.tmpdir}/mycomplexfile.ann'
        with open(self.complex_ann, 'w') as f:
            f.write(complex_ann)

        # Define other attributes
        self.nlp = spacy.load("en_core_web_sm")
        self.scinlp = spacy.load("en_core_sci_sm")
        self.dataset = 'scierc'

    def tearDown(self):
        """Remove the temp dir and all fixture files written by setUp."""
        import shutil  # local import: only needed for cleanup
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_entity_counters_simple(self):
        """Simple doc: 7 original entities, 1 dropped (disjoint T1)."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.simple_txt,
                                                  self.simple_ann,
                                                  self.nlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        # Counters are populated as a side effect of formatting; the
        # returned dygiepp dict itself is not under test here.
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_ents, 7)
        self.assertEqual(annotated_doc.dropped_ents, 1)

    def test_relation_counters_simple(self):
        """Simple doc: 2 original relations, 1 dropped (R1 references T1)."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.simple_txt,
                                                  self.simple_ann,
                                                  self.nlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_rels, 2)
        self.assertEqual(annotated_doc.dropped_rels, 1)

    def test_entity_counters_complex(self):
        """Complex doc: 33 original entities, 6 dropped (token mismatches)."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.complex_txt,
                                                  self.complex_ann,
                                                  self.scinlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_ents, 33)
        self.assertEqual(annotated_doc.dropped_ents, 6)

    def test_relation_counters_complex(self):
        """Complex doc: 12 original relations, 2 dropped with their entities."""
        annotated_doc = ad.AnnotatedDoc.parse_ann(self.complex_txt,
                                                  self.complex_ann,
                                                  self.scinlp,
                                                  self.dataset,
                                                  coref=True)
        annotated_doc.char_to_token()
        annotated_doc.format_dygiepp()
        self.assertEqual(annotated_doc.total_original_rels, 12)
        self.assertEqual(annotated_doc.dropped_rels, 2)


# Run the full test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
48 changes: 33 additions & 15 deletions scripts/new-dataset/annotated_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class AnnotatedDocError(Exception):

class AnnotatedDoc:
def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
doc_key, dataset, coref, nlp, total_original_ents,
doc_key, dataset, coref, nlp, dropped_ents, total_original_ents,
total_original_rels, total_original_equiv_rels,
total_original_events):
"""
Expand All @@ -34,7 +34,7 @@ def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
self.dataset = dataset
self.coref = coref # True if EquivRels should be treated as corefs
self.nlp = nlp
self.dropped_ents = 0
self.dropped_ents = dropped_ents
self.dropped_rels = 0
self.dropped_equiv_rels = 0
self.dropped_events = 0
Expand Down Expand Up @@ -74,11 +74,13 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

# Drop discontinuous ents by looking for semicolons before second \t
lines_continuous = []
discont_dropped = 0
for line in lines:
if line[0] == 'T':
second_tab = line.rfind('\t')
if ';' in line[:second_tab]:
idx = line[:line.index("\t")]
discont_dropped += 1
warnings.warn(f'Entity "{line[second_tab:]}" (ID: '
f'{idx}) is disjoint, and will be dropped.')
else:
Expand All @@ -94,7 +96,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
bin_rels = []
events = []
equiv_rels = []
total_original_ents = 0
total_original_ents = discont_dropped
total_original_rels = 0
total_original_equiv_rels = 0
total_original_events = 0
Expand All @@ -121,8 +123,8 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):

annotated_doc = AnnotatedDoc(text, sents, ents, bin_rels, events,
equiv_rels, doc_key, dataset, coref, nlp,
total_original_ents, total_original_rels,
total_original_equiv_rels,
discont_dropped, total_original_ents,
total_original_rels, total_original_equiv_rels,
total_original_events)
annotated_doc.set_annotation_objects()

Expand Down Expand Up @@ -386,7 +388,10 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
entity token mismatches
"""
bin_rels = []
dropped_rels = 0
dropped_rels_list = []
dropped_rel_warnings = []
num_its = 0

# Go through each sentence to get the relations in that sentence
for sent_start, sent_end in sent_idx_tups:

Expand All @@ -395,22 +400,22 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
for rel in rel_list:
# Check to see if either entity as dropped because disjoint
if rel.arg1 is None or rel.arg2 is None:
warnings.warn(
dropped_rel_warnings.append(
'One or more of the argument entities for '
f'relation {rel.ID} was dropped because it was '
'disjoint. This relation will also be dropped as '
'a result.')
dropped_rels += 1
dropped_rels_list.append(rel)
continue
# Check to make sure both entities actually have token starts
if rel.arg1.tok_start == None or rel.arg2.tok_start == None:
warnings.warn(
dropped_rel_warnings.append(
'Either the start or end token for relation '
f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
f'(ID: {rel.ID}) was dropped due to tokenization '
'mismatches. This relation will also be dropped '
'as a result.')
dropped_rels += 1
dropped_rels_list.append(rel)
continue
rel_start = rel.arg1.tok_start
if sent_start <= rel_start < sent_end:
Expand All @@ -421,6 +426,12 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):

bin_rels.append(sent_rels)

dropped_rels = len(list(set(dropped_rels_list)))

unique_warnings = list(set(dropped_rel_warnings))
for wa in unique_warnings:
warnings.warn(wa)

return bin_rels, dropped_rels


Expand Down Expand Up @@ -491,7 +502,8 @@ def format_events_dygiepp(event_list, sent_idx_tups):
mismatches
"""
events = []
dropped_events = 0
dropped_events_list = []
dropped_event_warnings = []
# Go through each sentence to get the relations in that sentence
for sent_start, sent_end in sent_idx_tups:

Expand All @@ -503,24 +515,24 @@ def format_events_dygiepp(event_list, sent_idx_tups):
# have token starts
# First, check the trigger
if event.trigger.tok_start == None or event.trigger.tok_end == None:
warnings.warn(
dropped_event_warnings.append(
f'The trigger for event ID: {event.ID} '
f'(trigger: {event.trigger.text} was dropped due '
'to tokenization mismatches. This event will be '
'dropped as a result.')
dropped_events += 1
dropped_events_list.append(event)
continue
# Then check all the arguments in the event
any_missing_arg = False
for arg_obj in event.args:
if arg_obj.tok_start == None or arg_obj.tok_end == None:
any_missing_arg = True
if any_missing_arg:
warnings.warn(
dropped_event_warnings.append(
f'One or more arguments for event ID: '
f'{event.ID} were dropped due to tokenization mismatches. '
'This event will be dropped as a result.')
dropped_events += 1
dropped_events_list.append(event)
continue

# Check if event is in sentence
Expand Down Expand Up @@ -558,6 +570,12 @@ def format_events_dygiepp(event_list, sent_idx_tups):
sent_events.append(formatted_event)

events.append(sent_events)

dropped_events = len(list(set(dropped_events_list)))

unique_warnings = list(set(dropped_event_warnings))
for wa in unique_warnings:
warnings.warn(wa)

return events, dropped_events

Expand Down

0 comments on commit 34d82ca

Please sign in to comment.