diff --git a/dygie/tests/data/annotated_doc_test.py b/dygie/tests/data/annotated_doc_test.py
index afa6fbc..4a1db16 100644
--- a/dygie/tests/data/annotated_doc_test.py
+++ b/dygie/tests/data/annotated_doc_test.py
@@ -110,7 +110,6 @@ def setUp(self):
                                                      nlp,
                                                      dataset,
                                                      coref=True)
-        self.annotated_doc.char_to_token()
 
         # Set up relation
         self.rel1 = ad.BinRel("R1\tMayor-Of Arg1:T2 Arg2:T3".split())
@@ -118,10 +117,36 @@ def setUp(self):
         # Right answer
         self.relations = [[], [[6, 7, 9, 11, "Mayor-Of"]], []]
 
+        # Missing entity annotations
+        missing_ann = ("T1\tCity 0 7\tSeattle\n"
+                       "T2\tPerson 22 37\tJenny Durkan\n"
+                       "T3\tCity 41 51\tthe city's\n"
+                       "T4\tPerson 59 62\tShe\n"
+                       "T5\tPersonnel.Election 67 74\telected\n"
+                       "T6\tYear 78 82\t2017\n"
+                       "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
+                       "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                       "*\tEQUIV T1 T3\n"
+                       "*\tEQUIV T2 T4\n")
+        missing_ann_path = f'{self.tmpdir}/missing_myfile.ann'
+        with open(missing_ann_path, 'w') as f:
+            f.write(missing_ann)
+
+        # Set up annotated_doc object
+        self.missing_annotated_doc = ad.AnnotatedDoc.parse_ann(
+            text_path, missing_ann_path, nlp, dataset, coref=True)
+
+        # Right answer
+        self.missing_relations = [[], [], []]
+
     def tearDown(self):
         shutil.rmtree(self.tmpdir)
 
+    # set_arg_objects is always called *before* char_to_token
+    # They will fail if run in the opposite order with entities that get
+    # dropped, but if they are only used with brat_to_input.py, the order is
+    # baked in and therefore safe
     def test_set_arg_objects(self):
 
         self.rel1.set_arg_objects(self.annotated_doc.ents)
@@ -129,14 +154,31 @@
         self.assertEqual(self.rel1.arg1, self.annotated_doc.ents[1])
         self.assertEqual(self.rel1.arg2, self.annotated_doc.ents[2])
 
+    def test_set_arg_objects_missing_arg(self):
+
+        self.rel1.set_arg_objects(self.missing_annotated_doc.ents)
+
+        self.assertEqual(self.rel1.arg1, self.missing_annotated_doc.ents[1])
+        self.assertEqual(self.rel1.arg2, self.missing_annotated_doc.ents[2])
+
     def test_format_bin_rels_dygiepp(self):
 
         self.rel1.set_arg_objects(self.annotated_doc.ents)
-        relations = ad.BinRel.format_bin_rels_dygiepp([self.rel1],
-                                                      self.sent_idx_tups)
+        self.annotated_doc.char_to_token()
+        relations, dropped_rels = ad.BinRel.format_bin_rels_dygiepp(
+            [self.rel1], self.sent_idx_tups)
 
         self.assertEqual(relations, self.relations)
 
+    def test_format_bin_rels_dygiepp_missing_arg(self):
+
+        self.rel1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.missing_annotated_doc.char_to_token()
+        relations, dropped_rels = ad.BinRel.format_bin_rels_dygiepp(
+            [self.rel1], self.sent_idx_tups)
+
+        self.assertEqual(relations, self.missing_relations)
+
 
 class TestEvent(unittest.TestCase):
     def setUp(self):
@@ -175,7 +217,6 @@ def setUp(self):
                                                      nlp,
                                                      dataset,
                                                      coref=True)
-        self.annotated_doc.char_to_token()
 
         # Set up events
         self.event1 = ad.Event(
@@ -186,6 +227,28 @@ def setUp(self):
             [[[16, "Personnel.Election"], [14, 14, "Person"],
               [18, 18, "Year"]]]]
 
+        # Missing entity annotations
+        missing_ann = ("T1\tCity 0 7\tSeattle\n"
+                       "T2\tPerson 22 37\tJenny Durkan\n"
+                       "T3\tCity 41 51\tthe city's\n"
+                       "T4\tPerson 59 62\tShe\n"
+                       "T5\tPersonnel.Election 63 74\telected\n"
+                       "T6\tYear 78 82\t2017\n"
+                       "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
+                       "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                       "*\tEQUIV T1 T3\n"
+                       "*\tEQUIV T2 T4\n")
+        missing_ann_path = f'{self.tmpdir}/missing_myfile.ann'
+        with open(missing_ann_path, 'w') as f:
+            f.write(missing_ann)
+
+        # Set up annotated_doc object
+        self.missing_annotated_doc = ad.AnnotatedDoc.parse_ann(
+            text_path, missing_ann_path, nlp, dataset, coref=True)
+
+        # Right answer
+        self.missing_events = [[], [], []]
+
     def tearDown(self):
         shutil.rmtree(self.tmpdir)
@@ -202,11 +265,32 @@ def test_set_arg_objects(self):
 
     def test_format_events_dygiepp(self):
 
         self.event1.set_arg_objects(self.annotated_doc.ents)
-        events = ad.Event.format_events_dygiepp([self.event1],
-                                                self.sent_idx_tups)
+        self.annotated_doc.char_to_token()
+        events, dropped_events = ad.Event.format_events_dygiepp(
+            [self.event1], self.sent_idx_tups)
 
         self.assertEqual(events, self.events)
 
+    def test_set_arg_objects_missing_ann(self):
+
+        self.event1.set_arg_objects(self.missing_annotated_doc.ents)
+
+        self.assertEqual(self.event1.trigger,
+                         self.missing_annotated_doc.ents[4])
+        self.assertEqual(self.event1.args, [
+            self.missing_annotated_doc.ents[3],
+            self.missing_annotated_doc.ents[5]
+        ])
+
+    def test_format_events_dygiepp_missing_ann(self):
+
+        self.event1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.missing_annotated_doc.char_to_token()
+        events, dropped_events = ad.Event.format_events_dygiepp(
+            [self.event1], self.sent_idx_tups)
+
+        self.assertEqual(events, self.missing_events)
+
 
 class TestEquivRel(unittest.TestCase):
     def setUp(self):
@@ -243,7 +327,6 @@ def setUp(self):
                                                      nlp,
                                                      dataset,
                                                      coref=True)
-        self.annotated_doc.char_to_token()
 
         # Set up equivalence relations
         self.equivrel1 = ad.EquivRel("*\tEQUIV T1 T3".split())
@@ -252,6 +335,28 @@ def setUp(self):
         # The dygiepp-formatted correct answer
         self.corefs = [[[0, 0], [9, 11]], [[6, 7], [14, 14]]]
 
+        # Missing entity annotations
+        missing_ann = ("T1\tCity 0 7\tSeattle\n"
+                       "T2\tPerson 22 37\tJenny Durkan\n"
+                       "T3\tCity 41 51\tthe city's\n"
+                       "T4\tPerson 59 62\tShe\n"
+                       "T5\tPersonnel.Election 67 74\telected\n"
+                       "T6\tYear 78 82\t2017\n"
+                       "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
+                       "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                       "*\tEQUIV T1 T3\n"
+                       "*\tEQUIV T2 T4\n")
+        missing_ann_path = f'{self.tmpdir}/missing_myfile.ann'
+        with open(missing_ann_path, 'w') as f:
+            f.write(missing_ann)
+
+        # Set up annotated_doc object
+        self.missing_annotated_doc = ad.AnnotatedDoc.parse_ann(
+            text_path, missing_ann_path, nlp, dataset, coref=True)
+
+        # The dygiepp-formatted correct answer
+        self.missing_corefs = [[[0, 0], [9, 11]]]
+
     def tearDown(self):
         shutil.rmtree(self.tmpdir)
@@ -272,11 +377,36 @@ def test_format_corefs_dygiepp(self):
 
         self.equivrel1.set_arg_objects(self.annotated_doc.ents)
         self.equivrel2.set_arg_objects(self.annotated_doc.ents)
-        corefs = ad.EquivRel.format_corefs_dygiepp(
+        self.annotated_doc.char_to_token()
+        corefs, dropped_equiv_rels = ad.EquivRel.format_corefs_dygiepp(
             [self.equivrel1, self.equivrel2])
 
         self.assertEqual(corefs, self.corefs)
 
+    def test_set_arg_objects_missing_ann(self):
+
+        self.equivrel1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.equivrel2.set_arg_objects(self.missing_annotated_doc.ents)
+
+        self.assertEqual(self.equivrel1.args, [
+            self.missing_annotated_doc.ents[0],
+            self.missing_annotated_doc.ents[2]
+        ])
+        self.assertEqual(self.equivrel2.args, [
+            self.missing_annotated_doc.ents[1],
+            self.missing_annotated_doc.ents[3]
+        ])
+
+    def test_format_corefs_dygiepp_missing_ann(self):
+
+        self.equivrel1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.equivrel2.set_arg_objects(self.missing_annotated_doc.ents)
+        self.missing_annotated_doc.char_to_token()
+        corefs, dropped_equiv_rels = ad.EquivRel.format_corefs_dygiepp(
+            [self.equivrel1, self.equivrel2])
+
+        self.assertEqual(corefs, self.missing_corefs)
+
 
 class TestAnnotatedDoc(unittest.TestCase):
     """
@@ -362,7 +492,6 @@ def test_format_dygiepp(self):
                                               coref=True)
         annotated_doc.char_to_token()
         res = annotated_doc.format_dygiepp()
-
         self.assertEqual(res, self.dygiepp_dict)
 
diff --git a/scripts/new-dataset/annotated_doc.py b/scripts/new-dataset/annotated_doc.py
index 9d53164..f3b12c9 100644
--- a/scripts/new-dataset/annotated_doc.py
+++ b/scripts/new-dataset/annotated_doc.py
@@ -16,7 +16,9 @@ class AnnotatedDocError(Exception):
 
 class AnnotatedDoc:
     def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
-                 doc_key, dataset, coref, nlp, total_original_ents):
+                 doc_key, dataset, coref, nlp, total_original_ents,
+                 total_original_rels, total_original_equiv_rels,
+                 total_original_events):
         """
         Provides dual functionality for class construction. If this function
         is used, be sure that the ents, bin_rels, events, and equiv_rels are
@@ -33,8 +35,13 @@ def __init__(self, text, sents, ents, bin_rels, events, equiv_rels,
         self.coref = coref  # True if EquivRels should be treated as corefs
         self.nlp = nlp
         self.dropped_ents = 0
+        self.dropped_rels = 0
+        self.dropped_equiv_rels = 0
+        self.dropped_events = 0
         self.total_original_ents = total_original_ents
-
+        self.total_original_rels = total_original_rels
+        self.total_original_equiv_rels = total_original_equiv_rels
+        self.total_original_events = total_original_events
 
     @classmethod
     def parse_ann(cls, txt, ann, nlp, dataset, coref):
@@ -73,7 +80,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
             if ';' in line[:second_tab]:
                 idx = line[:line.index("\t")]
                 warnings.warn(f'Entity "{line[second_tab:]}" (ID: '
-                    f'{idx}) is disjoint, and will be dropped.')
+                              f'{idx}) is disjoint, and will be dropped.')
             else:
                 lines_continuous.append(line)
         else:
@@ -88,6 +95,9 @@
         events = []
         equiv_rels = []
         total_original_ents = 0
+        total_original_rels = 0
+        total_original_equiv_rels = 0
+        total_original_events = 0
 
         for line in split_lines:
             # The first character of the first element in the annotation
@@ -99,21 +109,25 @@
             elif line[0][0] == 'R':
                 bin_rels.append(BinRel(line))
+                total_original_rels += 1
 
             elif line[0][0] == 'E':
                 events.append(Event(line))
+                total_original_events += 1
 
             elif line[0][0] == '*' and coref:
                 equiv_rels.append(EquivRel(line))
+                total_original_equiv_rels += 1
 
         annotated_doc = AnnotatedDoc(text, sents, ents, bin_rels, events,
                                      equiv_rels, doc_key, dataset, coref, nlp,
-                                     total_original_ents)
+                                     total_original_ents, total_original_rels,
+                                     total_original_equiv_rels,
+                                     total_original_events)
         annotated_doc.set_annotation_objects()
 
         return annotated_doc
 
-
     def set_annotation_objects(self):
         """
         For each type of annotation, replace the string IDs with the
@@ -123,7 +137,6 @@
         [event.set_arg_objects(self.ents) for event in self.events]
         [equiv_rel.set_arg_objects(self.ents) for equiv_rel in self.equiv_rels]
 
-
    def format_dygiepp(self):
         """
         Creates a dygiepp-formatted json for the doc, using each class'
@@ -142,12 +155,27 @@
 
         # Format data
         ner = Ent.format_ner_dygiepp(self.ents, sent_idx_tups)
-        bin_rels = BinRel.format_bin_rels_dygiepp(self.bin_rels, sent_idx_tups)
+        bin_rels, self.dropped_rels = BinRel.format_bin_rels_dygiepp(
+            self.bin_rels, sent_idx_tups)
+        print(
+            f'Completed relation formatting for {self.doc_key}. '
+            f'{self.dropped_rels} of {self.total_original_rels} relations '
+            'were dropped due to tokenization mismatches.')
         if len(self.equiv_rels
                ) > 0 and self.coref:  # Some datasets don't have coreferences
-            corefs = EquivRel.format_corefs_dygiepp(self.equiv_rels)
+            corefs, self.dropped_equiv_rels = EquivRel.format_corefs_dygiepp(
+                self.equiv_rels)
+            print(f'Completed coreference formatting for {self.doc_key}. '
+                  f'{self.dropped_equiv_rels} of '
+                  f'{self.total_original_equiv_rels} coreference clusters '
+                  'were dropped due to tokenization mismatches.')
         if len(self.events) > 0:  # Some datasets don't have events
-            events = Event.format_events_dygiepp(self.events, sent_idx_tups)
+            events, self.dropped_events = Event.format_events_dygiepp(
+                self.events, sent_idx_tups)
+            print(f'Completed event formatting for {self.doc_key}. '
+                  f'{self.dropped_events} of '
+                  f'{self.total_original_events} events were dropped '
+                  'due to tokenization mismatches.')
 
         # Make dict
         res = {
@@ -166,7 +194,6 @@
 
         return res
 
-
     def char_to_token(self):
         """
         Does the heavy lifting for converting brat format to dygiepp format.
@@ -192,8 +219,9 @@
 
                 # If the entity can't be found because there isn't an exact
                 # match in the list, warn that it will be dropped
-                warnings.warn(f'The entity {ent.text} (ID: {ent.ID}) cannot '
-                    'be aligned to the tokenization, and will be dropped.')
+                warnings.warn(
+                    f'The entity {ent.text} (ID: {ent.ID}) cannot '
+                    'be aligned to the tokenization, and will be dropped.')
                 self.dropped_ents += 1
 
             else:
@@ -212,11 +240,13 @@
             # Double-check that the tokens from the annotation file match up
             # with the tokens in the source text.
             ent_tok_text = [tok.text.lower() for tok in processed_ent]
-            doc_tok_text = [tok.text.lower() for i, tok in enumerate(tok_text)
-                            if i >= ent_tok_start and i <= ent_tok_end]
+            doc_tok_text = [
+                tok.text.lower() for i, tok in enumerate(tok_text)
+                if i >= ent_tok_start and i <= ent_tok_end
+            ]
             if ent_tok_text != doc_tok_text:
                 msg = ('The annotation file and source document disagree '
-                    f'on the tokens for entity {ent.text} (ID: '
+                       f'on the tokens for entity {ent.text} (ID: '
                        f'{ent.ID}). This entity will be dropped.')
                 warnings.warn(msg)
                 self.dropped_ents += 1
@@ -231,9 +261,10 @@
 
         # Set the list of entities that had token matches as ents for doc
         self.ents = ent_list_tokens
-        print(f'Completed doc {self.doc_key}. {self.dropped_ents} of '
-              f'{self.total_original_ents} entities '
-              'were dropped due to tokenization mismatches.')
+        print(
+            f'Completed character to token conversion for doc {self.doc_key}. '
+            f'{self.dropped_ents} of {self.total_original_ents} entities '
+            'were dropped due to tokenization mismatches.')
 
 
 class Ent:
@@ -340,14 +371,27 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
 
         returns:
             bin_rels, list of list: dygiepp formatted relations
+            dropped_rels, int: number of relations that were dropped due to
+                entity token mismatches
         """
         bin_rels = []
+        dropped_rels = 0
 
         # Go through each sentence to get the relations in that sentence
         for sent_start, sent_end in sent_idx_tups:
 
             # Check first entity to see if relation is in this sentence
             sent_rels = []
             for rel in rel_list:
+                # Check to make sure both entities actually have token starts
+                if rel.arg1.tok_start is None or rel.arg2.tok_start is None:
+                    warnings.warn(
+                        'Either the start or end token for relation '
+                        f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
+                        f'(ID: {rel.ID}) was dropped due to tokenization '
+                        'mismatches. This relation will also be dropped '
+                        'as a result.')
+                    dropped_rels += 1
+                    continue
                 rel_start = rel.arg1.tok_start
                 if sent_start <= rel_start < sent_end:
                     sent_rels.append([
@@ -357,7 +401,7 @@
 
             bin_rels.append(sent_rels)
 
-        return bin_rels
+        return bin_rels, dropped_rels
 
 
 class Event:
@@ -423,8 +467,11 @@ def format_events_dygiepp(event_list, sent_idx_tups):
 
         returns:
             events, list of list: dygiepp formatted events
+            dropped_events, int: number of events dropped due to entity token
+                mismatches
         """
         events = []
+        dropped_events = 0
 
         # Go through each sentence to get the relations in that sentence
         for sent_start, sent_end in sent_idx_tups:
@@ -432,6 +479,30 @@
 
             sent_events = []
             for event in event_list:
 
+                # Check to make sure the entities involved in the event all
+                # have token starts
+                # First, check the trigger
+                if event.trigger.tok_start is None or event.trigger.tok_end is None:
+                    warnings.warn(
+                        f'The trigger for event ID: {event.ID} '
+                        f'(trigger: {event.trigger.text}) was dropped due '
+                        'to tokenization mismatches. This event will be '
+                        'dropped as a result.')
+                    dropped_events += 1
+                    continue
+
+                # Then check all the arguments in the event
+                any_missing_arg = False
+                for arg_obj in event.args:
+                    if arg_obj.tok_start is None or arg_obj.tok_end is None:
+                        any_missing_arg = True
+                if any_missing_arg:
+                    warnings.warn(
+                        f'One or more arguments for event ID: '
+                        f'{event.ID} were dropped due to tokenization '
+                        'mismatches. This event will be dropped as a result.')
+                    dropped_events += 1
+                    continue
+
                 # Check if event is in sentence
                 trigger_start = event.trigger.tok_start
@@ -468,7 +539,7 @@
 
             events.append(sent_events)
 
-        return events
+        return events, dropped_events
 
 
 class EquivRel:
@@ -507,10 +578,29 @@
 
         returns:
             corefs, list of list: dygiepp formatted coreference clusters
+            dropped_equiv_rels, int: number of equivalence relations dropped
+                due to entity tokenization mismatches
         """
         corefs = []
+        dropped_equiv_rels = 0
         for equiv_rel in equiv_rels_list:
+
+            # Check that both entities exist
+            any_missing_args = False
+            for arg in equiv_rel.args:
+                if arg.tok_start is None or arg.tok_end is None:
+                    any_missing_args = True
+            if any_missing_args:
+                arg_texts = [arg.text for arg in equiv_rel.args]
+                warnings.warn(
+                    'One or more arguments in the coreference '
+                    f'cluster {equiv_rel.label}: {arg_texts} were dropped '
+                    'due to entity tokenization mismatches. This '
+                    'coreference will also be dropped as a result.')
+                dropped_equiv_rels += 1
+                continue
+
             cluster = [[arg.tok_start, arg.tok_end] for arg in equiv_rel.args]
             corefs.append(cluster)
 
-        return corefs
+        return corefs, dropped_equiv_rels
diff --git a/scripts/new-dataset/brat_to_input.py b/scripts/new-dataset/brat_to_input.py
index ed6fa0f..c8b9eab 100644
--- a/scripts/new-dataset/brat_to_input.py
+++ b/scripts/new-dataset/brat_to_input.py
@@ -32,6 +32,8 @@ def format_annotated_document(fname_pair, dataset_name, nlp, coref):
 
     returns:
         res, json dict: formatted data
+        dropped_totals, dict: numbers of original and dropped entities,
+            binary and equivalence relations, and events for the document
     """
     # Make annotated doc object
     annotated_doc = AnnotatedDoc.parse_ann(fname_pair[0], fname_pair[1], nlp,
@@ -43,7 +45,19 @@
     # Do the dygiepp conversion
     res = annotated_doc.format_dygiepp()
 
-    return res
+    # Get the numbers of dropped entities and relations for this document
+    dropped_totals = {
+        'dropped_ents': annotated_doc.dropped_ents,
+        'total_original_ents': annotated_doc.total_original_ents,
+        'dropped_rels': annotated_doc.dropped_rels,
+        'total_original_rels': annotated_doc.total_original_rels,
+        'dropped_equiv_rels': annotated_doc.dropped_equiv_rels,
+        'total_original_equiv_rels': annotated_doc.total_original_equiv_rels,
+        'dropped_events': annotated_doc.dropped_events,
+        'total_original_events': annotated_doc.total_original_events
+    }
+
+    return res, dropped_totals
 
 
 def get_paired_files(all_files):
@@ -101,10 +115,38 @@
     paired_files = get_paired_files(all_files)
 
     # Format doc file pairs
-    res = [
-        format_annotated_document(fname_pair, dataset_name, nlp, coref)
-        for fname_pair in paired_files
-    ]
+    overall_dropped_totals = {
+        'dropped_ents': 0,
+        'total_original_ents': 0,
+        'dropped_rels': 0,
+        'total_original_rels': 0,
+        'dropped_equiv_rels': 0,
+        'total_original_equiv_rels': 0,
+        'dropped_events': 0,
+        'total_original_events': 0
+    }
+    res = []
+    for fname_pair in paired_files:
+        r, dropped_totals = format_annotated_document(fname_pair, dataset_name,
+                                                      nlp, coref)
+        res.append(r)
+        overall_dropped_totals = {
+            k: v + dropped_totals[k]
+            for k, v in overall_dropped_totals.items()
+        }
+
+    print(
+        '\n\nCompleted conversion for entire dataset! '
+        f'{overall_dropped_totals["dropped_ents"]} of '
+        f'{overall_dropped_totals["total_original_ents"]} original entities '
+        'were dropped due to tokenization mismatches. As a result, '
+        f'{overall_dropped_totals["dropped_rels"]} of '
+        f'{overall_dropped_totals["total_original_rels"]} original relations, '
+        f'{overall_dropped_totals["dropped_equiv_rels"]} of '
+        f'{overall_dropped_totals["total_original_equiv_rels"]} coreference '
+        f'clusters, and {overall_dropped_totals["dropped_events"]} of '
+        f'{overall_dropped_totals["total_original_events"]} events '
+        'were dropped.')
 
     # Write out doc dictionaries
     with open(output_file, "w") as f: