diff --git a/dygie/tests/data/annotated_doc_test.py b/dygie/tests/data/annotated_doc_test.py
index 184ea98..4a1db16 100644
--- a/dygie/tests/data/annotated_doc_test.py
+++ b/dygie/tests/data/annotated_doc_test.py
@@ -110,7 +110,6 @@ def setUp(self):
                                                        nlp,
                                                        dataset,
                                                        coref=True)
-        self.annotated_doc.char_to_token()
 
         # Set up relation
         self.rel1 = ad.BinRel("R1\tMayor-Of Arg1:T2 Arg2:T3".split())
@@ -118,10 +117,36 @@ def setUp(self):
 
         # Right answer
         self.relations = [[], [[6, 7, 9, 11, "Mayor-Of"]], []]
 
+        # Missing entity annotations
+        missing_ann = ("T1\tCity 0 7\tSeattle\n"
+                       "T2\tPerson 22 37\tJenny Durkan\n"
+                       "T3\tCity 41 51\tthe city's\n"
+                       "T4\tPerson 59 62\tShe\n"
+                       "T5\tPersonnel.Election 67 74\telected\n"
+                       "T6\tYear 78 82\t2017\n"
+                       "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
+                       "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                       "*\tEQUIV T1 T3\n"
+                       "*\tEQUIV T2 T4\n")
+        missing_ann_path = f'{self.tmpdir}/missing_myfile.ann'
+        with open(missing_ann_path, 'w') as f:
+            f.write(missing_ann)
+
+        # Set up annotated_doc object
+        self.missing_annotated_doc = ad.AnnotatedDoc.parse_ann(
+            text_path, missing_ann_path, nlp, dataset, coref=True)
+
+        # Right answer
+        self.missing_relations = [[], [], []]
+
     def tearDown(self):
 
         shutil.rmtree(self.tmpdir)
 
+    # set_arg_objects is always called *before* char_to_token
+    # They will fail if run in the opposite order with entities that get
+    # dropped, but if they are only used with brat_to_input.py, the order is
+    # baked in and therefore safe
     def test_set_arg_objects(self):
 
         self.rel1.set_arg_objects(self.annotated_doc.ents)
@@ -129,14 +154,31 @@ def test_set_arg_objects(self):
 
         self.assertEqual(self.rel1.arg1, self.annotated_doc.ents[1])
         self.assertEqual(self.rel1.arg2, self.annotated_doc.ents[2])
 
+    def test_set_arg_objects_missing_arg(self):
+
+        self.rel1.set_arg_objects(self.missing_annotated_doc.ents)
+
+        self.assertEqual(self.rel1.arg1, self.missing_annotated_doc.ents[1])
+        self.assertEqual(self.rel1.arg2, self.missing_annotated_doc.ents[2])
+
     def test_format_bin_rels_dygiepp(self):
 
         self.rel1.set_arg_objects(self.annotated_doc.ents)
-        relations, dropped_rels = ad.BinRel.format_bin_rels_dygiepp([self.rel1],
-                                                                    self.sent_idx_tups)
+        self.annotated_doc.char_to_token()
+        relations, dropped_rels = ad.BinRel.format_bin_rels_dygiepp(
+            [self.rel1], self.sent_idx_tups)
 
         self.assertEqual(relations, self.relations)
 
+    def test_format_bin_rels_dygiepp_missing_arg(self):
+
+        self.rel1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.missing_annotated_doc.char_to_token()
+        relations, dropped_rels = ad.BinRel.format_bin_rels_dygiepp(
+            [self.rel1], self.sent_idx_tups)
+
+        self.assertEqual(relations, self.missing_relations)
+
 
 class TestEvent(unittest.TestCase):
     def setUp(self):
@@ -175,7 +217,6 @@ def setUp(self):
                                                        nlp,
                                                        dataset,
                                                        coref=True)
-        self.annotated_doc.char_to_token()
 
         # Set up events
         self.event1 = ad.Event(
@@ -186,6 +227,28 @@ def setUp(self):
             [[[16, "Personnel.Election"], [14, 14, "Person"],
               [18, 18, "Year"]]]]
 
+        # Missing entity annotations
+        missing_ann = ("T1\tCity 0 7\tSeattle\n"
+                       "T2\tPerson 22 37\tJenny Durkan\n"
+                       "T3\tCity 41 51\tthe city's\n"
+                       "T4\tPerson 59 62\tShe\n"
+                       "T5\tPersonnel.Election 63 74\telected\n"
+                       "T6\tYear 78 82\t2017\n"
+                       "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
+                       "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                       "*\tEQUIV T1 T3\n"
+                       "*\tEQUIV T2 T4\n")
+        missing_ann_path = f'{self.tmpdir}/missing_myfile.ann'
+        with open(missing_ann_path, 'w') as f:
+            f.write(missing_ann)
+
+        # Set up annotated_doc object
+        self.missing_annotated_doc = ad.AnnotatedDoc.parse_ann(
+            text_path, missing_ann_path, nlp, dataset, coref=True)
+
+        # Right answer
+        self.missing_events = [[], [], []]
+
     def tearDown(self):
 
         shutil.rmtree(self.tmpdir)
 
@@ -202,11 +265,32 @@ def test_set_arg_objects(self):
 
     def test_format_events_dygiepp(self):
 
         self.event1.set_arg_objects(self.annotated_doc.ents)
-        events, dropped_events = ad.Event.format_events_dygiepp([self.event1],
-                                                                self.sent_idx_tups)
+        self.annotated_doc.char_to_token()
+        events, dropped_events = ad.Event.format_events_dygiepp(
+            [self.event1], self.sent_idx_tups)
 
         self.assertEqual(events, self.events)
 
+    def test_set_arg_objects_missing_ann(self):
+
+        self.event1.set_arg_objects(self.missing_annotated_doc.ents)
+
+        self.assertEqual(self.event1.trigger,
+                         self.missing_annotated_doc.ents[4])
+        self.assertEqual(self.event1.args, [
+            self.missing_annotated_doc.ents[3],
+            self.missing_annotated_doc.ents[5]
+        ])
+
+    def test_format_events_dygiepp_missing_ann(self):
+
+        self.event1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.missing_annotated_doc.char_to_token()
+        events, dropped_events = ad.Event.format_events_dygiepp(
+            [self.event1], self.sent_idx_tups)
+
+        self.assertEqual(events, self.missing_events)
+
 
 class TestEquivRel(unittest.TestCase):
     def setUp(self):
@@ -243,7 +327,6 @@ def setUp(self):
                                                        nlp,
                                                        dataset,
                                                        coref=True)
-        self.annotated_doc.char_to_token()
 
         # Set up equivalence relations
         self.equivrel1 = ad.EquivRel("*\tEQUIV T1 T3".split())
@@ -252,6 +335,28 @@ def setUp(self):
 
         # The dygiepp-formatted correct answer
         self.corefs = [[[0, 0], [9, 11]], [[6, 7], [14, 14]]]
 
+        # Missing entity annotations
+        missing_ann = ("T1\tCity 0 7\tSeattle\n"
+                       "T2\tPerson 22 37\tJenny Durkan\n"
+                       "T3\tCity 41 51\tthe city's\n"
+                       "T4\tPerson 59 62\tShe\n"
+                       "T5\tPersonnel.Election 67 74\telected\n"
+                       "T6\tYear 78 82\t2017\n"
+                       "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
+                       "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                       "*\tEQUIV T1 T3\n"
+                       "*\tEQUIV T2 T4\n")
+        missing_ann_path = f'{self.tmpdir}/missing_myfile.ann'
+        with open(missing_ann_path, 'w') as f:
+            f.write(missing_ann)
+
+        # Set up annotated_doc object
+        self.missing_annotated_doc = ad.AnnotatedDoc.parse_ann(
+            text_path, missing_ann_path, nlp, dataset, coref=True)
+
+        # The dygiepp-formatted correct answer
+        self.missing_corefs = [[[0, 0], [9, 11]]]
+
     def tearDown(self):
 
         shutil.rmtree(self.tmpdir)
 
@@ -272,11 +377,36 @@ def test_format_corefs_dygiepp(self):
 
         self.equivrel1.set_arg_objects(self.annotated_doc.ents)
         self.equivrel2.set_arg_objects(self.annotated_doc.ents)
+        self.annotated_doc.char_to_token()
         corefs, dropped_equiv_rels = ad.EquivRel.format_corefs_dygiepp(
             [self.equivrel1, self.equivrel2])
 
         self.assertEqual(corefs, self.corefs)
 
+    def test_set_arg_objects_missing_ann(self):
+
+        self.equivrel1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.equivrel2.set_arg_objects(self.missing_annotated_doc.ents)
+
+        self.assertEqual(self.equivrel1.args, [
+            self.missing_annotated_doc.ents[0],
+            self.missing_annotated_doc.ents[2]
+        ])
+        self.assertEqual(self.equivrel2.args, [
+            self.missing_annotated_doc.ents[1],
+            self.missing_annotated_doc.ents[3]
+        ])
+
+    def test_format_corefs_dygiepp_missing_ann(self):
+
+        self.equivrel1.set_arg_objects(self.missing_annotated_doc.ents)
+        self.equivrel2.set_arg_objects(self.missing_annotated_doc.ents)
+        self.missing_annotated_doc.char_to_token()
+        corefs, dropped_equiv_rels = ad.EquivRel.format_corefs_dygiepp(
+            [self.equivrel1, self.equivrel2])
+
+        self.assertEqual(corefs, self.missing_corefs)
+
 
 class TestAnnotatedDoc(unittest.TestCase):
     """
@@ -362,7 +492,6 @@ def test_format_dygiepp(self):
                                               coref=True)
         annotated_doc.char_to_token()
         res = annotated_doc.format_dygiepp()
-
         self.assertEqual(res, self.dygiepp_dict)
 
diff --git a/scripts/new-dataset/annotated_doc.py b/scripts/new-dataset/annotated_doc.py
index fcaac30..f3b12c9 100644
--- a/scripts/new-dataset/annotated_doc.py
+++ b/scripts/new-dataset/annotated_doc.py
@@ -80,7 +80,7 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
                 if ';' in line[:second_tab]:
                     idx = line[:line.index("\t")]
                     warnings.warn(f'Entity "{line[second_tab:]}" (ID: '
-                            f'{idx}) is disjoint, and will be dropped.')
+                                  f'{idx}) is disjoint, and will be dropped.')
                 else:
                     lines_continuous.append(line)
             else:
@@ -128,7 +128,6 @@ def parse_ann(cls, txt, ann, nlp, dataset, coref):
 
         return annotated_doc
 
-
     def set_annotation_objects(self):
         """
         For each type of annotation, replace the string IDs with the
@@ -138,7 +137,6 @@ def set_annotation_objects(self):
         [event.set_arg_objects(self.ents) for event in self.events]
         [equiv_rel.set_arg_objects(self.ents) for equiv_rel in self.equiv_rels]
 
-
     def format_dygiepp(self):
         """
         Creates a dygiepp-formatted json for the doc, using each class'
@@ -157,23 +155,27 @@ def format_dygiepp(self):
 
         # Format data
         ner = Ent.format_ner_dygiepp(self.ents, sent_idx_tups)
-        bin_rels, self.dropped_rels = BinRel.format_bin_rels_dygiepp(self.bin_rels,
-                                                      sent_idx_tups)
-        print(f'Completed relation formatting for {self.doc_key}. {self.dropped_rels} of '
-            f'{self.total_original_rels} relations were dropped due to tokenization mismatches.')
+        bin_rels, self.dropped_rels = BinRel.format_bin_rels_dygiepp(
+            self.bin_rels, sent_idx_tups)
+        print(
+            f'Completed relation formatting for {self.doc_key}. {self.dropped_rels} of '
+            f'{self.total_original_rels} relations were dropped due to tokenization mismatches.'
+        )
         if len(self.equiv_rels
                ) > 0 and self.coref:  # Some datasets don't have coreferences
-            corefs, self.dropped_equiv_rels = EquivRel.format_corefs_dygiepp(self.equiv_rels)
+            corefs, self.dropped_equiv_rels = EquivRel.format_corefs_dygiepp(
+                self.equiv_rels)
             print(f'Completed coreference formatting for {self.doc_key}. '
-                    f'{self.dropped_equiv_rels} of '
-                    f'{self.total_original_equiv_rels} were dropped due to '
-                    'tokenization mismatches.')
+                  f'{self.dropped_equiv_rels} of '
+                  f'{self.total_original_equiv_rels} were dropped due to '
+                  'tokenization mismatches.')
         if len(self.events) > 0:  # Some datasets don't have events
-            events = Event.format_events_dygiepp(self.events, sent_idx_tups)
+            events, self.dropped_events = Event.format_events_dygiepp(
+                self.events, sent_idx_tups)
             print(f'Completed event formatting for {self.doc_key}. '
-                    f'{self.dropped_events} of '
-                    f'{self.total_original_events} were dropped due to '
-                    'tokenization mismatches.')
+                  f'{self.dropped_events} of '
+                  f'{self.total_original_events} were dropped due to '
+                  'tokenization mismatches.')
 
         # Make dict
         res = {
@@ -192,7 +194,6 @@ def format_dygiepp(self):
 
         return res
 
-
     def char_to_token(self):
         """
         Does the heavy lifting for converting brat format to dygiepp format.
@@ -218,8 +219,9 @@ def char_to_token(self):
 
                 # If the entity can't be found because there isn't an exact
                 # match in the list, warn that it will be dropped
-                warnings.warn(f'The entity {ent.text} (ID: {ent.ID}) cannot '
-                        'be aligned to the tokenization, and will be dropped.')
+                warnings.warn(
+                    f'The entity {ent.text} (ID: {ent.ID}) cannot '
+                    'be aligned to the tokenization, and will be dropped.')
                 self.dropped_ents += 1
 
             else:
@@ -238,11 +240,13 @@ def char_to_token(self):
                 # Double-check that the tokens from the annotation file match up
                 # with the tokens in the source text.
                 ent_tok_text = [tok.text.lower() for tok in processed_ent]
-                doc_tok_text = [tok.text.lower() for i, tok in enumerate(tok_text)
-                        if i >= ent_tok_start and i <= ent_tok_end]
+                doc_tok_text = [
+                    tok.text.lower() for i, tok in enumerate(tok_text)
+                    if i >= ent_tok_start and i <= ent_tok_end
+                ]
                 if ent_tok_text != doc_tok_text:
                     msg = ('The annotation file and source document disagree '
-                            f'on the tokens for entity {ent.text} (ID: '
+                           f'on the tokens for entity {ent.text} (ID: '
                            f'{ent.ID}). This entity will be dropped.')
                     warnings.warn(msg)
                     self.dropped_ents += 1
@@ -257,9 +261,10 @@
         # Set the list of entities that had token matches as ents for doc
         self.ents = ent_list_tokens
 
-        print(f'Completed character to token conversion for doc {self.doc_key}. '
-            f'{self.dropped_ents} of {self.total_original_ents} entities '
-            'were dropped due to tokenization mismatches.')
+        print(
+            f'Completed character to token conversion for doc {self.doc_key}. '
+            f'{self.dropped_ents} of {self.total_original_ents} entities '
+            'were dropped due to tokenization mismatches.')
 
 
 class Ent:
@@ -379,11 +384,12 @@ def format_bin_rels_dygiepp(rel_list, sent_idx_tups):
         for rel in rel_list:
             # Check to make sure both entities actually have token starts
             if rel.arg1.tok_start == None or rel.arg2.tok_start == None:
-                warnings.warn('Either the start or end token for relation '
-                        f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
-                        f'(ID: {rel.ID}) was dropped due to tokenization '
-                        'mismatches. This relation will also be dropped '
-                        'as a result.')
+                warnings.warn(
+                    'Either the start or end token for relation '
+                    f'{rel.arg1.text} -- {rel.label} -- {rel.arg2.text} '
+                    f'(ID: {rel.ID}) was dropped due to tokenization '
+                    'mismatches. This relation will also be dropped '
+                    'as a result.')
                 dropped_rels += 1
                 continue
             rel_start = rel.arg1.tok_start
@@ -477,10 +483,11 @@ def format_events_dygiepp(event_list, sent_idx_tups):
             # have token starts
            # First, check the trigger
             if event.trigger.tok_start == None or event.trigger.tok_end == None:
-                warnings.warn(f'The trigger for event ID: {event.ID} '
-                        f'(trigger: {event.trigger.text} was dropped due '
-                        'to tokenization mismatches. This event will be '
-                        'dropped as a result.')
+                warnings.warn(
+                    f'The trigger for event ID: {event.ID} '
+                    f'(trigger: {event.trigger.text}) was dropped due '
+                    'to tokenization mismatches. This event will be '
+                    'dropped as a result.')
                 dropped_events += 1
                 continue
             # Then check all the arguments in the event
@@ -489,9 +496,10 @@
                 if arg_obj.tok_start == None or arg_obj.tok_end == None:
                     any_missing_arg = True
             if any_missing_arg:
-                warnings.warn(f'One or more arguments for event ID: '
-                        f'{event.ID} were dropped due to tokenization mismatches. '
-                        'This event will be dropped as a result.')
+                warnings.warn(
+                    f'One or more arguments for event ID: '
+                    f'{event.ID} were dropped due to tokenization mismatches. '
+                    'This event will be dropped as a result.')
                 dropped_events += 1
                 continue
 
@@ -583,10 +591,12 @@ def format_corefs_dygiepp(equiv_rels_list):
                 if arg.tok_start == None or arg.tok_end == None:
                     any_missing_args = True
             if any_missing_args:
-                warnings.warn('One or more arguments in the coreference '
-                        f'cluster EquivRel ID: {equiv_rel.ID} was dropped '
-                        'Due to entity tokenization mismatches. This '
-                        'coreference will also be dropped as a reult.')
+                arg_texts = [arg.text for arg in equiv_rel.args]
+                warnings.warn(
+                    'One or more arguments in the coreference '
+                    f'cluster {equiv_rel.label, arg_texts} were dropped '
+                    'due to entity tokenization mismatches. This '
+                    'coreference will also be dropped as a result.')
                 dropped_equiv_rels += 1
                 continue
 
diff --git a/scripts/new-dataset/brat_to_input.py b/scripts/new-dataset/brat_to_input.py
index e460697..c8b9eab 100644
--- a/scripts/new-dataset/brat_to_input.py
+++ b/scripts/new-dataset/brat_to_input.py
@@ -46,16 +46,16 @@ def format_annotated_document(fname_pair, dataset_name, nlp, coref):
     res = annotated_doc.format_dygiepp()
 
     # Get the numbers of dropped entities and relations for this document
-    dropped_totals = {'dropped_ents': annotated_doc.dropped_ents,
-                      'total_original_ents': annotated_doc.total_original_ents,
-                      'dropped_rels': annotated_doc.dropped_rels,
-                      'total_original_rels': annotated_doc.total_original_rels,
-                      'dropped_equiv_rels': annotated_doc.dropped_equiv_rels,
-                      'total_original_equiv_rels':
-                      annotated_doc.total_original_equiv_rels,
-                      'dropped_events': annotated_doc.dropped_events,
-                      'total_original_events':
-                      annotated_doc.total_original_events}
+    dropped_totals = {
+        'dropped_ents': annotated_doc.dropped_ents,
+        'total_original_ents': annotated_doc.total_original_ents,
+        'dropped_rels': annotated_doc.dropped_rels,
+        'total_original_rels': annotated_doc.total_original_rels,
+        'dropped_equiv_rels': annotated_doc.dropped_equiv_rels,
+        'total_original_equiv_rels': annotated_doc.total_original_equiv_rels,
+        'dropped_events': annotated_doc.dropped_events,
+        'total_original_events': annotated_doc.total_original_events
+    }
 
     return res, dropped_totals
 
@@ -115,18 +115,28 @@ def format_labeled_dataset(data_directory, output_file, dataset_name,
     paired_files = get_paired_files(all_files)
 
     # Format doc file pairs
-    overall_dropped_totals = {'dropped_ents':0, 'total_original_ents':0,
-        'dropped_rels':0, 'total_original_rels':0, 'dropped_equiv_rels':0,
-        'total_original_equiv_rels':0, 'dropped_events':0,
-        'total_original_events':0}
+    overall_dropped_totals = {
+        'dropped_ents': 0,
+        'total_original_ents': 0,
+        'dropped_rels': 0,
+        'total_original_rels': 0,
+        'dropped_equiv_rels': 0,
+        'total_original_equiv_rels': 0,
+        'dropped_events': 0,
+        'total_original_events': 0
+    }
     res = []
     for fname_pair in paired_files:
-        r, dropped_totals = format_annotated_document(fname_pair, dataset_name, nlp, coref)
+        r, dropped_totals = format_annotated_document(fname_pair, dataset_name,
+                                                      nlp, coref)
         res.append(r)
-        overall_dropped_totals = {k: v + dropped_totals[k] for k, v in
-                overall_dropped_totals.items()}
+        overall_dropped_totals = {
+            k: v + dropped_totals[k]
+            for k, v in overall_dropped_totals.items()
+        }
 
-    print('\n\nCompleted conversion for entire dataset! '
+    print(
+        '\n\nCompleted conversion for entire dataset! '
         f'{overall_dropped_totals["dropped_ents"]} of '
        f'{overall_dropped_totals["total_original_ents"]} original entities '
        'were dropped due to tokenization mismatches. As a result, '
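
Note for reviewers: the ordering constraint called out in the test comments (`set_arg_objects` always runs *before* `char_to_token`) is the crux of this change, since `char_to_token` now drops unalignable entities that the ID-resolution step still needs to see. A minimal usage sketch of the call order baked into `brat_to_input.py`, assuming the `AnnotatedDoc` API shown in this diff; the file paths, dataset name, and spaCy model below are placeholders, not part of this PR:

```python
import spacy

import annotated_doc as ad

# Placeholder inputs; any spaCy pipeline and brat .txt/.ann pair will do.
nlp = spacy.load("en_core_web_sm")

# Parse the brat annotations into Ent/BinRel/Event/EquivRel objects.
doc = ad.AnnotatedDoc.parse_ann("my_doc.txt", "my_doc.ann", nlp,
                                dataset="my_dataset", coref=True)

# Resolve string IDs against the *full* entity list first...
doc.set_annotation_objects()

# ...then drop entities that cannot be aligned to the tokenization.
doc.char_to_token()

# Relations, events, and corefs whose entities were dropped are now
# skipped and counted rather than failing on a missing token index.
res = doc.format_dygiepp()
print(doc.dropped_ents, doc.dropped_rels)
```

Deferring `char_to_token` until after `set_annotation_objects` is what lets dropped entities be detected and tallied per document instead of surfacing later as formatting errors.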